{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.998401023345059, "eval_steps": 300, "global_step": 2637, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.8867924528301888e-05, "logits/chosen": -4.26362943649292, "logits/rejected": -4.150611877441406, "logps/chosen": -477.07171630859375, "logps/rejected": -348.1441955566406, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "learning_rate": 0.00018867924528301889, "logits/chosen": -4.269673824310303, "logits/rejected": -4.142339706420898, "logps/chosen": -430.44708251953125, "logps/rejected": -336.1376953125, "loss": 0.6931, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": 0.0017908420413732529, "rewards/margins": 0.0009290997986681759, "rewards/rejected": 0.0008617420098744333, "step": 10 }, { "epoch": 0.02, "learning_rate": 0.00037735849056603777, "logits/chosen": -4.245087623596191, "logits/rejected": -4.13867712020874, "logps/chosen": -436.5538024902344, "logps/rejected": -343.5115661621094, "loss": 0.6723, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.12751512229442596, "rewards/margins": 0.05264924839138985, "rewards/rejected": 0.07486586272716522, "step": 20 }, { "epoch": 0.03, "learning_rate": 0.0005660377358490566, "logits/chosen": -4.245123863220215, "logits/rejected": -4.083767890930176, "logps/chosen": -446.7200622558594, "logps/rejected": -337.4728088378906, "loss": 0.6104, "rewards/accuracies": 0.659375011920929, "rewards/chosen": 0.38573843240737915, "rewards/margins": 0.29684916138648987, "rewards/rejected": 0.08888928592205048, "step": 30 }, { "epoch": 0.05, "learning_rate": 0.0007547169811320755, "logits/chosen": -4.237982273101807, "logits/rejected": -4.100085735321045, "logps/chosen": -445.2154846191406, "logps/rejected": -346.43035888671875, "loss": 0.6206, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.2663884162902832, "rewards/margins": 0.4935511648654938, "rewards/rejected": -0.22716276347637177, "step": 40 }, { "epoch": 0.06, "learning_rate": 0.0009433962264150943, "logits/chosen": -4.271880149841309, "logits/rejected": -4.103459358215332, "logps/chosen": -465.46875, "logps/rejected": -339.4776306152344, "loss": 0.5565, "rewards/accuracies": 0.703125, "rewards/chosen": 0.5158874988555908, "rewards/margins": 0.5827394723892212, "rewards/rejected": -0.06685198098421097, "step": 50 }, { "epoch": 0.07, "learning_rate": 0.0009972910216718266, "logits/chosen": -4.268113136291504, "logits/rejected": -4.074869632720947, "logps/chosen": -483.421875, "logps/rejected": -346.48541259765625, "loss": 0.5705, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.3332400321960449, "rewards/margins": 0.6556930541992188, "rewards/rejected": -0.32245302200317383, "step": 60 }, { "epoch": 0.08, "learning_rate": 0.000993421052631579, "logits/chosen": -4.23517370223999, "logits/rejected": -4.111695289611816, "logps/chosen": -457.7568359375, "logps/rejected": -357.06292724609375, "loss": 0.5674, "rewards/accuracies": 0.6968749761581421, "rewards/chosen": -0.08116689324378967, "rewards/margins": 0.6655942797660828, "rewards/rejected": -0.7467612028121948, "step": 70 }, { "epoch": 0.09, "learning_rate": 0.0009895510835913313, "logits/chosen": -4.249506950378418, "logits/rejected": -4.094361305236816, "logps/chosen": -422.6045837402344, "logps/rejected": -320.1609191894531, "loss": 0.5624, "rewards/accuracies": 0.6953125, "rewards/chosen": 0.32660627365112305, "rewards/margins": 0.6634021997451782, "rewards/rejected": -0.3367958962917328, "step": 80 }, { "epoch": 0.1, "learning_rate": 0.0009856811145510836, "logits/chosen": -4.271591663360596, "logits/rejected": -4.078057289123535, "logps/chosen": -440.7449645996094, "logps/rejected": -323.4222106933594, "loss": 0.5601, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.3649946451187134, "rewards/margins": 0.7031130790710449, "rewards/rejected": -0.33811843395233154, "step": 90 }, { "epoch": 0.11, "learning_rate": 0.000981811145510836, "logits/chosen": -4.244426250457764, "logits/rejected": -4.094484806060791, "logps/chosen": -469.42620849609375, "logps/rejected": -356.9173278808594, "loss": 0.6045, "rewards/accuracies": 0.6640625, "rewards/chosen": 0.3856131434440613, "rewards/margins": 0.6721194386482239, "rewards/rejected": -0.2865062355995178, "step": 100 }, { "epoch": 0.13, "learning_rate": 0.0009779411764705883, "logits/chosen": -4.290276527404785, "logits/rejected": -4.138627529144287, "logps/chosen": -426.1455078125, "logps/rejected": -348.8928527832031, "loss": 0.585, "rewards/accuracies": 0.6890624761581421, "rewards/chosen": -0.2585579752922058, "rewards/margins": 0.6368122696876526, "rewards/rejected": -0.8953703045845032, "step": 110 }, { "epoch": 0.14, "learning_rate": 0.0009740712074303407, "logits/chosen": -4.261538505554199, "logits/rejected": -4.124102592468262, "logps/chosen": -443.6263122558594, "logps/rejected": -346.98095703125, "loss": 0.6235, "rewards/accuracies": 0.635937511920929, "rewards/chosen": 0.010320848785340786, "rewards/margins": 0.45627111196517944, "rewards/rejected": -0.4459502696990967, "step": 120 }, { "epoch": 0.15, "learning_rate": 0.0009702012383900929, "logits/chosen": -4.262430191040039, "logits/rejected": -4.121800422668457, "logps/chosen": -443.3226623535156, "logps/rejected": -332.74212646484375, "loss": 0.5839, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.0992940291762352, "rewards/margins": 0.6375746726989746, "rewards/rejected": -0.5382806062698364, "step": 130 }, { "epoch": 0.16, "learning_rate": 0.0009663312693498452, "logits/chosen": -4.235437870025635, "logits/rejected": -4.099891662597656, "logps/chosen": -435.7318420410156, "logps/rejected": -346.4457092285156, "loss": 0.6068, "rewards/accuracies": 0.6875, "rewards/chosen": 0.02139856480062008, "rewards/margins": 0.6516961455345154, "rewards/rejected": -0.6302975416183472, "step": 140 }, { "epoch": 0.17, "learning_rate": 0.0009624613003095976, "logits/chosen": -4.21560001373291, "logits/rejected": -4.1203203201293945, "logps/chosen": -430.91741943359375, "logps/rejected": -350.86297607421875, "loss": 0.5968, "rewards/accuracies": 0.660937488079071, "rewards/chosen": -0.11091029644012451, "rewards/margins": 0.5855100154876709, "rewards/rejected": -0.6964203119277954, "step": 150 }, { "epoch": 0.18, "learning_rate": 0.0009585913312693498, "logits/chosen": -4.225467681884766, "logits/rejected": -4.082638740539551, "logps/chosen": -445.6004333496094, "logps/rejected": -339.7175598144531, "loss": 0.5798, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -0.4418737292289734, "rewards/margins": 0.7666674852371216, "rewards/rejected": -1.2085412740707397, "step": 160 }, { "epoch": 0.19, "learning_rate": 0.0009547213622291022, "logits/chosen": -4.233388900756836, "logits/rejected": -4.118756294250488, "logps/chosen": -458.2998962402344, "logps/rejected": -369.98284912109375, "loss": 0.5881, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.6502363681793213, "rewards/margins": 0.8370116949081421, "rewards/rejected": -1.487248182296753, "step": 170 }, { "epoch": 0.2, "learning_rate": 0.0009508513931888545, "logits/chosen": -4.2017998695373535, "logits/rejected": -4.0570549964904785, "logps/chosen": -473.88446044921875, "logps/rejected": -367.6838073730469, "loss": 0.5645, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -0.03701040893793106, "rewards/margins": 0.7756559252738953, "rewards/rejected": -0.8126662969589233, "step": 180 }, { "epoch": 0.22, "learning_rate": 0.0009469814241486069, "logits/chosen": -4.211549282073975, "logits/rejected": -4.062457084655762, "logps/chosen": -452.24853515625, "logps/rejected": -367.0791015625, "loss": 0.6017, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.10348516702651978, "rewards/margins": 0.7024599313735962, "rewards/rejected": -0.5989748239517212, "step": 190 }, { "epoch": 0.23, "learning_rate": 0.0009431114551083592, "logits/chosen": -4.216209411621094, "logits/rejected": -4.068336486816406, "logps/chosen": -474.145751953125, "logps/rejected": -371.8517150878906, "loss": 0.5743, "rewards/accuracies": 0.715624988079071, "rewards/chosen": 0.10046651214361191, "rewards/margins": 0.7671835422515869, "rewards/rejected": -0.666716992855072, "step": 200 }, { "epoch": 0.24, "learning_rate": 0.0009392414860681115, "logits/chosen": -4.217007637023926, "logits/rejected": -4.085082530975342, "logps/chosen": -453.36968994140625, "logps/rejected": -347.0650939941406, "loss": 0.5803, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.029905159026384354, "rewards/margins": 0.7781075239181519, "rewards/rejected": -0.8080127835273743, "step": 210 }, { "epoch": 0.25, "learning_rate": 0.0009353715170278638, "logits/chosen": -4.216742515563965, "logits/rejected": -4.122211933135986, "logps/chosen": -419.58551025390625, "logps/rejected": -356.08404541015625, "loss": 0.588, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.13175122439861298, "rewards/margins": 0.7468441724777222, "rewards/rejected": -0.6150928735733032, "step": 220 }, { "epoch": 0.26, "learning_rate": 0.0009315015479876161, "logits/chosen": -4.219125270843506, "logits/rejected": -4.066555023193359, "logps/chosen": -481.00860595703125, "logps/rejected": -359.76727294921875, "loss": 0.5648, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16599717736244202, "rewards/margins": 0.7627758979797363, "rewards/rejected": -0.5967787504196167, "step": 230 }, { "epoch": 0.27, "learning_rate": 0.0009276315789473685, "logits/chosen": -4.2346296310424805, "logits/rejected": -4.063499927520752, "logps/chosen": -418.7252502441406, "logps/rejected": -333.09735107421875, "loss": 0.5835, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.11978032439947128, "rewards/margins": 0.8224458694458008, "rewards/rejected": -0.9422264099121094, "step": 240 }, { "epoch": 0.28, "learning_rate": 0.0009237616099071207, "logits/chosen": -4.208186149597168, "logits/rejected": -4.023766994476318, "logps/chosen": -438.6776428222656, "logps/rejected": -338.7682189941406, "loss": 0.5578, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -0.31454938650131226, "rewards/margins": 0.8216124773025513, "rewards/rejected": -1.1361620426177979, "step": 250 }, { "epoch": 0.3, "learning_rate": 0.0009198916408668731, "logits/chosen": -4.2468719482421875, "logits/rejected": -4.018538475036621, "logps/chosen": -489.1776428222656, "logps/rejected": -366.78851318359375, "loss": 0.57, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.12536367774009705, "rewards/margins": 0.8402262926101685, "rewards/rejected": -0.9655898809432983, "step": 260 }, { "epoch": 0.31, "learning_rate": 0.0009160216718266254, "logits/chosen": -4.240145683288574, "logits/rejected": -4.0663580894470215, "logps/chosen": -436.77545166015625, "logps/rejected": -332.03851318359375, "loss": 0.5582, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.08584287017583847, "rewards/margins": 0.8074740171432495, "rewards/rejected": -0.7216311693191528, "step": 270 }, { "epoch": 0.32, "learning_rate": 0.0009121517027863777, "logits/chosen": -4.225857734680176, "logits/rejected": -4.034966468811035, "logps/chosen": -450.14349365234375, "logps/rejected": -341.7889099121094, "loss": 0.57, "rewards/accuracies": 0.714062511920929, "rewards/chosen": 0.1979350745677948, "rewards/margins": 0.8185775876045227, "rewards/rejected": -0.6206425428390503, "step": 280 }, { "epoch": 0.33, "learning_rate": 0.0009082817337461301, "logits/chosen": -4.216963291168213, "logits/rejected": -4.057393550872803, "logps/chosen": -468.74169921875, "logps/rejected": -357.6235656738281, "loss": 0.5836, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -0.5162747502326965, "rewards/margins": 0.9472195506095886, "rewards/rejected": -1.4634945392608643, "step": 290 }, { "epoch": 0.34, "learning_rate": 0.0009044117647058824, "logits/chosen": -4.233221054077148, "logits/rejected": -4.078063011169434, "logps/chosen": -448.629150390625, "logps/rejected": -341.4031677246094, "loss": 0.5491, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5640617609024048, "rewards/margins": 0.863238513469696, "rewards/rejected": -1.427300214767456, "step": 300 }, { "epoch": 0.34, "eval_logits/chosen": -4.225715160369873, "eval_logits/rejected": -4.059168815612793, "eval_logps/chosen": -448.5167236328125, "eval_logps/rejected": -351.014892578125, "eval_loss": 0.5718809962272644, "eval_rewards/accuracies": 0.7015333771705627, "eval_rewards/chosen": -0.5175743699073792, "eval_rewards/margins": 0.8181003332138062, "eval_rewards/rejected": -1.3356746435165405, "eval_runtime": 593.2365, "eval_samples_per_second": 3.076, "eval_steps_per_second": 1.539, "step": 300 }, { "epoch": 0.35, "learning_rate": 0.0009005417956656347, "logits/chosen": -4.243526935577393, "logits/rejected": -4.068151950836182, "logps/chosen": -455.3677673339844, "logps/rejected": -330.9251708984375, "loss": 0.5776, "rewards/accuracies": 0.703125, "rewards/chosen": -0.47484833002090454, "rewards/margins": 0.8207438588142395, "rewards/rejected": -1.2955920696258545, "step": 310 }, { "epoch": 0.36, "learning_rate": 0.000896671826625387, "logits/chosen": -4.218557834625244, "logits/rejected": -4.050337791442871, "logps/chosen": -474.8924865722656, "logps/rejected": -366.99725341796875, "loss": 0.5714, "rewards/accuracies": 0.692187488079071, "rewards/chosen": -0.0741639956831932, "rewards/margins": 0.8207231760025024, "rewards/rejected": -0.8948871493339539, "step": 320 }, { "epoch": 0.38, "learning_rate": 0.0008928018575851393, "logits/chosen": -4.208563804626465, "logits/rejected": -4.052390098571777, "logps/chosen": -457.8515625, "logps/rejected": -363.40972900390625, "loss": 0.5531, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.27610406279563904, "rewards/margins": 0.9492728114128113, "rewards/rejected": -1.225376844406128, "step": 330 }, { "epoch": 0.39, "learning_rate": 0.0008889318885448917, "logits/chosen": -4.221163272857666, "logits/rejected": -4.103056907653809, "logps/chosen": -431.69390869140625, "logps/rejected": -341.13616943359375, "loss": 0.6064, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.12021847069263458, "rewards/margins": 0.6823347210884094, "rewards/rejected": -0.562116265296936, "step": 340 }, { "epoch": 0.4, "learning_rate": 0.000885061919504644, "logits/chosen": -4.2203521728515625, "logits/rejected": -4.039450168609619, "logps/chosen": -452.77398681640625, "logps/rejected": -352.8782653808594, "loss": 0.5576, "rewards/accuracies": 0.723437488079071, "rewards/chosen": 0.0901552215218544, "rewards/margins": 0.8166009187698364, "rewards/rejected": -0.7264456748962402, "step": 350 }, { "epoch": 0.41, "learning_rate": 0.0008811919504643962, "logits/chosen": -4.21645450592041, "logits/rejected": -4.021177291870117, "logps/chosen": -459.27081298828125, "logps/rejected": -332.7095031738281, "loss": 0.5921, "rewards/accuracies": 0.667187511920929, "rewards/chosen": -0.050455838441848755, "rewards/margins": 0.8579805493354797, "rewards/rejected": -0.9084362983703613, "step": 360 }, { "epoch": 0.42, "learning_rate": 0.0008773219814241487, "logits/chosen": -4.236178874969482, "logits/rejected": -4.036398887634277, "logps/chosen": -426.27215576171875, "logps/rejected": -316.69879150390625, "loss": 0.5465, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.20080848038196564, "rewards/margins": 0.8921493291854858, "rewards/rejected": -1.092957854270935, "step": 370 }, { "epoch": 0.43, "learning_rate": 0.0008734520123839009, "logits/chosen": -4.229998588562012, "logits/rejected": -4.023078441619873, "logps/chosen": -449.428955078125, "logps/rejected": -343.252197265625, "loss": 0.5287, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.3050183355808258, "rewards/margins": 0.9389355778694153, "rewards/rejected": -1.2439539432525635, "step": 380 }, { "epoch": 0.44, "learning_rate": 0.0008695820433436534, "logits/chosen": -4.2357964515686035, "logits/rejected": -4.096763610839844, "logps/chosen": -454.3067932128906, "logps/rejected": -357.83856201171875, "loss": 0.5722, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5329670310020447, "rewards/margins": 1.023606300354004, "rewards/rejected": -1.5565732717514038, "step": 390 }, { "epoch": 0.45, "learning_rate": 0.0008657120743034056, "logits/chosen": -4.255030632019043, "logits/rejected": -4.100337982177734, "logps/chosen": -457.13336181640625, "logps/rejected": -365.8040771484375, "loss": 0.5788, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.6265755295753479, "rewards/margins": 0.8858826756477356, "rewards/rejected": -1.5124582052230835, "step": 400 }, { "epoch": 0.47, "learning_rate": 0.0008618421052631579, "logits/chosen": -4.208388328552246, "logits/rejected": -4.026553153991699, "logps/chosen": -470.4761657714844, "logps/rejected": -348.3751220703125, "loss": 0.548, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.5182205438613892, "rewards/margins": 0.9629500508308411, "rewards/rejected": -1.4811705350875854, "step": 410 }, { "epoch": 0.48, "learning_rate": 0.0008579721362229103, "logits/chosen": -4.221690654754639, "logits/rejected": -4.037428379058838, "logps/chosen": -459.80535888671875, "logps/rejected": -356.7042236328125, "loss": 0.5592, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.47319111227989197, "rewards/margins": 1.0091335773468018, "rewards/rejected": -1.482324481010437, "step": 420 }, { "epoch": 0.49, "learning_rate": 0.0008541021671826625, "logits/chosen": -4.177244663238525, "logits/rejected": -3.998875856399536, "logps/chosen": -457.3941955566406, "logps/rejected": -363.4350891113281, "loss": 0.5895, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -0.29423946142196655, "rewards/margins": 0.9634512662887573, "rewards/rejected": -1.257690668106079, "step": 430 }, { "epoch": 0.5, "learning_rate": 0.0008502321981424149, "logits/chosen": -4.2054243087768555, "logits/rejected": -4.038780212402344, "logps/chosen": -472.5858459472656, "logps/rejected": -368.7754821777344, "loss": 0.5887, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.5265928506851196, "rewards/margins": 0.8209060430526733, "rewards/rejected": -1.347498893737793, "step": 440 }, { "epoch": 0.51, "learning_rate": 0.0008463622291021672, "logits/chosen": -4.216706275939941, "logits/rejected": -3.987858533859253, "logps/chosen": -486.7978515625, "logps/rejected": -349.7285461425781, "loss": 0.5613, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.3457922637462616, "rewards/margins": 0.9088281393051147, "rewards/rejected": -1.2546203136444092, "step": 450 }, { "epoch": 0.52, "learning_rate": 0.0008424922600619195, "logits/chosen": -4.199111461639404, "logits/rejected": -4.025119304656982, "logps/chosen": -468.31280517578125, "logps/rejected": -367.52484130859375, "loss": 0.6077, "rewards/accuracies": 0.682812511920929, "rewards/chosen": -0.7301903367042542, "rewards/margins": 1.0099103450775146, "rewards/rejected": -1.740100622177124, "step": 460 }, { "epoch": 0.53, "learning_rate": 0.0008386222910216718, "logits/chosen": -4.244906902313232, "logits/rejected": -4.096035480499268, "logps/chosen": -436.56280517578125, "logps/rejected": -332.5130920410156, "loss": 0.5661, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.5279966592788696, "rewards/margins": 0.8724597692489624, "rewards/rejected": -1.4004563093185425, "step": 470 }, { "epoch": 0.55, "learning_rate": 0.0008347523219814242, "logits/chosen": -4.250918388366699, "logits/rejected": -4.056090354919434, "logps/chosen": -467.1234436035156, "logps/rejected": -345.65924072265625, "loss": 0.5617, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.03435497730970383, "rewards/margins": 0.9432379603385925, "rewards/rejected": -0.977592945098877, "step": 480 }, { "epoch": 0.56, "learning_rate": 0.0008308823529411765, "logits/chosen": -4.242486953735352, "logits/rejected": -4.078185081481934, "logps/chosen": -467.85528564453125, "logps/rejected": -348.805908203125, "loss": 0.5638, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.49105095863342285, "rewards/margins": 0.9864526987075806, "rewards/rejected": -1.477503776550293, "step": 490 }, { "epoch": 0.57, "learning_rate": 0.0008270123839009289, "logits/chosen": -4.2404680252075195, "logits/rejected": -4.057986259460449, "logps/chosen": -465.6255798339844, "logps/rejected": -360.9566955566406, "loss": 0.5819, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.6507476568222046, "rewards/margins": 0.8752163648605347, "rewards/rejected": -1.5259640216827393, "step": 500 }, { "epoch": 0.58, "learning_rate": 0.0008231424148606811, "logits/chosen": -4.193081855773926, "logits/rejected": -3.991605043411255, "logps/chosen": -450.4684143066406, "logps/rejected": -342.21734619140625, "loss": 0.5572, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.4209820330142975, "rewards/margins": 0.8563458323478699, "rewards/rejected": -1.2773278951644897, "step": 510 }, { "epoch": 0.59, "learning_rate": 0.0008192724458204334, "logits/chosen": -4.205255031585693, "logits/rejected": -4.04762601852417, "logps/chosen": -431.84381103515625, "logps/rejected": -349.4865417480469, "loss": 0.5625, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -0.36887282133102417, "rewards/margins": 0.9660455584526062, "rewards/rejected": -1.3349183797836304, "step": 520 }, { "epoch": 0.6, "learning_rate": 0.0008154024767801858, "logits/chosen": -4.2263288497924805, "logits/rejected": -4.053371906280518, "logps/chosen": -464.28045654296875, "logps/rejected": -359.5805358886719, "loss": 0.5652, "rewards/accuracies": 0.71875, "rewards/chosen": -0.45193976163864136, "rewards/margins": 0.9424787759780884, "rewards/rejected": -1.394418478012085, "step": 530 }, { "epoch": 0.61, "learning_rate": 0.0008115325077399381, "logits/chosen": -4.186491966247559, "logits/rejected": -4.039113521575928, "logps/chosen": -419.86212158203125, "logps/rejected": -337.95513916015625, "loss": 0.5605, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -0.5429974794387817, "rewards/margins": 0.9306566119194031, "rewards/rejected": -1.47365403175354, "step": 540 }, { "epoch": 0.63, "learning_rate": 0.0008076625386996904, "logits/chosen": -4.15422248840332, "logits/rejected": -4.037220001220703, "logps/chosen": -450.1904296875, "logps/rejected": -367.74444580078125, "loss": 0.5849, "rewards/accuracies": 0.7046874761581421, "rewards/chosen": -1.003206491470337, "rewards/margins": 0.9363033175468445, "rewards/rejected": -1.9395097494125366, "step": 550 }, { "epoch": 0.64, "learning_rate": 0.0008037925696594427, "logits/chosen": -4.185969829559326, "logits/rejected": -4.027823448181152, "logps/chosen": -459.4979553222656, "logps/rejected": -355.8067321777344, "loss": 0.5582, "rewards/accuracies": 0.7203124761581421, "rewards/chosen": -0.6537607908248901, "rewards/margins": 0.9692694544792175, "rewards/rejected": -1.623030424118042, "step": 560 }, { "epoch": 0.65, "learning_rate": 0.0007999226006191951, "logits/chosen": -4.221198081970215, "logits/rejected": -4.0832061767578125, "logps/chosen": -442.4005432128906, "logps/rejected": -345.3169250488281, "loss": 0.5523, "rewards/accuracies": 0.714062511920929, "rewards/chosen": -0.4901123046875, "rewards/margins": 1.0185575485229492, "rewards/rejected": -1.5086697340011597, "step": 570 }, { "epoch": 0.66, "learning_rate": 0.0007960526315789473, "logits/chosen": -4.259437561035156, "logits/rejected": -4.080585479736328, "logps/chosen": -478.7291564941406, "logps/rejected": -348.29296875, "loss": 0.5721, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.7010415196418762, "rewards/margins": 1.0288156270980835, "rewards/rejected": -1.7298572063446045, "step": 580 }, { "epoch": 0.67, "learning_rate": 0.0007921826625386998, "logits/chosen": -4.27069091796875, "logits/rejected": -4.115513801574707, "logps/chosen": -472.3595275878906, "logps/rejected": -377.7236328125, "loss": 0.5274, "rewards/accuracies": 0.729687511920929, "rewards/chosen": -0.7926375865936279, "rewards/margins": 1.0614356994628906, "rewards/rejected": -1.8540732860565186, "step": 590 }, { "epoch": 0.68, "learning_rate": 0.000788312693498452, "logits/chosen": -4.245499610900879, "logits/rejected": -4.060398101806641, "logps/chosen": -477.1275329589844, "logps/rejected": -353.2635803222656, "loss": 0.5906, "rewards/accuracies": 0.6859375238418579, "rewards/chosen": -0.5408458709716797, "rewards/margins": 0.9330792427062988, "rewards/rejected": -1.4739251136779785, "step": 600 }, { "epoch": 0.68, "eval_logits/chosen": -4.223949432373047, "eval_logits/rejected": -4.073083400726318, "eval_logps/chosen": -446.7061462402344, "eval_logps/rejected": -350.43695068359375, "eval_loss": 0.562454104423523, "eval_rewards/accuracies": 0.7190580368041992, "eval_rewards/chosen": -0.33651816844940186, "eval_rewards/margins": 0.9413639307022095, "eval_rewards/rejected": -1.2778822183609009, "eval_runtime": 590.319, "eval_samples_per_second": 3.092, "eval_steps_per_second": 1.547, "step": 600 }, { "epoch": 0.69, "learning_rate": 0.0007844427244582043, "logits/chosen": -4.2515058517456055, "logits/rejected": -4.036902904510498, "logps/chosen": -470.8296813964844, "logps/rejected": -350.6585998535156, "loss": 0.5399, "rewards/accuracies": 0.7359374761581421, "rewards/chosen": -0.5778743624687195, "rewards/margins": 1.0558364391326904, "rewards/rejected": -1.6337108612060547, "step": 610 }, { "epoch": 0.7, "learning_rate": 0.0007805727554179567, "logits/chosen": -4.220894813537598, "logits/rejected": -4.041962146759033, "logps/chosen": -452.2320251464844, "logps/rejected": -348.2469177246094, "loss": 0.5269, "rewards/accuracies": 0.721875011920929, "rewards/chosen": -0.9187246561050415, "rewards/margins": 1.1342394351959229, "rewards/rejected": -2.052963972091675, "step": 620 }, { "epoch": 0.72, "learning_rate": 0.0007767027863777089, "logits/chosen": -4.235379219055176, "logits/rejected": -4.08841609954834, "logps/chosen": -430.2784729003906, "logps/rejected": -358.0335998535156, "loss": 0.5621, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -1.1647307872772217, "rewards/margins": 1.0450899600982666, "rewards/rejected": -2.2098209857940674, "step": 630 }, { "epoch": 0.73, "learning_rate": 0.0007728328173374614, "logits/chosen": -4.224268913269043, "logits/rejected": -4.0653910636901855, "logps/chosen": -489.84735107421875, "logps/rejected": -373.8092346191406, "loss": 0.5993, "rewards/accuracies": 0.698437511920929, "rewards/chosen": -1.0495657920837402, "rewards/margins": 0.9578658938407898, "rewards/rejected": -2.007431745529175, "step": 640 }, { "epoch": 0.74, "learning_rate": 0.0007689628482972136, "logits/chosen": -4.2216057777404785, "logits/rejected": -4.0632643699646, "logps/chosen": -475.91986083984375, "logps/rejected": -368.77337646484375, "loss": 0.5889, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.49395233392715454, "rewards/margins": 0.7953552007675171, "rewards/rejected": -1.289307713508606, "step": 650 }, { "epoch": 0.75, "learning_rate": 0.0007650928792569659, "logits/chosen": -4.218358039855957, "logits/rejected": -4.033026218414307, "logps/chosen": -459.79925537109375, "logps/rejected": -343.3696594238281, "loss": 0.5644, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8286250233650208, "rewards/margins": 0.9686861038208008, "rewards/rejected": -1.7973114252090454, "step": 660 }, { "epoch": 0.76, "learning_rate": 0.0007612229102167183, "logits/chosen": -4.195998191833496, "logits/rejected": -4.0402655601501465, "logps/chosen": -463.3319396972656, "logps/rejected": -362.04693603515625, "loss": 0.5395, "rewards/accuracies": 0.7484375238418579, "rewards/chosen": -0.5008456110954285, "rewards/margins": 1.2037017345428467, "rewards/rejected": -1.7045472860336304, "step": 670 }, { "epoch": 0.77, "learning_rate": 0.0007573529411764706, "logits/chosen": -4.232239723205566, "logits/rejected": -4.063216209411621, "logps/chosen": -472.3545837402344, "logps/rejected": -361.09124755859375, "loss": 0.5958, "rewards/accuracies": 0.703125, "rewards/chosen": -0.38900110125541687, "rewards/margins": 0.9455035328865051, "rewards/rejected": -1.33450448513031, "step": 680 }, { "epoch": 0.78, "learning_rate": 0.000753482972136223, "logits/chosen": -4.21661901473999, "logits/rejected": -4.030766010284424, "logps/chosen": -445.18634033203125, "logps/rejected": -338.87237548828125, "loss": 0.5465, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6540600061416626, "rewards/margins": 1.0243606567382812, "rewards/rejected": -1.6784206628799438, "step": 690 }, { "epoch": 0.8, "learning_rate": 0.0007496130030959753, "logits/chosen": -4.246059417724609, "logits/rejected": -4.067647457122803, "logps/chosen": -479.13018798828125, "logps/rejected": -367.77838134765625, "loss": 0.5238, "rewards/accuracies": 0.7281249761581421, "rewards/chosen": -0.8584390878677368, "rewards/margins": 1.0988075733184814, "rewards/rejected": -1.9572465419769287, "step": 700 }, { "epoch": 0.81, "learning_rate": 0.0007457430340557275, "logits/chosen": -4.234969139099121, "logits/rejected": -4.097445487976074, "logps/chosen": -440.17401123046875, "logps/rejected": -352.2328796386719, "loss": 0.5792, "rewards/accuracies": 0.715624988079071, "rewards/chosen": -0.973364531993866, "rewards/margins": 0.9359954595565796, "rewards/rejected": -1.9093602895736694, "step": 710 }, { "epoch": 0.82, "learning_rate": 0.0007418730650154799, "logits/chosen": -4.252785682678223, "logits/rejected": -4.084169387817383, "logps/chosen": -441.3429260253906, "logps/rejected": -343.9219665527344, "loss": 0.5789, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6798723340034485, "rewards/margins": 0.9376944303512573, "rewards/rejected": -1.6175668239593506, "step": 720 }, { "epoch": 0.83, "learning_rate": 0.0007380030959752322, "logits/chosen": -4.222377300262451, "logits/rejected": -4.052145004272461, "logps/chosen": -466.407470703125, "logps/rejected": -377.0411071777344, "loss": 0.5509, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5104158520698547, "rewards/margins": 0.9706568717956543, "rewards/rejected": -1.4810726642608643, "step": 730 }, { "epoch": 0.84, "learning_rate": 0.0007341331269349845, "logits/chosen": -4.217472076416016, "logits/rejected": -4.044188499450684, "logps/chosen": -442.749267578125, "logps/rejected": -337.6766052246094, "loss": 0.543, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6436838507652283, "rewards/margins": 1.1020640134811401, "rewards/rejected": -1.7457481622695923, "step": 740 }, { "epoch": 0.85, "learning_rate": 0.0007302631578947369, "logits/chosen": -4.212157249450684, "logits/rejected": -4.056931018829346, "logps/chosen": -460.8367614746094, "logps/rejected": -363.6398620605469, "loss": 0.5689, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.9215582013130188, "rewards/margins": 0.977275013923645, "rewards/rejected": -1.8988332748413086, "step": 750 }, { "epoch": 0.86, "learning_rate": 0.0007263931888544891, "logits/chosen": -4.2109246253967285, "logits/rejected": -4.057803153991699, "logps/chosen": -448.67822265625, "logps/rejected": -354.95428466796875, "loss": 0.5578, "rewards/accuracies": 0.7171875238418579, "rewards/chosen": -0.3920624852180481, "rewards/margins": 0.9965153932571411, "rewards/rejected": -1.3885778188705444, "step": 760 }, { "epoch": 0.88, "learning_rate": 0.0007225232198142416, "logits/chosen": -4.247252464294434, "logits/rejected": -4.084366798400879, "logps/chosen": -454.74383544921875, "logps/rejected": -364.98345947265625, "loss": 0.5345, "rewards/accuracies": 0.7265625, "rewards/chosen": -0.5170811414718628, "rewards/margins": 1.0027344226837158, "rewards/rejected": -1.519815444946289, "step": 770 }, { "epoch": 0.89, "learning_rate": 0.0007186532507739938, "logits/chosen": -4.232919692993164, "logits/rejected": -4.057995796203613, "logps/chosen": -443.6820373535156, "logps/rejected": -347.55023193359375, "loss": 0.5737, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.654302716255188, "rewards/margins": 1.0428025722503662, "rewards/rejected": -1.6971051692962646, "step": 780 }, { "epoch": 0.9, "learning_rate": 0.0007147832817337462, "logits/chosen": -4.252163887023926, "logits/rejected": -4.024879455566406, "logps/chosen": -452.18499755859375, "logps/rejected": -334.55206298828125, "loss": 0.5951, "rewards/accuracies": 0.6781250238418579, "rewards/chosen": -0.4866858422756195, "rewards/margins": 0.9653654098510742, "rewards/rejected": -1.4520511627197266, "step": 790 }, { "epoch": 0.91, "learning_rate": 0.0007109133126934985, "logits/chosen": -4.257702827453613, "logits/rejected": -4.057846546173096, "logps/chosen": -453.58154296875, "logps/rejected": -333.16119384765625, "loss": 0.5084, "rewards/accuracies": 0.739062488079071, "rewards/chosen": -0.6065333485603333, "rewards/margins": 1.1738460063934326, "rewards/rejected": -1.780379295349121, "step": 800 }, { "epoch": 0.92, "learning_rate": 0.0007070433436532507, "logits/chosen": -4.221260070800781, "logits/rejected": -4.054490089416504, "logps/chosen": -462.25030517578125, "logps/rejected": -361.7946472167969, "loss": 0.538, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7678021788597107, "rewards/margins": 1.1575754880905151, "rewards/rejected": -1.9253774881362915, "step": 810 }, { "epoch": 0.93, "learning_rate": 0.0007031733746130031, "logits/chosen": -4.260991096496582, "logits/rejected": -4.087908744812012, "logps/chosen": -457.4361267089844, "logps/rejected": -356.0791320800781, "loss": 0.5477, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8215168714523315, "rewards/margins": 0.9859598875045776, "rewards/rejected": -1.8074767589569092, "step": 820 }, { "epoch": 0.94, "learning_rate": 0.0006993034055727554, "logits/chosen": -4.225368022918701, "logits/rejected": -4.0482635498046875, "logps/chosen": -435.1627502441406, "logps/rejected": -336.87432861328125, "loss": 0.5509, "rewards/accuracies": 0.707812488079071, "rewards/chosen": -0.4658663272857666, "rewards/margins": 1.0167793035507202, "rewards/rejected": -1.4826457500457764, "step": 830 }, { "epoch": 0.96, "learning_rate": 0.0006954334365325078, "logits/chosen": -4.232728481292725, "logits/rejected": -4.054437637329102, "logps/chosen": -439.06585693359375, "logps/rejected": -340.2679443359375, "loss": 0.576, "rewards/accuracies": 0.7093750238418579, "rewards/chosen": -0.1597704142332077, "rewards/margins": 0.947851836681366, "rewards/rejected": -1.1076223850250244, "step": 840 }, { "epoch": 0.97, "learning_rate": 0.00069156346749226, "logits/chosen": -4.233127593994141, "logits/rejected": -4.058409690856934, "logps/chosen": -464.802734375, "logps/rejected": -361.8938293457031, "loss": 0.5979, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.14404229819774628, "rewards/margins": 0.9432948231697083, "rewards/rejected": -1.0873371362686157, "step": 850 }, { "epoch": 0.98, "learning_rate": 0.0006876934984520124, "logits/chosen": -4.234601020812988, "logits/rejected": -4.063669681549072, "logps/chosen": -439.56231689453125, "logps/rejected": -347.49786376953125, "loss": 0.5773, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.37241753935813904, "rewards/margins": 0.894565761089325, "rewards/rejected": -1.2669832706451416, "step": 860 }, { "epoch": 0.99, "learning_rate": 0.0006838235294117647, "logits/chosen": -4.234360218048096, "logits/rejected": -4.055201053619385, "logps/chosen": -457.45562744140625, "logps/rejected": -368.09136962890625, "loss": 0.5691, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.04513268545269966, "rewards/margins": 0.9006785154342651, "rewards/rejected": -0.9458112716674805, "step": 870 }, { "epoch": 1.0, "learning_rate": 0.000679953560371517, "logits/chosen": -4.225511074066162, "logits/rejected": -4.079733848571777, "logps/chosen": -445.68115234375, "logps/rejected": -359.98541259765625, "loss": 0.5381, "rewards/accuracies": 0.7265625, "rewards/chosen": 0.019000979140400887, "rewards/margins": 1.238950490951538, "rewards/rejected": -1.2199493646621704, "step": 880 }, { "epoch": 1.01, "learning_rate": 0.0006760835913312694, "logits/chosen": -4.246570110321045, "logits/rejected": -4.105473518371582, "logps/chosen": -427.12237548828125, "logps/rejected": -368.8031311035156, "loss": 0.3229, "rewards/accuracies": 0.871874988079071, "rewards/chosen": 0.28819674253463745, "rewards/margins": 1.8582813739776611, "rewards/rejected": -1.570084571838379, "step": 890 }, { "epoch": 1.02, "learning_rate": 0.0006722136222910217, "logits/chosen": -4.274839401245117, "logits/rejected": -4.126755714416504, "logps/chosen": -456.762451171875, "logps/rejected": -371.27484130859375, "loss": 0.2857, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 0.11797485500574112, "rewards/margins": 2.11262845993042, "rewards/rejected": -1.9946534633636475, "step": 900 }, { "epoch": 1.02, "eval_logits/chosen": -4.2331862449646, "eval_logits/rejected": -4.075270652770996, "eval_logps/chosen": -447.2226257324219, "eval_logps/rejected": -353.63677978515625, "eval_loss": 0.5723168849945068, "eval_rewards/accuracies": 0.7141292691230774, "eval_rewards/chosen": -0.388161301612854, "eval_rewards/margins": 1.2097017765045166, "eval_rewards/rejected": -1.5978630781173706, "eval_runtime": 592.0319, "eval_samples_per_second": 3.083, "eval_steps_per_second": 1.542, "step": 900 }, { "epoch": 1.03, "learning_rate": 0.000668343653250774, "logits/chosen": -4.225554466247559, "logits/rejected": -4.041362285614014, "logps/chosen": -452.87408447265625, "logps/rejected": -351.7007141113281, "loss": 0.2688, "rewards/accuracies": 0.8984375, "rewards/chosen": 0.06683386862277985, "rewards/margins": 2.357351779937744, "rewards/rejected": -2.290517568588257, "step": 910 }, { "epoch": 1.05, "learning_rate": 0.0006644736842105263, "logits/chosen": -4.213166236877441, "logits/rejected": -4.0615129470825195, "logps/chosen": -448.83087158203125, "logps/rejected": -364.64617919921875, "loss": 0.2707, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.09975646436214447, "rewards/margins": 2.239344358444214, "rewards/rejected": -2.3391008377075195, "step": 920 }, { "epoch": 1.06, "learning_rate": 0.0006606037151702786, "logits/chosen": -4.205303192138672, "logits/rejected": -4.064006805419922, "logps/chosen": -465.90460205078125, "logps/rejected": -354.5012512207031, "loss": 0.2792, "rewards/accuracies": 0.871874988079071, "rewards/chosen": -0.3488604426383972, "rewards/margins": 2.1347193717956543, "rewards/rejected": -2.4835798740386963, "step": 930 }, { "epoch": 1.07, "learning_rate": 0.000656733746130031, "logits/chosen": -4.223511219024658, "logits/rejected": -4.06416130065918, "logps/chosen": -466.5729064941406, "logps/rejected": -358.587158203125, "loss": 0.2436, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.18780146539211273, "rewards/margins": 2.4119653701782227, "rewards/rejected": -2.5997672080993652, "step": 940 }, { "epoch": 1.08, "learning_rate": 0.0006528637770897833, "logits/chosen": -4.214144706726074, "logits/rejected": -4.037665367126465, "logps/chosen": -453.7357482910156, "logps/rejected": -353.1199035644531, "loss": 0.2788, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.42910218238830566, "rewards/margins": 2.359564781188965, "rewards/rejected": -2.7886669635772705, "step": 950 }, { "epoch": 1.09, "learning_rate": 0.0006489938080495356, "logits/chosen": -4.2282891273498535, "logits/rejected": -4.075848579406738, "logps/chosen": -470.90435791015625, "logps/rejected": -371.6830139160156, "loss": 0.2493, "rewards/accuracies": 0.917187511920929, "rewards/chosen": -0.45380687713623047, "rewards/margins": 2.4965717792510986, "rewards/rejected": -2.950378894805908, "step": 960 }, { "epoch": 1.1, "learning_rate": 0.000645123839009288, "logits/chosen": -4.232088088989258, "logits/rejected": -4.066346645355225, "logps/chosen": -471.94677734375, "logps/rejected": -359.829833984375, "loss": 0.262, "rewards/accuracies": 0.895312488079071, "rewards/chosen": -0.5010432600975037, "rewards/margins": 2.357407808303833, "rewards/rejected": -2.8584513664245605, "step": 970 }, { "epoch": 1.11, "learning_rate": 0.0006412538699690402, "logits/chosen": -4.201035022735596, "logits/rejected": -4.0452375411987305, "logps/chosen": -474.99005126953125, "logps/rejected": -360.5128173828125, "loss": 0.233, "rewards/accuracies": 0.921875, "rewards/chosen": -0.3553507924079895, "rewards/margins": 2.447514057159424, "rewards/rejected": -2.8028650283813477, "step": 980 }, { "epoch": 1.13, "learning_rate": 0.0006373839009287927, "logits/chosen": -4.193148136138916, "logits/rejected": -4.046982288360596, "logps/chosen": -447.2930603027344, "logps/rejected": -388.985107421875, "loss": 0.2468, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.427713543176651, "rewards/margins": 2.6149280071258545, "rewards/rejected": -3.0426418781280518, "step": 990 }, { "epoch": 1.14, "learning_rate": 0.0006335139318885449, "logits/chosen": -4.193198204040527, "logits/rejected": -4.01694393157959, "logps/chosen": -451.1202087402344, "logps/rejected": -343.1644592285156, "loss": 0.2337, "rewards/accuracies": 0.9234374761581421, "rewards/chosen": -0.187601700425148, "rewards/margins": 2.5318427085876465, "rewards/rejected": -2.719444513320923, "step": 1000 }, { "epoch": 1.15, "learning_rate": 0.0006296439628482972, "logits/chosen": -4.211307525634766, "logits/rejected": -4.038030624389648, "logps/chosen": -474.0377502441406, "logps/rejected": -378.894775390625, "loss": 0.2543, "rewards/accuracies": 0.890625, "rewards/chosen": -0.18276789784431458, "rewards/margins": 2.6085612773895264, "rewards/rejected": -2.7913291454315186, "step": 1010 }, { "epoch": 1.16, "learning_rate": 0.0006257739938080496, "logits/chosen": -4.201509475708008, "logits/rejected": -4.06945276260376, "logps/chosen": -479.3910217285156, "logps/rejected": -384.2267150878906, "loss": 0.2583, "rewards/accuracies": 0.901562511920929, "rewards/chosen": -0.1512058973312378, "rewards/margins": 2.4821362495422363, "rewards/rejected": -2.6333422660827637, "step": 1020 }, { "epoch": 1.17, "learning_rate": 0.0006219040247678018, "logits/chosen": -4.24030065536499, "logits/rejected": -4.064594745635986, "logps/chosen": -456.9732971191406, "logps/rejected": -366.6795959472656, "loss": 0.2742, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.04681391641497612, "rewards/margins": 2.263611316680908, "rewards/rejected": -2.310425281524658, "step": 1030 }, { "epoch": 1.18, "learning_rate": 0.0006180340557275542, "logits/chosen": -4.2289323806762695, "logits/rejected": -4.074105262756348, "logps/chosen": -462.69769287109375, "logps/rejected": -379.8485107421875, "loss": 0.2296, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.003292396664619446, "rewards/margins": 2.477797508239746, "rewards/rejected": -2.4745049476623535, "step": 1040 }, { "epoch": 1.19, "learning_rate": 0.0006141640866873065, "logits/chosen": -4.185556411743164, "logits/rejected": -4.048288345336914, "logps/chosen": -460.12786865234375, "logps/rejected": -387.6737060546875, "loss": 0.2573, "rewards/accuracies": 0.885937511920929, "rewards/chosen": -0.4078907072544098, "rewards/margins": 2.5121066570281982, "rewards/rejected": -2.9199976921081543, "step": 1050 }, { "epoch": 1.21, "learning_rate": 0.0006102941176470589, "logits/chosen": -4.197637557983398, "logits/rejected": -4.088327884674072, "logps/chosen": -438.728271484375, "logps/rejected": -376.8957824707031, "loss": 0.2591, "rewards/accuracies": 0.8890625238418579, "rewards/chosen": -0.31245797872543335, "rewards/margins": 2.3101813793182373, "rewards/rejected": -2.6226391792297363, "step": 1060 }, { "epoch": 1.22, "learning_rate": 0.0006064241486068111, "logits/chosen": -4.140993118286133, "logits/rejected": -4.001421928405762, "logps/chosen": -447.58270263671875, "logps/rejected": -349.8966064453125, "loss": 0.2612, "rewards/accuracies": 0.890625, "rewards/chosen": -0.1900818943977356, "rewards/margins": 2.3712151050567627, "rewards/rejected": -2.5612969398498535, "step": 1070 }, { "epoch": 1.23, "learning_rate": 0.0006025541795665635, "logits/chosen": -4.1848578453063965, "logits/rejected": -4.041143417358398, "logps/chosen": -479.5250549316406, "logps/rejected": -378.3609313964844, "loss": 0.2525, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -0.3640068471431732, "rewards/margins": 2.4674534797668457, "rewards/rejected": -2.8314602375030518, "step": 1080 }, { "epoch": 1.24, "learning_rate": 0.0005986842105263158, "logits/chosen": -4.173763751983643, "logits/rejected": -4.0594987869262695, "logps/chosen": -452.22222900390625, "logps/rejected": -372.42584228515625, "loss": 0.2625, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -0.711100161075592, "rewards/margins": 2.5092108249664307, "rewards/rejected": -3.220310926437378, "step": 1090 }, { "epoch": 1.25, "learning_rate": 0.0005948142414860681, "logits/chosen": -4.2016401290893555, "logits/rejected": -4.05678653717041, "logps/chosen": -466.990478515625, "logps/rejected": -377.10821533203125, "loss": 0.2426, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -0.5939540863037109, "rewards/margins": 2.577214479446411, "rewards/rejected": -3.171168565750122, "step": 1100 }, { "epoch": 1.26, "learning_rate": 0.0005909442724458205, "logits/chosen": -4.171450614929199, "logits/rejected": -4.015393257141113, "logps/chosen": -436.8499450683594, "logps/rejected": -348.46063232421875, "loss": 0.2582, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.5436960458755493, "rewards/margins": 2.3187432289123535, "rewards/rejected": -2.8624393939971924, "step": 1110 }, { "epoch": 1.27, "learning_rate": 0.0005870743034055727, "logits/chosen": -4.191903114318848, "logits/rejected": -4.0418806076049805, "logps/chosen": -467.5367126464844, "logps/rejected": -383.3409423828125, "loss": 0.2436, "rewards/accuracies": 0.910937488079071, "rewards/chosen": -0.5155719518661499, "rewards/margins": 2.421506881713867, "rewards/rejected": -2.9370789527893066, "step": 1120 }, { "epoch": 1.28, "learning_rate": 0.0005832043343653251, "logits/chosen": -4.157951354980469, "logits/rejected": -4.056535243988037, "logps/chosen": -431.4415588378906, "logps/rejected": -354.0640869140625, "loss": 0.2653, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.5177775621414185, "rewards/margins": 2.330237627029419, "rewards/rejected": -2.848015308380127, "step": 1130 }, { "epoch": 1.3, "learning_rate": 0.0005793343653250774, "logits/chosen": -4.203644752502441, "logits/rejected": -4.054340362548828, "logps/chosen": -461.27374267578125, "logps/rejected": -373.6490783691406, "loss": 0.2524, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -0.4027002453804016, "rewards/margins": 2.4095702171325684, "rewards/rejected": -2.8122706413269043, "step": 1140 }, { "epoch": 1.31, "learning_rate": 0.0005754643962848297, "logits/chosen": -4.178219318389893, "logits/rejected": -4.0377912521362305, "logps/chosen": -437.31182861328125, "logps/rejected": -363.735595703125, "loss": 0.2743, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.518113374710083, "rewards/margins": 2.4148178100585938, "rewards/rejected": -2.932931423187256, "step": 1150 }, { "epoch": 1.32, "learning_rate": 0.0005715944272445821, "logits/chosen": -4.177359580993652, "logits/rejected": -4.056346893310547, "logps/chosen": -449.071044921875, "logps/rejected": -380.4844665527344, "loss": 0.254, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3083496391773224, "rewards/margins": 2.4955554008483887, "rewards/rejected": -2.8039047718048096, "step": 1160 }, { "epoch": 1.33, "learning_rate": 0.0005677244582043344, "logits/chosen": -4.191530227661133, "logits/rejected": -4.008411407470703, "logps/chosen": -442.3807678222656, "logps/rejected": -363.08856201171875, "loss": 0.2517, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3207928240299225, "rewards/margins": 2.438176393508911, "rewards/rejected": -2.7589688301086426, "step": 1170 }, { "epoch": 1.34, "learning_rate": 0.0005638544891640867, "logits/chosen": -4.193958759307861, "logits/rejected": -4.037377834320068, "logps/chosen": -454.11712646484375, "logps/rejected": -388.7286376953125, "loss": 0.2363, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.7321030497550964, "rewards/margins": 2.593994617462158, "rewards/rejected": -3.3260979652404785, "step": 1180 }, { "epoch": 1.35, "learning_rate": 0.0005599845201238391, "logits/chosen": -4.171980857849121, "logits/rejected": -3.986541271209717, "logps/chosen": -452.4977111816406, "logps/rejected": -365.94158935546875, "loss": 0.2513, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": -0.7076693177223206, "rewards/margins": 2.570343494415283, "rewards/rejected": -3.278012752532959, "step": 1190 }, { "epoch": 1.36, "learning_rate": 0.0005561145510835913, "logits/chosen": -4.157950401306152, "logits/rejected": -4.015085220336914, "logps/chosen": -445.6997985839844, "logps/rejected": -373.5566711425781, "loss": 0.2679, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": -0.8039976358413696, "rewards/margins": 2.351738452911377, "rewards/rejected": -3.155735969543457, "step": 1200 }, { "epoch": 1.36, "eval_logits/chosen": -4.188776969909668, "eval_logits/rejected": -4.01153564453125, "eval_logps/chosen": -454.97137451171875, "eval_logps/rejected": -361.08111572265625, "eval_loss": 0.5883127450942993, "eval_rewards/accuracies": 0.7234392166137695, "eval_rewards/chosen": -1.1630417108535767, "eval_rewards/margins": 1.1792516708374023, "eval_rewards/rejected": -2.3422932624816895, "eval_runtime": 579.0657, "eval_samples_per_second": 3.152, "eval_steps_per_second": 1.577, "step": 1200 }, { "epoch": 1.38, "learning_rate": 0.0005522445820433437, "logits/chosen": -4.208713054656982, "logits/rejected": -4.017608165740967, "logps/chosen": -457.6587829589844, "logps/rejected": -358.62994384765625, "loss": 0.2412, "rewards/accuracies": 0.9046875238418579, "rewards/chosen": -0.667028546333313, "rewards/margins": 2.4550020694732666, "rewards/rejected": -3.122030735015869, "step": 1210 }, { "epoch": 1.39, "learning_rate": 0.000548374613003096, "logits/chosen": -4.185348987579346, "logits/rejected": -4.055328369140625, "logps/chosen": -465.628173828125, "logps/rejected": -389.4476013183594, "loss": 0.2522, "rewards/accuracies": 0.895312488079071, "rewards/chosen": -1.1417243480682373, "rewards/margins": 2.5681004524230957, "rewards/rejected": -3.709825038909912, "step": 1220 }, { "epoch": 1.4, "learning_rate": 0.0005445046439628482, "logits/chosen": -4.190216064453125, "logits/rejected": -4.0345048904418945, "logps/chosen": -451.000244140625, "logps/rejected": -385.5965576171875, "loss": 0.2735, "rewards/accuracies": 0.890625, "rewards/chosen": -1.5828927755355835, "rewards/margins": 2.6842010021209717, "rewards/rejected": -4.267094135284424, "step": 1230 }, { "epoch": 1.41, "learning_rate": 0.0005406346749226007, "logits/chosen": -4.179422855377197, "logits/rejected": -4.005802631378174, "logps/chosen": -452.93768310546875, "logps/rejected": -364.5310974121094, "loss": 0.2641, "rewards/accuracies": 0.879687488079071, "rewards/chosen": -0.9804186820983887, "rewards/margins": 2.460566997528076, "rewards/rejected": -3.440985918045044, "step": 1240 }, { "epoch": 1.42, "learning_rate": 0.0005367647058823529, "logits/chosen": -4.185786247253418, "logits/rejected": -4.037634372711182, "logps/chosen": -461.796142578125, "logps/rejected": -376.135986328125, "loss": 0.2546, "rewards/accuracies": 0.895312488079071, "rewards/chosen": -0.5006243586540222, "rewards/margins": 2.4437155723571777, "rewards/rejected": -2.944340229034424, "step": 1250 }, { "epoch": 1.43, "learning_rate": 0.0005328947368421054, "logits/chosen": -4.19167423248291, "logits/rejected": -4.02968168258667, "logps/chosen": -446.5478515625, "logps/rejected": -374.8179016113281, "loss": 0.2425, "rewards/accuracies": 0.910937488079071, "rewards/chosen": -0.3834829330444336, "rewards/margins": 2.4936459064483643, "rewards/rejected": -2.8771286010742188, "step": 1260 }, { "epoch": 1.44, "learning_rate": 0.0005290247678018576, "logits/chosen": -4.192545413970947, "logits/rejected": -4.0065693855285645, "logps/chosen": -462.65838623046875, "logps/rejected": -359.5196533203125, "loss": 0.268, "rewards/accuracies": 0.8890625238418579, "rewards/chosen": -0.37461215257644653, "rewards/margins": 2.4349141120910645, "rewards/rejected": -2.809526205062866, "step": 1270 }, { "epoch": 1.46, "learning_rate": 0.0005251547987616099, "logits/chosen": -4.1525092124938965, "logits/rejected": -3.9974308013916016, "logps/chosen": -438.52862548828125, "logps/rejected": -362.8804626464844, "loss": 0.2556, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -0.39167696237564087, "rewards/margins": 2.487126588821411, "rewards/rejected": -2.8788037300109863, "step": 1280 }, { "epoch": 1.47, "learning_rate": 0.0005212848297213623, "logits/chosen": -4.139183044433594, "logits/rejected": -4.010072708129883, "logps/chosen": -445.69354248046875, "logps/rejected": -367.18402099609375, "loss": 0.2545, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.35633689165115356, "rewards/margins": 2.4608588218688965, "rewards/rejected": -2.8171961307525635, "step": 1290 }, { "epoch": 1.48, "learning_rate": 0.0005174148606811145, "logits/chosen": -4.153168201446533, "logits/rejected": -3.9815585613250732, "logps/chosen": -472.58154296875, "logps/rejected": -389.10955810546875, "loss": 0.2511, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.8664466142654419, "rewards/margins": 2.678412437438965, "rewards/rejected": -3.5448594093322754, "step": 1300 }, { "epoch": 1.49, "learning_rate": 0.0005135448916408669, "logits/chosen": -4.1469502449035645, "logits/rejected": -4.007225036621094, "logps/chosen": -446.27728271484375, "logps/rejected": -362.40802001953125, "loss": 0.2424, "rewards/accuracies": 0.8921874761581421, "rewards/chosen": -1.010523796081543, "rewards/margins": 2.4572033882141113, "rewards/rejected": -3.4677271842956543, "step": 1310 }, { "epoch": 1.5, "learning_rate": 0.0005096749226006192, "logits/chosen": -4.160056114196777, "logits/rejected": -3.981990098953247, "logps/chosen": -462.85748291015625, "logps/rejected": -370.77593994140625, "loss": 0.2477, "rewards/accuracies": 0.9046875238418579, "rewards/chosen": -0.6758919954299927, "rewards/margins": 2.54825758934021, "rewards/rejected": -3.224149227142334, "step": 1320 }, { "epoch": 1.51, "learning_rate": 0.0005058049535603715, "logits/chosen": -4.164492607116699, "logits/rejected": -3.9902472496032715, "logps/chosen": -463.60333251953125, "logps/rejected": -361.80413818359375, "loss": 0.2353, "rewards/accuracies": 0.901562511920929, "rewards/chosen": -0.6484482288360596, "rewards/margins": 2.7371394634246826, "rewards/rejected": -3.3855881690979004, "step": 1330 }, { "epoch": 1.52, "learning_rate": 0.0005019349845201238, "logits/chosen": -4.146851539611816, "logits/rejected": -3.9600207805633545, "logps/chosen": -448.1449279785156, "logps/rejected": -367.8599548339844, "loss": 0.2543, "rewards/accuracies": 0.8921874761581421, "rewards/chosen": -0.7266567349433899, "rewards/margins": 2.672203779220581, "rewards/rejected": -3.398860454559326, "step": 1340 }, { "epoch": 1.54, "learning_rate": 0.0004980650154798762, "logits/chosen": -4.132704257965088, "logits/rejected": -3.9887709617614746, "logps/chosen": -456.68798828125, "logps/rejected": -364.83648681640625, "loss": 0.2553, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8011518716812134, "rewards/margins": 2.5769896507263184, "rewards/rejected": -3.378141403198242, "step": 1350 }, { "epoch": 1.55, "learning_rate": 0.0004941950464396285, "logits/chosen": -4.161304473876953, "logits/rejected": -3.9875359535217285, "logps/chosen": -456.0712890625, "logps/rejected": -364.6172790527344, "loss": 0.2491, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -0.760558009147644, "rewards/margins": 2.554994821548462, "rewards/rejected": -3.3155531883239746, "step": 1360 }, { "epoch": 1.56, "learning_rate": 0.0004903250773993809, "logits/chosen": -4.189715385437012, "logits/rejected": -4.038306713104248, "logps/chosen": -436.60198974609375, "logps/rejected": -381.1713562011719, "loss": 0.2694, "rewards/accuracies": 0.870312511920929, "rewards/chosen": -0.7771581411361694, "rewards/margins": 2.450378894805908, "rewards/rejected": -3.227536678314209, "step": 1370 }, { "epoch": 1.57, "learning_rate": 0.00048645510835913314, "logits/chosen": -4.188543319702148, "logits/rejected": -4.061593532562256, "logps/chosen": -435.73291015625, "logps/rejected": -370.1665954589844, "loss": 0.2487, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -0.6042419075965881, "rewards/margins": 2.4224419593811035, "rewards/rejected": -3.026684045791626, "step": 1380 }, { "epoch": 1.58, "learning_rate": 0.0004825851393188855, "logits/chosen": -4.186851978302002, "logits/rejected": -4.0221147537231445, "logps/chosen": -467.49114990234375, "logps/rejected": -378.09771728515625, "loss": 0.2342, "rewards/accuracies": 0.9078124761581421, "rewards/chosen": -0.432331383228302, "rewards/margins": 2.5699210166931152, "rewards/rejected": -3.0022523403167725, "step": 1390 }, { "epoch": 1.59, "learning_rate": 0.00047871517027863776, "logits/chosen": -4.145888328552246, "logits/rejected": -4.017740726470947, "logps/chosen": -463.96685791015625, "logps/rejected": -385.75238037109375, "loss": 0.2621, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": -0.6082580089569092, "rewards/margins": 2.689582586288452, "rewards/rejected": -3.2978408336639404, "step": 1400 }, { "epoch": 1.6, "learning_rate": 0.0004748452012383901, "logits/chosen": -4.153185844421387, "logits/rejected": -3.987687349319458, "logps/chosen": -446.22442626953125, "logps/rejected": -362.09259033203125, "loss": 0.2374, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.4931566119194031, "rewards/margins": 2.458813428878784, "rewards/rejected": -2.951970338821411, "step": 1410 }, { "epoch": 1.61, "learning_rate": 0.00047097523219814244, "logits/chosen": -4.217062950134277, "logits/rejected": -4.046052932739258, "logps/chosen": -468.19158935546875, "logps/rejected": -385.53680419921875, "loss": 0.2485, "rewards/accuracies": 0.8890625238418579, "rewards/chosen": -0.537969172000885, "rewards/margins": 2.609943389892578, "rewards/rejected": -3.1479125022888184, "step": 1420 }, { "epoch": 1.63, "learning_rate": 0.0004671052631578948, "logits/chosen": -4.201523780822754, "logits/rejected": -4.0426812171936035, "logps/chosen": -480.47174072265625, "logps/rejected": -394.29876708984375, "loss": 0.2476, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.7579666376113892, "rewards/margins": 2.5769734382629395, "rewards/rejected": -3.3349404335021973, "step": 1430 }, { "epoch": 1.64, "learning_rate": 0.0004632352941176471, "logits/chosen": -4.188308238983154, "logits/rejected": -4.030388832092285, "logps/chosen": -437.14483642578125, "logps/rejected": -347.59490966796875, "loss": 0.2698, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -0.7368892431259155, "rewards/margins": 2.367077589035034, "rewards/rejected": -3.1039669513702393, "step": 1440 }, { "epoch": 1.65, "learning_rate": 0.00045936532507739934, "logits/chosen": -4.182221412658691, "logits/rejected": -4.031324863433838, "logps/chosen": -471.52484130859375, "logps/rejected": -376.8938903808594, "loss": 0.2493, "rewards/accuracies": 0.890625, "rewards/chosen": -0.5700691342353821, "rewards/margins": 2.515872001647949, "rewards/rejected": -3.0859413146972656, "step": 1450 }, { "epoch": 1.66, "learning_rate": 0.0004554953560371517, "logits/chosen": -4.183257102966309, "logits/rejected": -4.024346828460693, "logps/chosen": -455.00653076171875, "logps/rejected": -372.17938232421875, "loss": 0.2454, "rewards/accuracies": 0.895312488079071, "rewards/chosen": -0.6720191836357117, "rewards/margins": 2.4886131286621094, "rewards/rejected": -3.160632371902466, "step": 1460 }, { "epoch": 1.67, "learning_rate": 0.000451625386996904, "logits/chosen": -4.1930108070373535, "logits/rejected": -4.020697593688965, "logps/chosen": -444.4767150878906, "logps/rejected": -356.45159912109375, "loss": 0.2509, "rewards/accuracies": 0.9078124761581421, "rewards/chosen": -0.7854114770889282, "rewards/margins": 2.609508752822876, "rewards/rejected": -3.3949198722839355, "step": 1470 }, { "epoch": 1.68, "learning_rate": 0.00044775541795665636, "logits/chosen": -4.178145885467529, "logits/rejected": -4.020665168762207, "logps/chosen": -446.0423889160156, "logps/rejected": -379.61077880859375, "loss": 0.2293, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.5877661108970642, "rewards/margins": 2.6652259826660156, "rewards/rejected": -3.2529921531677246, "step": 1480 }, { "epoch": 1.69, "learning_rate": 0.0004438854489164087, "logits/chosen": -4.205671787261963, "logits/rejected": -4.0060319900512695, "logps/chosen": -444.4815368652344, "logps/rejected": -335.12615966796875, "loss": 0.2411, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -0.4131178855895996, "rewards/margins": 2.5823490619659424, "rewards/rejected": -2.995466709136963, "step": 1490 }, { "epoch": 1.71, "learning_rate": 0.000440015479876161, "logits/chosen": -4.159884929656982, "logits/rejected": -4.015371799468994, "logps/chosen": -468.2154846191406, "logps/rejected": -373.19708251953125, "loss": 0.231, "rewards/accuracies": 0.895312488079071, "rewards/chosen": -0.5490593910217285, "rewards/margins": 2.724466323852539, "rewards/rejected": -3.2735259532928467, "step": 1500 }, { "epoch": 1.71, "eval_logits/chosen": -4.169567108154297, "eval_logits/rejected": -4.006948947906494, "eval_logps/chosen": -456.6194152832031, "eval_logps/rejected": -365.6242370605469, "eval_loss": 0.5894538164138794, "eval_rewards/accuracies": 0.7338444590568542, "eval_rewards/chosen": -1.3278446197509766, "eval_rewards/margins": 1.4687620401382446, "eval_rewards/rejected": -2.7966067790985107, "eval_runtime": 578.8659, "eval_samples_per_second": 3.153, "eval_steps_per_second": 1.577, "step": 1500 }, { "epoch": 1.72, "learning_rate": 0.0004361455108359133, "logits/chosen": -4.16018009185791, "logits/rejected": -3.9866485595703125, "logps/chosen": -456.95550537109375, "logps/rejected": -361.4525146484375, "loss": 0.2408, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.6913250684738159, "rewards/margins": 2.644709825515747, "rewards/rejected": -3.3360352516174316, "step": 1510 }, { "epoch": 1.73, "learning_rate": 0.00043227554179566565, "logits/chosen": -4.145260810852051, "logits/rejected": -3.971754789352417, "logps/chosen": -469.93707275390625, "logps/rejected": -373.5439453125, "loss": 0.2484, "rewards/accuracies": 0.9046875238418579, "rewards/chosen": -0.6867610216140747, "rewards/margins": 2.736417293548584, "rewards/rejected": -3.423178195953369, "step": 1520 }, { "epoch": 1.74, "learning_rate": 0.000428405572755418, "logits/chosen": -4.15813684463501, "logits/rejected": -3.9965271949768066, "logps/chosen": -440.0677795410156, "logps/rejected": -370.4933776855469, "loss": 0.2486, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -0.8743346333503723, "rewards/margins": 2.8026862144470215, "rewards/rejected": -3.677021026611328, "step": 1530 }, { "epoch": 1.75, "learning_rate": 0.00042453560371517033, "logits/chosen": -4.188101768493652, "logits/rejected": -4.0458984375, "logps/chosen": -447.4637145996094, "logps/rejected": -370.666748046875, "loss": 0.2623, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.0604463815689087, "rewards/margins": 2.5134646892547607, "rewards/rejected": -3.57391095161438, "step": 1540 }, { "epoch": 1.76, "learning_rate": 0.0004206656346749226, "logits/chosen": -4.149045944213867, "logits/rejected": -3.99678373336792, "logps/chosen": -442.060791015625, "logps/rejected": -365.4596252441406, "loss": 0.2237, "rewards/accuracies": 0.90625, "rewards/chosen": -0.9488750696182251, "rewards/margins": 2.726341724395752, "rewards/rejected": -3.6752171516418457, "step": 1550 }, { "epoch": 1.77, "learning_rate": 0.0004167956656346749, "logits/chosen": -4.123335838317871, "logits/rejected": -3.9833626747131348, "logps/chosen": -461.0818786621094, "logps/rejected": -375.79510498046875, "loss": 0.2383, "rewards/accuracies": 0.9078124761581421, "rewards/chosen": -0.6976916193962097, "rewards/margins": 2.7221691608428955, "rewards/rejected": -3.419860363006592, "step": 1560 }, { "epoch": 1.79, "learning_rate": 0.00041292569659442724, "logits/chosen": -4.150379180908203, "logits/rejected": -4.000787258148193, "logps/chosen": -474.2696228027344, "logps/rejected": -383.20379638671875, "loss": 0.2629, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.4606598913669586, "rewards/margins": 2.6545047760009766, "rewards/rejected": -3.1151645183563232, "step": 1570 }, { "epoch": 1.8, "learning_rate": 0.0004090557275541796, "logits/chosen": -4.132960319519043, "logits/rejected": -4.0079851150512695, "logps/chosen": -445.99212646484375, "logps/rejected": -371.2735290527344, "loss": 0.2642, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.66888028383255, "rewards/margins": 2.4671547412872314, "rewards/rejected": -3.1360349655151367, "step": 1580 }, { "epoch": 1.81, "learning_rate": 0.0004051857585139319, "logits/chosen": -4.113916873931885, "logits/rejected": -3.9890856742858887, "logps/chosen": -450.6957092285156, "logps/rejected": -372.7410583496094, "loss": 0.2383, "rewards/accuracies": 0.9078124761581421, "rewards/chosen": -0.7629120945930481, "rewards/margins": 2.535019636154175, "rewards/rejected": -3.2979321479797363, "step": 1590 }, { "epoch": 1.82, "learning_rate": 0.00040131578947368425, "logits/chosen": -4.137990474700928, "logits/rejected": -3.9757423400878906, "logps/chosen": -466.10736083984375, "logps/rejected": -365.39996337890625, "loss": 0.2292, "rewards/accuracies": 0.9203125238418579, "rewards/chosen": -0.7588563561439514, "rewards/margins": 2.4903316497802734, "rewards/rejected": -3.24918794631958, "step": 1600 }, { "epoch": 1.83, "learning_rate": 0.00039744582043343653, "logits/chosen": -4.133988857269287, "logits/rejected": -3.958237886428833, "logps/chosen": -462.7491760253906, "logps/rejected": -367.7718200683594, "loss": 0.2483, "rewards/accuracies": 0.885937511920929, "rewards/chosen": -0.8764969706535339, "rewards/margins": 2.650912284851074, "rewards/rejected": -3.527409315109253, "step": 1610 }, { "epoch": 1.84, "learning_rate": 0.00039357585139318887, "logits/chosen": -4.102563381195068, "logits/rejected": -3.944859027862549, "logps/chosen": -484.01287841796875, "logps/rejected": -383.304931640625, "loss": 0.2296, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -0.8068784475326538, "rewards/margins": 2.7377572059631348, "rewards/rejected": -3.54463529586792, "step": 1620 }, { "epoch": 1.85, "learning_rate": 0.0003897058823529412, "logits/chosen": -4.097002983093262, "logits/rejected": -3.9392192363739014, "logps/chosen": -442.81353759765625, "logps/rejected": -360.24359130859375, "loss": 0.233, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -0.7854810953140259, "rewards/margins": 2.6622626781463623, "rewards/rejected": -3.4477438926696777, "step": 1630 }, { "epoch": 1.86, "learning_rate": 0.0003858359133126935, "logits/chosen": -4.114195346832275, "logits/rejected": -3.977809429168701, "logps/chosen": -460.42169189453125, "logps/rejected": -376.6380920410156, "loss": 0.2174, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.6236112713813782, "rewards/margins": 2.8172924518585205, "rewards/rejected": -3.440903902053833, "step": 1640 }, { "epoch": 1.88, "learning_rate": 0.00038196594427244583, "logits/chosen": -4.120942115783691, "logits/rejected": -3.9810118675231934, "logps/chosen": -457.93939208984375, "logps/rejected": -380.33819580078125, "loss": 0.2524, "rewards/accuracies": 0.8765624761581421, "rewards/chosen": -0.9801818132400513, "rewards/margins": 2.646254062652588, "rewards/rejected": -3.6264357566833496, "step": 1650 }, { "epoch": 1.89, "learning_rate": 0.0003780959752321981, "logits/chosen": -4.111827850341797, "logits/rejected": -3.9897701740264893, "logps/chosen": -476.6372985839844, "logps/rejected": -392.09515380859375, "loss": 0.2448, "rewards/accuracies": 0.9046875238418579, "rewards/chosen": -1.058961033821106, "rewards/margins": 2.797791004180908, "rewards/rejected": -3.8567519187927246, "step": 1660 }, { "epoch": 1.9, "learning_rate": 0.00037422600619195045, "logits/chosen": -4.180169105529785, "logits/rejected": -3.9965901374816895, "logps/chosen": -478.71710205078125, "logps/rejected": -381.34490966796875, "loss": 0.2304, "rewards/accuracies": 0.9140625, "rewards/chosen": -1.0559020042419434, "rewards/margins": 2.6772279739379883, "rewards/rejected": -3.7331302165985107, "step": 1670 }, { "epoch": 1.91, "learning_rate": 0.0003703560371517028, "logits/chosen": -4.142928600311279, "logits/rejected": -3.9916534423828125, "logps/chosen": -451.61859130859375, "logps/rejected": -380.14501953125, "loss": 0.2389, "rewards/accuracies": 0.901562511920929, "rewards/chosen": -1.3838818073272705, "rewards/margins": 2.7773356437683105, "rewards/rejected": -4.16121768951416, "step": 1680 }, { "epoch": 1.92, "learning_rate": 0.00036648606811145513, "logits/chosen": -4.148522853851318, "logits/rejected": -3.969480037689209, "logps/chosen": -487.909912109375, "logps/rejected": -373.3220520019531, "loss": 0.2592, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.247733235359192, "rewards/margins": 2.775735378265381, "rewards/rejected": -4.023468971252441, "step": 1690 }, { "epoch": 1.93, "learning_rate": 0.00036261609907120747, "logits/chosen": -4.14020299911499, "logits/rejected": -4.01711368560791, "logps/chosen": -440.34588623046875, "logps/rejected": -378.77960205078125, "loss": 0.2712, "rewards/accuracies": 0.8921874761581421, "rewards/chosen": -0.9871402978897095, "rewards/margins": 2.6940577030181885, "rewards/rejected": -3.6811981201171875, "step": 1700 }, { "epoch": 1.94, "learning_rate": 0.00035874613003095975, "logits/chosen": -4.149487495422363, "logits/rejected": -3.977717638015747, "logps/chosen": -448.97955322265625, "logps/rejected": -367.9267883300781, "loss": 0.2311, "rewards/accuracies": 0.9156249761581421, "rewards/chosen": -1.0905286073684692, "rewards/margins": 2.6308722496032715, "rewards/rejected": -3.721400737762451, "step": 1710 }, { "epoch": 1.96, "learning_rate": 0.0003548761609907121, "logits/chosen": -4.174777507781982, "logits/rejected": -4.006318092346191, "logps/chosen": -461.8540954589844, "logps/rejected": -360.53997802734375, "loss": 0.2242, "rewards/accuracies": 0.9140625, "rewards/chosen": -1.2176761627197266, "rewards/margins": 2.6848456859588623, "rewards/rejected": -3.902522325515747, "step": 1720 }, { "epoch": 1.97, "learning_rate": 0.00035100619195046443, "logits/chosen": -4.174556255340576, "logits/rejected": -4.025227069854736, "logps/chosen": -454.0242614746094, "logps/rejected": -364.8119201660156, "loss": 0.2751, "rewards/accuracies": 0.8890625238418579, "rewards/chosen": -1.253633737564087, "rewards/margins": 2.593334436416626, "rewards/rejected": -3.846968173980713, "step": 1730 }, { "epoch": 1.98, "learning_rate": 0.0003471362229102167, "logits/chosen": -4.1873884201049805, "logits/rejected": -4.051218509674072, "logps/chosen": -452.62615966796875, "logps/rejected": -373.3890686035156, "loss": 0.2436, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -1.1961935758590698, "rewards/margins": 2.6770997047424316, "rewards/rejected": -3.87329363822937, "step": 1740 }, { "epoch": 1.99, "learning_rate": 0.00034326625386996905, "logits/chosen": -4.1867146492004395, "logits/rejected": -4.020144462585449, "logps/chosen": -480.8916931152344, "logps/rejected": -385.32623291015625, "loss": 0.2105, "rewards/accuracies": 0.917187511920929, "rewards/chosen": -1.318253755569458, "rewards/margins": 2.838366985321045, "rewards/rejected": -4.156620979309082, "step": 1750 }, { "epoch": 2.0, "learning_rate": 0.00033939628482972133, "logits/chosen": -4.185609817504883, "logits/rejected": -4.006119728088379, "logps/chosen": -485.7627868652344, "logps/rejected": -380.66497802734375, "loss": 0.2329, "rewards/accuracies": 0.9046875238418579, "rewards/chosen": -1.6218128204345703, "rewards/margins": 2.8267197608947754, "rewards/rejected": -4.448532581329346, "step": 1760 }, { "epoch": 2.01, "learning_rate": 0.00033552631578947367, "logits/chosen": -4.1521453857421875, "logits/rejected": -4.068066596984863, "logps/chosen": -445.12445068359375, "logps/rejected": -395.56488037109375, "loss": 0.1055, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.0635168552398682, "rewards/margins": 3.527172088623047, "rewards/rejected": -4.590689659118652, "step": 1770 }, { "epoch": 2.02, "learning_rate": 0.000331656346749226, "logits/chosen": -4.171954154968262, "logits/rejected": -4.00327205657959, "logps/chosen": -453.67108154296875, "logps/rejected": -369.5701904296875, "loss": 0.0881, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.0114647150039673, "rewards/margins": 3.924828290939331, "rewards/rejected": -4.936293601989746, "step": 1780 }, { "epoch": 2.04, "learning_rate": 0.00032778637770897835, "logits/chosen": -4.139638423919678, "logits/rejected": -3.982611894607544, "logps/chosen": -461.5379333496094, "logps/rejected": -398.61822509765625, "loss": 0.0989, "rewards/accuracies": 0.96875, "rewards/chosen": -1.2705217599868774, "rewards/margins": 3.991398334503174, "rewards/rejected": -5.261919975280762, "step": 1790 }, { "epoch": 2.05, "learning_rate": 0.0003239164086687307, "logits/chosen": -4.114060401916504, "logits/rejected": -4.0027360916137695, "logps/chosen": -465.98736572265625, "logps/rejected": -399.2384338378906, "loss": 0.0862, "rewards/accuracies": 0.9703124761581421, "rewards/chosen": -1.598183274269104, "rewards/margins": 4.0436811447143555, "rewards/rejected": -5.641863822937012, "step": 1800 }, { "epoch": 2.05, "eval_logits/chosen": -4.099196434020996, "eval_logits/rejected": -3.962447166442871, "eval_logps/chosen": -471.10467529296875, "eval_logps/rejected": -384.3660888671875, "eval_loss": 0.6626344323158264, "eval_rewards/accuracies": 0.7283680438995361, "eval_rewards/chosen": -2.776369571685791, "eval_rewards/margins": 1.8944175243377686, "eval_rewards/rejected": -4.670787334442139, "eval_runtime": 580.0301, "eval_samples_per_second": 3.146, "eval_steps_per_second": 1.574, "step": 1800 }, { "epoch": 2.06, "learning_rate": 0.00032004643962848297, "logits/chosen": -4.100186347961426, "logits/rejected": -3.9521865844726562, "logps/chosen": -454.51068115234375, "logps/rejected": -375.8501281738281, "loss": 0.089, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.7558130025863647, "rewards/margins": 4.220589637756348, "rewards/rejected": -5.9764018058776855, "step": 1810 }, { "epoch": 2.07, "learning_rate": 0.0003161764705882353, "logits/chosen": -4.091579914093018, "logits/rejected": -3.9678893089294434, "logps/chosen": -480.7119140625, "logps/rejected": -406.24951171875, "loss": 0.0802, "rewards/accuracies": 0.9859374761581421, "rewards/chosen": -1.6219885349273682, "rewards/margins": 4.287364482879639, "rewards/rejected": -5.909352779388428, "step": 1820 }, { "epoch": 2.08, "learning_rate": 0.00031230650154798765, "logits/chosen": -4.060251235961914, "logits/rejected": -3.9350249767303467, "logps/chosen": -455.95330810546875, "logps/rejected": -401.65191650390625, "loss": 0.0874, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -1.8058483600616455, "rewards/margins": 4.559729099273682, "rewards/rejected": -6.365577220916748, "step": 1830 }, { "epoch": 2.09, "learning_rate": 0.00030843653250773993, "logits/chosen": -4.081608772277832, "logits/rejected": -3.9742839336395264, "logps/chosen": -487.35919189453125, "logps/rejected": -417.44573974609375, "loss": 0.0909, "rewards/accuracies": 0.973437488079071, "rewards/chosen": -1.7625404596328735, "rewards/margins": 4.313028335571289, "rewards/rejected": -6.075569152832031, "step": 1840 }, { "epoch": 2.1, "learning_rate": 0.00030456656346749227, "logits/chosen": -4.061827182769775, "logits/rejected": -3.9257330894470215, "logps/chosen": -484.5743103027344, "logps/rejected": -406.17889404296875, "loss": 0.0847, "rewards/accuracies": 0.9828125238418579, "rewards/chosen": -1.6442257165908813, "rewards/margins": 4.384451866149902, "rewards/rejected": -6.028677940368652, "step": 1850 }, { "epoch": 2.11, "learning_rate": 0.00030069659442724455, "logits/chosen": -4.071502208709717, "logits/rejected": -3.9825053215026855, "logps/chosen": -460.91876220703125, "logps/rejected": -427.92376708984375, "loss": 0.0944, "rewards/accuracies": 0.96875, "rewards/chosen": -1.9216229915618896, "rewards/margins": 4.300149440765381, "rewards/rejected": -6.221772193908691, "step": 1860 }, { "epoch": 2.13, "learning_rate": 0.0002968266253869969, "logits/chosen": -4.1019134521484375, "logits/rejected": -3.9470276832580566, "logps/chosen": -488.52978515625, "logps/rejected": -407.39434814453125, "loss": 0.0869, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.9397556781768799, "rewards/margins": 4.343387126922607, "rewards/rejected": -6.283143043518066, "step": 1870 }, { "epoch": 2.14, "learning_rate": 0.0002929566563467492, "logits/chosen": -4.099150657653809, "logits/rejected": -3.9682860374450684, "logps/chosen": -458.50335693359375, "logps/rejected": -384.6944274902344, "loss": 0.0853, "rewards/accuracies": 0.9703124761581421, "rewards/chosen": -2.0801169872283936, "rewards/margins": 4.236763954162598, "rewards/rejected": -6.316880702972412, "step": 1880 }, { "epoch": 2.15, "learning_rate": 0.00028908668730650156, "logits/chosen": -4.078397274017334, "logits/rejected": -3.9928581714630127, "logps/chosen": -453.841552734375, "logps/rejected": -405.6170654296875, "loss": 0.0819, "rewards/accuracies": 0.9828125238418579, "rewards/chosen": -1.8457765579223633, "rewards/margins": 4.315034866333008, "rewards/rejected": -6.160810947418213, "step": 1890 }, { "epoch": 2.16, "learning_rate": 0.0002852167182662539, "logits/chosen": -4.0579833984375, "logits/rejected": -3.919133424758911, "logps/chosen": -463.38983154296875, "logps/rejected": -393.13214111328125, "loss": 0.0822, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.86667799949646, "rewards/margins": 4.488894462585449, "rewards/rejected": -6.355571746826172, "step": 1900 }, { "epoch": 2.17, "learning_rate": 0.0002813467492260062, "logits/chosen": -4.0522894859313965, "logits/rejected": -3.9481842517852783, "logps/chosen": -456.20831298828125, "logps/rejected": -397.7898254394531, "loss": 0.0871, "rewards/accuracies": 0.979687511920929, "rewards/chosen": -1.8982452154159546, "rewards/margins": 4.308195114135742, "rewards/rejected": -6.206439971923828, "step": 1910 }, { "epoch": 2.18, "learning_rate": 0.0002774767801857585, "logits/chosen": -4.094363689422607, "logits/rejected": -3.9500765800476074, "logps/chosen": -477.19256591796875, "logps/rejected": -403.66619873046875, "loss": 0.0817, "rewards/accuracies": 0.9671875238418579, "rewards/chosen": -1.9032741785049438, "rewards/margins": 4.545943737030029, "rewards/rejected": -6.449217796325684, "step": 1920 }, { "epoch": 2.19, "learning_rate": 0.00027360681114551086, "logits/chosen": -4.063013553619385, "logits/rejected": -3.9196841716766357, "logps/chosen": -486.87493896484375, "logps/rejected": -404.94140625, "loss": 0.0773, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -1.8990163803100586, "rewards/margins": 4.428265571594238, "rewards/rejected": -6.327281951904297, "step": 1930 }, { "epoch": 2.21, "learning_rate": 0.00026973684210526315, "logits/chosen": -4.068678855895996, "logits/rejected": -3.9359519481658936, "logps/chosen": -467.9432678222656, "logps/rejected": -387.5244445800781, "loss": 0.0876, "rewards/accuracies": 0.9703124761581421, "rewards/chosen": -1.8748531341552734, "rewards/margins": 4.535095691680908, "rewards/rejected": -6.409948825836182, "step": 1940 }, { "epoch": 2.22, "learning_rate": 0.0002658668730650155, "logits/chosen": -4.046270847320557, "logits/rejected": -3.9353084564208984, "logps/chosen": -475.07928466796875, "logps/rejected": -411.2774353027344, "loss": 0.084, "rewards/accuracies": 0.9828125238418579, "rewards/chosen": -1.9962526559829712, "rewards/margins": 4.404291152954102, "rewards/rejected": -6.400543212890625, "step": 1950 }, { "epoch": 2.23, "learning_rate": 0.0002619969040247678, "logits/chosen": -4.042412757873535, "logits/rejected": -3.943819046020508, "logps/chosen": -460.7626953125, "logps/rejected": -418.08013916015625, "loss": 0.0774, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.0331170558929443, "rewards/margins": 4.446463108062744, "rewards/rejected": -6.479579925537109, "step": 1960 }, { "epoch": 2.24, "learning_rate": 0.0002581269349845201, "logits/chosen": -4.053791046142578, "logits/rejected": -3.9368233680725098, "logps/chosen": -477.2555236816406, "logps/rejected": -412.51617431640625, "loss": 0.0834, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1482975482940674, "rewards/margins": 4.569310188293457, "rewards/rejected": -6.7176079750061035, "step": 1970 }, { "epoch": 2.25, "learning_rate": 0.00025425696594427244, "logits/chosen": -4.060332298278809, "logits/rejected": -3.9399471282958984, "logps/chosen": -450.8645935058594, "logps/rejected": -383.22503662109375, "loss": 0.0912, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.0010411739349365, "rewards/margins": 4.316088676452637, "rewards/rejected": -6.317130088806152, "step": 1980 }, { "epoch": 2.26, "learning_rate": 0.0002503869969040248, "logits/chosen": -4.037812232971191, "logits/rejected": -3.9101269245147705, "logps/chosen": -483.0682067871094, "logps/rejected": -417.84381103515625, "loss": 0.0726, "rewards/accuracies": 0.979687511920929, "rewards/chosen": -1.6239198446273804, "rewards/margins": 4.670831203460693, "rewards/rejected": -6.2947516441345215, "step": 1990 }, { "epoch": 2.27, "learning_rate": 0.00024651702786377707, "logits/chosen": -4.061712741851807, "logits/rejected": -3.9138970375061035, "logps/chosen": -470.1129455566406, "logps/rejected": -381.0320129394531, "loss": 0.083, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.864711046218872, "rewards/margins": 4.578787326812744, "rewards/rejected": -6.443498134613037, "step": 2000 }, { "epoch": 2.29, "learning_rate": 0.0002426470588235294, "logits/chosen": -4.040421485900879, "logits/rejected": -3.8937900066375732, "logps/chosen": -465.3186950683594, "logps/rejected": -384.11431884765625, "loss": 0.0816, "rewards/accuracies": 0.96875, "rewards/chosen": -2.0299508571624756, "rewards/margins": 4.572933197021484, "rewards/rejected": -6.602883815765381, "step": 2010 }, { "epoch": 2.3, "learning_rate": 0.00023877708978328174, "logits/chosen": -4.028491020202637, "logits/rejected": -3.9004313945770264, "logps/chosen": -471.8551330566406, "logps/rejected": -393.68182373046875, "loss": 0.0796, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.153228282928467, "rewards/margins": 4.39694881439209, "rewards/rejected": -6.550177574157715, "step": 2020 }, { "epoch": 2.31, "learning_rate": 0.00023490712074303405, "logits/chosen": -4.057473659515381, "logits/rejected": -3.939643144607544, "logps/chosen": -480.433837890625, "logps/rejected": -440.46795654296875, "loss": 0.0835, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.0369668006896973, "rewards/margins": 4.580521106719971, "rewards/rejected": -6.617487907409668, "step": 2030 }, { "epoch": 2.32, "learning_rate": 0.0002310371517027864, "logits/chosen": -4.039044380187988, "logits/rejected": -3.885129451751709, "logps/chosen": -457.95526123046875, "logps/rejected": -398.6941223144531, "loss": 0.0953, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.229137897491455, "rewards/margins": 4.554163932800293, "rewards/rejected": -6.78330135345459, "step": 2040 }, { "epoch": 2.33, "learning_rate": 0.0002271671826625387, "logits/chosen": -4.042059898376465, "logits/rejected": -3.88604474067688, "logps/chosen": -502.5570373535156, "logps/rejected": -418.5478515625, "loss": 0.0889, "rewards/accuracies": 0.9671875238418579, "rewards/chosen": -1.9631668329238892, "rewards/margins": 4.674599647521973, "rewards/rejected": -6.637766361236572, "step": 2050 }, { "epoch": 2.34, "learning_rate": 0.000223297213622291, "logits/chosen": -4.0554070472717285, "logits/rejected": -3.9353203773498535, "logps/chosen": -477.87164306640625, "logps/rejected": -408.6524963378906, "loss": 0.0836, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.5695935487747192, "rewards/margins": 4.404808521270752, "rewards/rejected": -5.974401950836182, "step": 2060 }, { "epoch": 2.35, "learning_rate": 0.00021942724458204335, "logits/chosen": -4.036772727966309, "logits/rejected": -3.897782564163208, "logps/chosen": -455.0943298339844, "logps/rejected": -396.4984436035156, "loss": 0.0955, "rewards/accuracies": 0.9671875238418579, "rewards/chosen": -1.6834611892700195, "rewards/margins": 4.394008636474609, "rewards/rejected": -6.077469825744629, "step": 2070 }, { "epoch": 2.37, "learning_rate": 0.00021555727554179566, "logits/chosen": -4.063164710998535, "logits/rejected": -3.9161248207092285, "logps/chosen": -457.53643798828125, "logps/rejected": -383.6148681640625, "loss": 0.0738, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.6004369258880615, "rewards/margins": 4.524335861206055, "rewards/rejected": -6.124773025512695, "step": 2080 }, { "epoch": 2.38, "learning_rate": 0.000211687306501548, "logits/chosen": -4.042224884033203, "logits/rejected": -3.941323757171631, "logps/chosen": -457.99664306640625, "logps/rejected": -421.6875, "loss": 0.0722, "rewards/accuracies": 0.9828125238418579, "rewards/chosen": -1.9635288715362549, "rewards/margins": 4.540566444396973, "rewards/rejected": -6.504095554351807, "step": 2090 }, { "epoch": 2.39, "learning_rate": 0.00020781733746130034, "logits/chosen": -4.055906772613525, "logits/rejected": -3.9191513061523438, "logps/chosen": -473.22308349609375, "logps/rejected": -405.5827941894531, "loss": 0.0804, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.9071518182754517, "rewards/margins": 4.600703239440918, "rewards/rejected": -6.507855415344238, "step": 2100 }, { "epoch": 2.39, "eval_logits/chosen": -4.046696186065674, "eval_logits/rejected": -3.9127559661865234, "eval_logps/chosen": -473.6706237792969, "eval_logps/rejected": -388.81396484375, "eval_loss": 0.6818161010742188, "eval_rewards/accuracies": 0.740963876247406, "eval_rewards/chosen": -3.0329668521881104, "eval_rewards/margins": 2.0826125144958496, "eval_rewards/rejected": -5.115579128265381, "eval_runtime": 584.204, "eval_samples_per_second": 3.124, "eval_steps_per_second": 1.563, "step": 2100 }, { "epoch": 2.4, "learning_rate": 0.00020394736842105262, "logits/chosen": -4.039233207702637, "logits/rejected": -3.9270339012145996, "logps/chosen": -487.49652099609375, "logps/rejected": -416.65673828125, "loss": 0.0716, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.7781940698623657, "rewards/margins": 4.658770561218262, "rewards/rejected": -6.436964511871338, "step": 2110 }, { "epoch": 2.41, "learning_rate": 0.00020007739938080496, "logits/chosen": -4.034115314483643, "logits/rejected": -3.8932366371154785, "logps/chosen": -483.29248046875, "logps/rejected": -399.23358154296875, "loss": 0.0821, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -1.8791862726211548, "rewards/margins": 4.514316558837891, "rewards/rejected": -6.393502712249756, "step": 2120 }, { "epoch": 2.42, "learning_rate": 0.00019620743034055727, "logits/chosen": -4.022473335266113, "logits/rejected": -3.8848724365234375, "logps/chosen": -494.7567443847656, "logps/rejected": -414.5052795410156, "loss": 0.0758, "rewards/accuracies": 0.973437488079071, "rewards/chosen": -2.037440776824951, "rewards/margins": 4.625775337219238, "rewards/rejected": -6.663216590881348, "step": 2130 }, { "epoch": 2.43, "learning_rate": 0.0001923374613003096, "logits/chosen": -4.062596321105957, "logits/rejected": -3.9234776496887207, "logps/chosen": -469.2730407714844, "logps/rejected": -388.9490966796875, "loss": 0.1002, "rewards/accuracies": 0.964062511920929, "rewards/chosen": -2.1133065223693848, "rewards/margins": 4.352668285369873, "rewards/rejected": -6.465975284576416, "step": 2140 }, { "epoch": 2.44, "learning_rate": 0.00018846749226006195, "logits/chosen": -4.073647499084473, "logits/rejected": -3.91247296333313, "logps/chosen": -493.9153747558594, "logps/rejected": -392.5951232910156, "loss": 0.0803, "rewards/accuracies": 0.973437488079071, "rewards/chosen": -1.8712408542633057, "rewards/margins": 4.583142280578613, "rewards/rejected": -6.454382419586182, "step": 2150 }, { "epoch": 2.46, "learning_rate": 0.00018459752321981423, "logits/chosen": -4.04437255859375, "logits/rejected": -3.9470572471618652, "logps/chosen": -474.0580139160156, "logps/rejected": -421.47991943359375, "loss": 0.0761, "rewards/accuracies": 0.979687511920929, "rewards/chosen": -1.7748620510101318, "rewards/margins": 4.49957799911499, "rewards/rejected": -6.274440288543701, "step": 2160 }, { "epoch": 2.47, "learning_rate": 0.00018072755417956657, "logits/chosen": -4.056158542633057, "logits/rejected": -3.9249160289764404, "logps/chosen": -464.0899353027344, "logps/rejected": -398.947021484375, "loss": 0.0791, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.0454869270324707, "rewards/margins": 4.299257278442383, "rewards/rejected": -6.3447442054748535, "step": 2170 }, { "epoch": 2.48, "learning_rate": 0.00017685758513931888, "logits/chosen": -4.069303512573242, "logits/rejected": -3.9223875999450684, "logps/chosen": -460.90789794921875, "logps/rejected": -402.7492370605469, "loss": 0.0788, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.1296486854553223, "rewards/margins": 4.53462553024292, "rewards/rejected": -6.6642746925354, "step": 2180 }, { "epoch": 2.49, "learning_rate": 0.00017298761609907122, "logits/chosen": -4.052066326141357, "logits/rejected": -3.9370834827423096, "logps/chosen": -479.750732421875, "logps/rejected": -405.9092712402344, "loss": 0.0854, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.204987049102783, "rewards/margins": 4.4560956954956055, "rewards/rejected": -6.661083221435547, "step": 2190 }, { "epoch": 2.5, "learning_rate": 0.00016911764705882356, "logits/chosen": -4.062440872192383, "logits/rejected": -3.9102981090545654, "logps/chosen": -501.49896240234375, "logps/rejected": -419.1741638183594, "loss": 0.0938, "rewards/accuracies": 0.9609375, "rewards/chosen": -2.2208545207977295, "rewards/margins": 4.451084613800049, "rewards/rejected": -6.671938896179199, "step": 2200 }, { "epoch": 2.51, "learning_rate": 0.00016524767801857584, "logits/chosen": -4.076274394989014, "logits/rejected": -3.9363319873809814, "logps/chosen": -477.8592834472656, "logps/rejected": -414.72052001953125, "loss": 0.0819, "rewards/accuracies": 0.973437488079071, "rewards/chosen": -1.929465889930725, "rewards/margins": 4.612143516540527, "rewards/rejected": -6.541609287261963, "step": 2210 }, { "epoch": 2.52, "learning_rate": 0.00016137770897832818, "logits/chosen": -4.062455177307129, "logits/rejected": -3.9112770557403564, "logps/chosen": -471.1143493652344, "logps/rejected": -394.84503173828125, "loss": 0.0883, "rewards/accuracies": 0.9671875238418579, "rewards/chosen": -2.0793368816375732, "rewards/margins": 4.595743179321289, "rewards/rejected": -6.675080299377441, "step": 2220 }, { "epoch": 2.54, "learning_rate": 0.0001575077399380805, "logits/chosen": -4.03839111328125, "logits/rejected": -3.9128997325897217, "logps/chosen": -501.94708251953125, "logps/rejected": -415.37213134765625, "loss": 0.072, "rewards/accuracies": 0.981249988079071, "rewards/chosen": -2.2111144065856934, "rewards/margins": 4.614539623260498, "rewards/rejected": -6.82565450668335, "step": 2230 }, { "epoch": 2.55, "learning_rate": 0.00015363777089783283, "logits/chosen": -4.03037166595459, "logits/rejected": -3.9269638061523438, "logps/chosen": -464.083251953125, "logps/rejected": -410.85693359375, "loss": 0.0958, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.3452372550964355, "rewards/margins": 4.584462642669678, "rewards/rejected": -6.929699897766113, "step": 2240 }, { "epoch": 2.56, "learning_rate": 0.00014976780185758516, "logits/chosen": -4.053762912750244, "logits/rejected": -3.9479641914367676, "logps/chosen": -489.65948486328125, "logps/rejected": -440.38616943359375, "loss": 0.0942, "rewards/accuracies": 0.9703124761581421, "rewards/chosen": -2.319965124130249, "rewards/margins": 4.512824058532715, "rewards/rejected": -6.832788944244385, "step": 2250 }, { "epoch": 2.57, "learning_rate": 0.00014589783281733745, "logits/chosen": -4.0716552734375, "logits/rejected": -3.9381439685821533, "logps/chosen": -446.44464111328125, "logps/rejected": -399.4381408691406, "loss": 0.085, "rewards/accuracies": 0.9765625, "rewards/chosen": -1.833547592163086, "rewards/margins": 4.550833702087402, "rewards/rejected": -6.384381294250488, "step": 2260 }, { "epoch": 2.58, "learning_rate": 0.00014202786377708979, "logits/chosen": -4.072835445404053, "logits/rejected": -3.949232578277588, "logps/chosen": -457.48602294921875, "logps/rejected": -401.8858642578125, "loss": 0.0766, "rewards/accuracies": 0.984375, "rewards/chosen": -1.8797399997711182, "rewards/margins": 4.4048871994018555, "rewards/rejected": -6.284627437591553, "step": 2270 }, { "epoch": 2.59, "learning_rate": 0.00013815789473684212, "logits/chosen": -4.039734363555908, "logits/rejected": -3.928999662399292, "logps/chosen": -465.676025390625, "logps/rejected": -405.8096618652344, "loss": 0.0829, "rewards/accuracies": 0.973437488079071, "rewards/chosen": -1.936608076095581, "rewards/margins": 4.248067855834961, "rewards/rejected": -6.184675693511963, "step": 2280 }, { "epoch": 2.6, "learning_rate": 0.00013428792569659443, "logits/chosen": -4.054182529449463, "logits/rejected": -3.9241814613342285, "logps/chosen": -471.3623962402344, "logps/rejected": -393.86199951171875, "loss": 0.0735, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.8690292835235596, "rewards/margins": 4.538149833679199, "rewards/rejected": -6.407179355621338, "step": 2290 }, { "epoch": 2.62, "learning_rate": 0.00013041795665634675, "logits/chosen": -4.064959526062012, "logits/rejected": -3.9475784301757812, "logps/chosen": -460.69757080078125, "logps/rejected": -397.31842041015625, "loss": 0.1012, "rewards/accuracies": 0.9703124761581421, "rewards/chosen": -2.1720621585845947, "rewards/margins": 4.399336814880371, "rewards/rejected": -6.571398735046387, "step": 2300 }, { "epoch": 2.63, "learning_rate": 0.00012654798761609906, "logits/chosen": -4.030743598937988, "logits/rejected": -3.930020809173584, "logps/chosen": -451.305908203125, "logps/rejected": -403.08477783203125, "loss": 0.0842, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.294938087463379, "rewards/margins": 4.323108673095703, "rewards/rejected": -6.61804723739624, "step": 2310 }, { "epoch": 2.64, "learning_rate": 0.0001226780185758514, "logits/chosen": -4.060758590698242, "logits/rejected": -3.9272751808166504, "logps/chosen": -464.0152282714844, "logps/rejected": -402.55828857421875, "loss": 0.0734, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.366081953048706, "rewards/margins": 4.5486273765563965, "rewards/rejected": -6.914709568023682, "step": 2320 }, { "epoch": 2.65, "learning_rate": 0.00011880804953560372, "logits/chosen": -4.030327320098877, "logits/rejected": -3.9041049480438232, "logps/chosen": -456.3553771972656, "logps/rejected": -392.958740234375, "loss": 0.072, "rewards/accuracies": 0.984375, "rewards/chosen": -2.4610555171966553, "rewards/margins": 4.7176103591918945, "rewards/rejected": -7.178666114807129, "step": 2330 }, { "epoch": 2.66, "learning_rate": 0.00011493808049535603, "logits/chosen": -4.007082939147949, "logits/rejected": -3.889982223510742, "logps/chosen": -448.06719970703125, "logps/rejected": -381.2936096191406, "loss": 0.0713, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.1826024055480957, "rewards/margins": 4.715658187866211, "rewards/rejected": -6.898260593414307, "step": 2340 }, { "epoch": 2.67, "learning_rate": 0.00011106811145510837, "logits/chosen": -4.0117034912109375, "logits/rejected": -3.8816237449645996, "logps/chosen": -444.3555603027344, "logps/rejected": -397.4671325683594, "loss": 0.0881, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.1509411334991455, "rewards/margins": 4.741935729980469, "rewards/rejected": -6.892876625061035, "step": 2350 }, { "epoch": 2.68, "learning_rate": 0.00010719814241486069, "logits/chosen": -4.029890060424805, "logits/rejected": -3.905717372894287, "logps/chosen": -488.9482421875, "logps/rejected": -408.9283447265625, "loss": 0.0989, "rewards/accuracies": 0.9671875238418579, "rewards/chosen": -2.3174080848693848, "rewards/margins": 4.586155891418457, "rewards/rejected": -6.903563499450684, "step": 2360 }, { "epoch": 2.69, "learning_rate": 0.000103328173374613, "logits/chosen": -4.036957740783691, "logits/rejected": -3.910012722015381, "logps/chosen": -485.1014709472656, "logps/rejected": -406.50872802734375, "loss": 0.0787, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.3204379081726074, "rewards/margins": 4.678440570831299, "rewards/rejected": -6.998878479003906, "step": 2370 }, { "epoch": 2.71, "learning_rate": 9.945820433436533e-05, "logits/chosen": -3.9959442615509033, "logits/rejected": -3.8816115856170654, "logps/chosen": -477.9117126464844, "logps/rejected": -407.93109130859375, "loss": 0.0868, "rewards/accuracies": 0.9703124761581421, "rewards/chosen": -2.440147876739502, "rewards/margins": 4.586025238037109, "rewards/rejected": -7.0261735916137695, "step": 2380 }, { "epoch": 2.72, "learning_rate": 9.558823529411764e-05, "logits/chosen": -4.014527320861816, "logits/rejected": -3.876044511795044, "logps/chosen": -487.6546936035156, "logps/rejected": -417.52215576171875, "loss": 0.0668, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.38547682762146, "rewards/margins": 4.742751121520996, "rewards/rejected": -7.128228187561035, "step": 2390 }, { "epoch": 2.73, "learning_rate": 9.171826625386998e-05, "logits/chosen": -3.9865341186523438, "logits/rejected": -3.8804149627685547, "logps/chosen": -429.01910400390625, "logps/rejected": -382.8365173339844, "loss": 0.0925, "rewards/accuracies": 0.9671875238418579, "rewards/chosen": -2.5817480087280273, "rewards/margins": 4.414107799530029, "rewards/rejected": -6.995855808258057, "step": 2400 }, { "epoch": 2.73, "eval_logits/chosen": -4.013707637786865, "eval_logits/rejected": -3.890836000442505, "eval_logps/chosen": -478.9622802734375, "eval_logps/rejected": -394.195556640625, "eval_loss": 0.6947003602981567, "eval_rewards/accuracies": 0.737130343914032, "eval_rewards/chosen": -3.5621278285980225, "eval_rewards/margins": 2.0916078090667725, "eval_rewards/rejected": -5.653735160827637, "eval_runtime": 585.0731, "eval_samples_per_second": 3.119, "eval_steps_per_second": 1.56, "step": 2400 }, { "epoch": 2.74, "learning_rate": 8.784829721362229e-05, "logits/chosen": -3.9968457221984863, "logits/rejected": -3.8738808631896973, "logps/chosen": -493.1527404785156, "logps/rejected": -417.54156494140625, "loss": 0.0773, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.2573695182800293, "rewards/margins": 4.708822250366211, "rewards/rejected": -6.966191291809082, "step": 2410 }, { "epoch": 2.75, "learning_rate": 8.397832817337461e-05, "logits/chosen": -4.038022041320801, "logits/rejected": -3.8945815563201904, "logps/chosen": -478.8358459472656, "logps/rejected": -404.5120544433594, "loss": 0.0733, "rewards/accuracies": 0.973437488079071, "rewards/chosen": -2.2673752307891846, "rewards/margins": 4.710148811340332, "rewards/rejected": -6.977524757385254, "step": 2420 }, { "epoch": 2.76, "learning_rate": 8.010835913312694e-05, "logits/chosen": -4.061777114868164, "logits/rejected": -3.922711133956909, "logps/chosen": -473.9579162597656, "logps/rejected": -427.1693420410156, "loss": 0.0777, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.121866464614868, "rewards/margins": 4.67832612991333, "rewards/rejected": -6.800192832946777, "step": 2430 }, { "epoch": 2.77, "learning_rate": 7.623839009287926e-05, "logits/chosen": -4.0184831619262695, "logits/rejected": -3.9106674194335938, "logps/chosen": -467.1485290527344, "logps/rejected": -426.37158203125, "loss": 0.0766, "rewards/accuracies": 0.984375, "rewards/chosen": -1.9723981618881226, "rewards/margins": 4.665931224822998, "rewards/rejected": -6.638328552246094, "step": 2440 }, { "epoch": 2.79, "learning_rate": 7.236842105263159e-05, "logits/chosen": -4.041088104248047, "logits/rejected": -3.9131877422332764, "logps/chosen": -471.2433166503906, "logps/rejected": -398.13787841796875, "loss": 0.0731, "rewards/accuracies": 0.9781249761581421, "rewards/chosen": -2.0632588863372803, "rewards/margins": 4.611903190612793, "rewards/rejected": -6.675162315368652, "step": 2450 }, { "epoch": 2.8, "learning_rate": 6.84984520123839e-05, "logits/chosen": -4.039373874664307, "logits/rejected": -3.9188714027404785, "logps/chosen": -473.0728454589844, "logps/rejected": -398.5749206542969, "loss": 0.0782, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.227428913116455, "rewards/margins": 4.6053948402404785, "rewards/rejected": -6.832823753356934, "step": 2460 }, { "epoch": 2.81, "learning_rate": 6.462848297213622e-05, "logits/chosen": -4.026165962219238, "logits/rejected": -3.9272258281707764, "logps/chosen": -462.39947509765625, "logps/rejected": -402.6830749511719, "loss": 0.0859, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1961612701416016, "rewards/margins": 4.54397439956665, "rewards/rejected": -6.74013614654541, "step": 2470 }, { "epoch": 2.82, "learning_rate": 6.0758513931888545e-05, "logits/chosen": -4.0160980224609375, "logits/rejected": -3.924609661102295, "logps/chosen": -452.02056884765625, "logps/rejected": -414.5785217285156, "loss": 0.0736, "rewards/accuracies": 0.979687511920929, "rewards/chosen": -2.0622196197509766, "rewards/margins": 4.619808673858643, "rewards/rejected": -6.682027339935303, "step": 2480 }, { "epoch": 2.83, "learning_rate": 5.688854489164086e-05, "logits/chosen": -4.000798225402832, "logits/rejected": -3.9073472023010254, "logps/chosen": -456.7662048339844, "logps/rejected": -402.3853759765625, "loss": 0.0835, "rewards/accuracies": 0.9671875238418579, "rewards/chosen": -2.1083760261535645, "rewards/margins": 4.517114162445068, "rewards/rejected": -6.625490665435791, "step": 2490 }, { "epoch": 2.84, "learning_rate": 5.3018575851393194e-05, "logits/chosen": -4.044299125671387, "logits/rejected": -3.9014954566955566, "logps/chosen": -462.43670654296875, "logps/rejected": -394.67108154296875, "loss": 0.0863, "rewards/accuracies": 0.971875011920929, "rewards/chosen": -2.0647380352020264, "rewards/margins": 4.580681800842285, "rewards/rejected": -6.645419120788574, "step": 2500 }, { "epoch": 2.85, "learning_rate": 4.914860681114551e-05, "logits/chosen": -4.032798767089844, "logits/rejected": -3.919098377227783, "logps/chosen": -460.6669006347656, "logps/rejected": -402.1219787597656, "loss": 0.0797, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.047119617462158, "rewards/margins": 4.583317279815674, "rewards/rejected": -6.630436897277832, "step": 2510 }, { "epoch": 2.87, "learning_rate": 4.5278637770897836e-05, "logits/chosen": -4.028850555419922, "logits/rejected": -3.9303107261657715, "logps/chosen": -465.36480712890625, "logps/rejected": -412.0946350097656, "loss": 0.0821, "rewards/accuracies": 0.965624988079071, "rewards/chosen": -2.0194358825683594, "rewards/margins": 4.559321403503418, "rewards/rejected": -6.578756809234619, "step": 2520 }, { "epoch": 2.88, "learning_rate": 4.1408668730650154e-05, "logits/chosen": -4.0101799964904785, "logits/rejected": -3.906670331954956, "logps/chosen": -467.3497009277344, "logps/rejected": -402.9781188964844, "loss": 0.0841, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.1412062644958496, "rewards/margins": 4.536365032196045, "rewards/rejected": -6.6775712966918945, "step": 2530 }, { "epoch": 2.89, "learning_rate": 3.753869969040248e-05, "logits/chosen": -4.01845645904541, "logits/rejected": -3.9001305103302, "logps/chosen": -466.72772216796875, "logps/rejected": -396.435546875, "loss": 0.0866, "rewards/accuracies": 0.973437488079071, "rewards/chosen": -2.1858012676239014, "rewards/margins": 4.619623184204102, "rewards/rejected": -6.805423736572266, "step": 2540 }, { "epoch": 2.9, "learning_rate": 3.36687306501548e-05, "logits/chosen": -4.035871505737305, "logits/rejected": -3.88500714302063, "logps/chosen": -470.5638122558594, "logps/rejected": -394.48272705078125, "loss": 0.0669, "rewards/accuracies": 0.984375, "rewards/chosen": -2.1895978450775146, "rewards/margins": 4.679079532623291, "rewards/rejected": -6.868676662445068, "step": 2550 }, { "epoch": 2.91, "learning_rate": 2.979876160990712e-05, "logits/chosen": -4.027894020080566, "logits/rejected": -3.9059689044952393, "logps/chosen": -458.29412841796875, "logps/rejected": -401.7170104980469, "loss": 0.0832, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.199465274810791, "rewards/margins": 4.524439811706543, "rewards/rejected": -6.723905086517334, "step": 2560 }, { "epoch": 2.92, "learning_rate": 2.5928792569659445e-05, "logits/chosen": -4.037873268127441, "logits/rejected": -3.8585994243621826, "logps/chosen": -489.94873046875, "logps/rejected": -393.16204833984375, "loss": 0.065, "rewards/accuracies": 0.9859374761581421, "rewards/chosen": -2.1300439834594727, "rewards/margins": 4.796792030334473, "rewards/rejected": -6.9268364906311035, "step": 2570 }, { "epoch": 2.93, "learning_rate": 2.2058823529411766e-05, "logits/chosen": -4.008620262145996, "logits/rejected": -3.87896728515625, "logps/chosen": -494.171630859375, "logps/rejected": -409.16815185546875, "loss": 0.0632, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -2.1649792194366455, "rewards/margins": 4.822103977203369, "rewards/rejected": -6.987082481384277, "step": 2580 }, { "epoch": 2.94, "learning_rate": 1.8188854489164084e-05, "logits/chosen": -4.028036594390869, "logits/rejected": -3.8880081176757812, "logps/chosen": -466.32818603515625, "logps/rejected": -393.21124267578125, "loss": 0.1064, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": -2.2335736751556396, "rewards/margins": 4.52779483795166, "rewards/rejected": -6.761368751525879, "step": 2590 }, { "epoch": 2.96, "learning_rate": 1.431888544891641e-05, "logits/chosen": -4.004082679748535, "logits/rejected": -3.8907203674316406, "logps/chosen": -500.5557556152344, "logps/rejected": -434.0520935058594, "loss": 0.0689, "rewards/accuracies": 0.973437488079071, "rewards/chosen": -2.2163801193237305, "rewards/margins": 5.072004795074463, "rewards/rejected": -7.288384914398193, "step": 2600 }, { "epoch": 2.97, "learning_rate": 1.044891640866873e-05, "logits/chosen": -3.995713710784912, "logits/rejected": -3.8929927349090576, "logps/chosen": -477.29351806640625, "logps/rejected": -410.3609924316406, "loss": 0.0877, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.259779930114746, "rewards/margins": 4.659885883331299, "rewards/rejected": -6.919666290283203, "step": 2610 }, { "epoch": 2.98, "learning_rate": 6.578947368421052e-06, "logits/chosen": -4.023438453674316, "logits/rejected": -3.9143142700195312, "logps/chosen": -485.7718811035156, "logps/rejected": -423.95135498046875, "loss": 0.0703, "rewards/accuracies": 0.9765625, "rewards/chosen": -2.3461365699768066, "rewards/margins": 4.72307825088501, "rewards/rejected": -7.0692138671875, "step": 2620 }, { "epoch": 2.99, "learning_rate": 2.708978328173375e-06, "logits/chosen": -4.030308723449707, "logits/rejected": -3.9030086994171143, "logps/chosen": -496.8736267089844, "logps/rejected": -424.7303161621094, "loss": 0.0693, "rewards/accuracies": 0.979687511920929, "rewards/chosen": -2.1652159690856934, "rewards/margins": 4.87652063369751, "rewards/rejected": -7.041736602783203, "step": 2630 }, { "epoch": 3.0, "step": 2637, "total_flos": 0.0, "train_loss": 0.3021476837690915, "train_runtime": 113394.3571, "train_samples_per_second": 1.489, "train_steps_per_second": 0.023 } ], "logging_steps": 10, "max_steps": 2637, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "trial_name": null, "trial_params": null }