{ "best_metric": 0.4712187647819519, "best_model_checkpoint": "/mnt/yscfs/zhuchiwei/realquestions/ckpt/250212_realquestions_dpo/checkpoint-700", "epoch": 0.99968, "eval_steps": 100, "global_step": 781, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00128, "grad_norm": 7.436504551716031, "learning_rate": 8.860759493670886e-09, "logits/chosen": -1.0859375, "logits/rejected": -1.10498046875, "logps/chosen": -336.5, "logps/rejected": -339.5, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.00256, "grad_norm": 7.250237903929739, "learning_rate": 1.772151898734177e-08, "logits/chosen": -1.134765625, "logits/rejected": -1.11767578125, "logps/chosen": -329.75, "logps/rejected": -317.75, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.00384, "grad_norm": 7.662424410669392, "learning_rate": 2.658227848101266e-08, "logits/chosen": -1.1162109375, "logits/rejected": -1.1435546875, "logps/chosen": -334.5, "logps/rejected": -305.75, "loss": 0.6918, "rewards/accuracies": 0.328125, "rewards/chosen": -0.0010561943054199219, "rewards/margins": 0.0004811286926269531, "rewards/rejected": -0.0015385150909423828, "step": 3 }, { "epoch": 0.00512, "grad_norm": 7.278826365213056, "learning_rate": 3.544303797468354e-08, "logits/chosen": -1.154296875, "logits/rejected": -1.173828125, "logps/chosen": -312.5, "logps/rejected": -320.625, "loss": 0.692, "rewards/accuracies": 0.34375, "rewards/chosen": 0.00021409988403320312, "rewards/margins": 5.698204040527344e-05, "rewards/rejected": 0.000156402587890625, "step": 4 }, { "epoch": 0.0064, "grad_norm": 7.667975226995807, "learning_rate": 4.430379746835443e-08, "logits/chosen": -1.1318359375, "logits/rejected": -1.1826171875, "logps/chosen": -335.0, "logps/rejected": -351.875, "loss": 0.6909, "rewards/accuracies": 0.328125, "rewards/chosen": -3.075599670410156e-05, "rewards/margins": 0.0016710758209228516, "rewards/rejected": -0.0016994476318359375, "step": 5 }, { "epoch": 0.00768, "grad_norm": 7.34407022526262, "learning_rate": 5.316455696202532e-08, "logits/chosen": -1.0146484375, "logits/rejected": -1.1123046875, "logps/chosen": -330.75, "logps/rejected": -351.5, "loss": 0.6926, "rewards/accuracies": 0.3046875, "rewards/chosen": 0.0008311271667480469, "rewards/margins": -0.0005159378051757812, "rewards/rejected": 0.0013489723205566406, "step": 6 }, { "epoch": 0.00896, "grad_norm": 6.79379213946981, "learning_rate": 6.20253164556962e-08, "logits/chosen": -1.044921875, "logits/rejected": -1.10986328125, "logps/chosen": -285.875, "logps/rejected": -287.0, "loss": 0.6924, "rewards/accuracies": 0.3046875, "rewards/chosen": -0.00012946128845214844, "rewards/margins": -0.00041294097900390625, "rewards/rejected": 0.0002841949462890625, "step": 7 }, { "epoch": 0.01024, "grad_norm": 7.527185660748829, "learning_rate": 7.088607594936708e-08, "logits/chosen": -1.12548828125, "logits/rejected": -1.17578125, "logps/chosen": -337.25, "logps/rejected": -321.0, "loss": 0.6914, "rewards/accuracies": 0.296875, "rewards/chosen": 0.0013508796691894531, "rewards/margins": 0.0003237724304199219, "rewards/rejected": 0.001026153564453125, "step": 8 }, { "epoch": 0.01152, "grad_norm": 7.076196519468638, "learning_rate": 7.974683544303797e-08, "logits/chosen": -1.1572265625, "logits/rejected": -1.1826171875, "logps/chosen": -297.0, "logps/rejected": -312.125, "loss": 0.6921, "rewards/accuracies": 0.234375, "rewards/chosen": 0.0007529258728027344, "rewards/margins": -0.0009369850158691406, "rewards/rejected": 0.0016903877258300781, "step": 9 }, { "epoch": 0.0128, "grad_norm": 7.318991452232657, "learning_rate": 8.860759493670886e-08, "logits/chosen": -1.10400390625, "logits/rejected": -1.1416015625, "logps/chosen": -318.75, "logps/rejected": -304.125, "loss": 0.6917, "rewards/accuracies": 0.390625, "rewards/chosen": 0.0015916824340820312, "rewards/margins": 0.0015668869018554688, "rewards/rejected": 2.9325485229492188e-05, "step": 10 }, { "epoch": 0.01408, "grad_norm": 7.525848702124104, "learning_rate": 9.746835443037974e-08, "logits/chosen": -1.1044921875, "logits/rejected": -1.09130859375, "logps/chosen": -341.25, "logps/rejected": -323.5, "loss": 0.6928, "rewards/accuracies": 0.2578125, "rewards/chosen": -0.0008592605590820312, "rewards/margins": -0.0011625289916992188, "rewards/rejected": 0.00030422210693359375, "step": 11 }, { "epoch": 0.01536, "grad_norm": 7.187235726257762, "learning_rate": 1.0632911392405063e-07, "logits/chosen": -1.09619140625, "logits/rejected": -1.15283203125, "logps/chosen": -327.25, "logps/rejected": -326.25, "loss": 0.6926, "rewards/accuracies": 0.328125, "rewards/chosen": -0.0012707710266113281, "rewards/margins": -0.00025081634521484375, "rewards/rejected": -0.001018524169921875, "step": 12 }, { "epoch": 0.01664, "grad_norm": 7.376759243506061, "learning_rate": 1.151898734177215e-07, "logits/chosen": -1.130859375, "logits/rejected": -1.1220703125, "logps/chosen": -317.75, "logps/rejected": -320.0, "loss": 0.6917, "rewards/accuracies": 0.3203125, "rewards/chosen": 5.7220458984375e-05, "rewards/margins": 0.0014476776123046875, "rewards/rejected": -0.0013861656188964844, "step": 13 }, { "epoch": 0.01792, "grad_norm": 7.321841205322695, "learning_rate": 1.240506329113924e-07, "logits/chosen": -1.11669921875, "logits/rejected": -1.146484375, "logps/chosen": -298.5, "logps/rejected": -291.875, "loss": 0.6915, "rewards/accuracies": 0.3203125, "rewards/chosen": 0.0023250579833984375, "rewards/margins": 0.00077056884765625, "rewards/rejected": 0.0015516281127929688, "step": 14 }, { "epoch": 0.0192, "grad_norm": 7.513669747884291, "learning_rate": 1.329113924050633e-07, "logits/chosen": -1.02978515625, "logits/rejected": -1.06396484375, "logps/chosen": -348.5, "logps/rejected": -348.0, "loss": 0.6909, "rewards/accuracies": 0.2890625, "rewards/chosen": 0.00205230712890625, "rewards/margins": 0.001857757568359375, "rewards/rejected": 0.00019502639770507812, "step": 15 }, { "epoch": 0.02048, "grad_norm": 7.56834980768175, "learning_rate": 1.4177215189873417e-07, "logits/chosen": -1.10009765625, "logits/rejected": -1.123046875, "logps/chosen": -331.125, "logps/rejected": -330.25, "loss": 0.6918, "rewards/accuracies": 0.34375, "rewards/chosen": 0.0020537376403808594, "rewards/margins": 0.0016803741455078125, "rewards/rejected": 0.0003743171691894531, "step": 16 }, { "epoch": 0.02176, "grad_norm": 7.361758673679262, "learning_rate": 1.5063291139240505e-07, "logits/chosen": -1.201171875, "logits/rejected": -1.1767578125, "logps/chosen": -337.375, "logps/rejected": -321.5, "loss": 0.6929, "rewards/accuracies": 0.3203125, "rewards/chosen": -0.001008749008178711, "rewards/margins": -0.001155853271484375, "rewards/rejected": 0.00014638900756835938, "step": 17 }, { "epoch": 0.02304, "grad_norm": 7.632190356013242, "learning_rate": 1.5949367088607593e-07, "logits/chosen": -1.1279296875, "logits/rejected": -1.18115234375, "logps/chosen": -324.25, "logps/rejected": -323.75, "loss": 0.6913, "rewards/accuracies": 0.3671875, "rewards/chosen": 0.002650022506713867, "rewards/margins": 0.0017561912536621094, "rewards/rejected": 0.0008993148803710938, "step": 18 }, { "epoch": 0.02432, "grad_norm": 7.567509047244517, "learning_rate": 1.6835443037974684e-07, "logits/chosen": -1.2041015625, "logits/rejected": -1.126953125, "logps/chosen": -334.125, "logps/rejected": -280.0, "loss": 0.6913, "rewards/accuracies": 0.3359375, "rewards/chosen": 0.0007448196411132812, "rewards/margins": 0.00107574462890625, "rewards/rejected": -0.00033283233642578125, "step": 19 }, { "epoch": 0.0256, "grad_norm": 7.324051502003322, "learning_rate": 1.7721518987341772e-07, "logits/chosen": -1.09814453125, "logits/rejected": -1.10302734375, "logps/chosen": -304.5, "logps/rejected": -293.875, "loss": 0.6924, "rewards/accuracies": 0.234375, "rewards/chosen": -0.0005500316619873047, "rewards/margins": -0.0008497238159179688, "rewards/rejected": 0.00030422210693359375, "step": 20 }, { "epoch": 0.02688, "grad_norm": 7.1047505475300525, "learning_rate": 1.8607594936708857e-07, "logits/chosen": -1.06591796875, "logits/rejected": -1.103515625, "logps/chosen": -343.75, "logps/rejected": -317.625, "loss": 0.6921, "rewards/accuracies": 0.3515625, "rewards/chosen": 0.0008325576782226562, "rewards/margins": -5.53131103515625e-05, "rewards/rejected": 0.0008883476257324219, "step": 21 }, { "epoch": 0.02816, "grad_norm": 6.910170627811144, "learning_rate": 1.9493670886075948e-07, "logits/chosen": -1.08251953125, "logits/rejected": -1.1064453125, "logps/chosen": -296.5, "logps/rejected": -283.375, "loss": 0.6908, "rewards/accuracies": 0.4140625, "rewards/chosen": 0.0032749176025390625, "rewards/margins": 0.003100872039794922, "rewards/rejected": 0.00017690658569335938, "step": 22 }, { "epoch": 0.02944, "grad_norm": 7.159911065415018, "learning_rate": 2.0379746835443036e-07, "logits/chosen": -1.1181640625, "logits/rejected": -1.17578125, "logps/chosen": -322.5, "logps/rejected": -323.5, "loss": 0.6921, "rewards/accuracies": 0.3515625, "rewards/chosen": 0.0003407001495361328, "rewards/margins": 0.0001850128173828125, "rewards/rejected": 0.0001583099365234375, "step": 23 }, { "epoch": 0.03072, "grad_norm": 7.159118745120893, "learning_rate": 2.1265822784810127e-07, "logits/chosen": -1.1064453125, "logits/rejected": -1.12890625, "logps/chosen": -317.375, "logps/rejected": -315.75, "loss": 0.6919, "rewards/accuracies": 0.3203125, "rewards/chosen": 0.0022192001342773438, "rewards/margins": 0.0004982948303222656, "rewards/rejected": 0.0017242431640625, "step": 24 }, { "epoch": 0.032, "grad_norm": 7.381100080259422, "learning_rate": 2.2151898734177215e-07, "logits/chosen": -1.0849609375, "logits/rejected": -1.11376953125, "logps/chosen": -320.625, "logps/rejected": -313.5, "loss": 0.6933, "rewards/accuracies": 0.3515625, "rewards/chosen": 0.005059480667114258, "rewards/margins": -0.00042819976806640625, "rewards/rejected": 0.005497932434082031, "step": 25 }, { "epoch": 0.03328, "grad_norm": 7.218282400256934, "learning_rate": 2.30379746835443e-07, "logits/chosen": -1.166015625, "logits/rejected": -1.185546875, "logps/chosen": -326.875, "logps/rejected": -331.875, "loss": 0.6916, "rewards/accuracies": 0.328125, "rewards/chosen": 0.004558563232421875, "rewards/margins": 0.0010230541229248047, "rewards/rejected": 0.0035305023193359375, "step": 26 }, { "epoch": 0.03456, "grad_norm": 7.239082966066987, "learning_rate": 2.392405063291139e-07, "logits/chosen": -1.1005859375, "logits/rejected": -1.15283203125, "logps/chosen": -342.75, "logps/rejected": -328.875, "loss": 0.6924, "rewards/accuracies": 0.328125, "rewards/chosen": 0.004992961883544922, "rewards/margins": -2.574920654296875e-05, "rewards/rejected": 0.0050220489501953125, "step": 27 }, { "epoch": 0.03584, "grad_norm": 6.948014033726085, "learning_rate": 2.481012658227848e-07, "logits/chosen": -1.05908203125, "logits/rejected": -1.0693359375, "logps/chosen": -321.625, "logps/rejected": -285.125, "loss": 0.6935, "rewards/accuracies": 0.296875, "rewards/chosen": 0.0019271373748779297, "rewards/margins": -0.002063751220703125, "rewards/rejected": 0.003989458084106445, "step": 28 }, { "epoch": 0.03712, "grad_norm": 6.869465257961058, "learning_rate": 2.5696202531645567e-07, "logits/chosen": -1.162109375, "logits/rejected": -1.15380859375, "logps/chosen": -307.5, "logps/rejected": -294.625, "loss": 0.6923, "rewards/accuracies": 0.328125, "rewards/chosen": 0.00728607177734375, "rewards/margins": 5.91278076171875e-05, "rewards/rejected": 0.00720977783203125, "step": 29 }, { "epoch": 0.0384, "grad_norm": 7.337023896045187, "learning_rate": 2.658227848101266e-07, "logits/chosen": -1.017578125, "logits/rejected": -1.07275390625, "logps/chosen": -327.25, "logps/rejected": -343.0, "loss": 0.6921, "rewards/accuracies": 0.3671875, "rewards/chosen": 0.0086822509765625, "rewards/margins": 5.5789947509765625e-05, "rewards/rejected": 0.008625030517578125, "step": 30 }, { "epoch": 0.03968, "grad_norm": 7.347791617859354, "learning_rate": 2.7468354430379743e-07, "logits/chosen": -1.18359375, "logits/rejected": -1.2197265625, "logps/chosen": -311.5, "logps/rejected": -307.0, "loss": 0.6923, "rewards/accuracies": 0.359375, "rewards/chosen": 0.01105499267578125, "rewards/margins": 0.0011830329895019531, "rewards/rejected": 0.009868621826171875, "step": 31 }, { "epoch": 0.04096, "grad_norm": 7.498901433929662, "learning_rate": 2.8354430379746834e-07, "logits/chosen": -1.14111328125, "logits/rejected": -1.11865234375, "logps/chosen": -334.5, "logps/rejected": -310.75, "loss": 0.6923, "rewards/accuracies": 0.359375, "rewards/chosen": 0.009601593017578125, "rewards/margins": 0.0008537769317626953, "rewards/rejected": 0.008741378784179688, "step": 32 }, { "epoch": 0.04224, "grad_norm": 7.169131545571104, "learning_rate": 2.9240506329113925e-07, "logits/chosen": -1.10888671875, "logits/rejected": -1.11962890625, "logps/chosen": -329.25, "logps/rejected": -319.375, "loss": 0.6912, "rewards/accuracies": 0.4140625, "rewards/chosen": 0.013051986694335938, "rewards/margins": 0.0033140182495117188, "rewards/rejected": 0.00975799560546875, "step": 33 }, { "epoch": 0.04352, "grad_norm": 7.268573668937982, "learning_rate": 3.012658227848101e-07, "logits/chosen": -1.12109375, "logits/rejected": -1.13037109375, "logps/chosen": -322.25, "logps/rejected": -304.0, "loss": 0.6923, "rewards/accuracies": 0.3984375, "rewards/chosen": 0.011199951171875, "rewards/margins": 0.0003910064697265625, "rewards/rejected": 0.0108184814453125, "step": 34 }, { "epoch": 0.0448, "grad_norm": 7.598910436962618, "learning_rate": 3.1012658227848096e-07, "logits/chosen": -1.06591796875, "logits/rejected": -1.1396484375, "logps/chosen": -311.375, "logps/rejected": -331.5, "loss": 0.6931, "rewards/accuracies": 0.390625, "rewards/chosen": 0.01214599609375, "rewards/margins": -0.0003960132598876953, "rewards/rejected": 0.012542724609375, "step": 35 }, { "epoch": 0.04608, "grad_norm": 7.108926141259724, "learning_rate": 3.1898734177215186e-07, "logits/chosen": -1.10205078125, "logits/rejected": -1.1240234375, "logps/chosen": -324.25, "logps/rejected": -304.125, "loss": 0.6923, "rewards/accuracies": 0.3671875, "rewards/chosen": 0.0137176513671875, "rewards/margins": 0.001232147216796875, "rewards/rejected": 0.012485504150390625, "step": 36 }, { "epoch": 0.04736, "grad_norm": 7.267204751248262, "learning_rate": 3.2784810126582277e-07, "logits/chosen": -1.04638671875, "logits/rejected": -1.07470703125, "logps/chosen": -304.0, "logps/rejected": -331.25, "loss": 0.6912, "rewards/accuracies": 0.3984375, "rewards/chosen": 0.013946533203125, "rewards/margins": 0.00335693359375, "rewards/rejected": 0.010589599609375, "step": 37 }, { "epoch": 0.04864, "grad_norm": 7.22576003364743, "learning_rate": 3.367088607594937e-07, "logits/chosen": -1.1435546875, "logits/rejected": -1.1552734375, "logps/chosen": -338.0, "logps/rejected": -320.125, "loss": 0.6921, "rewards/accuracies": 0.390625, "rewards/chosen": 0.0144500732421875, "rewards/margins": 0.000946044921875, "rewards/rejected": 0.01351165771484375, "step": 38 }, { "epoch": 0.04992, "grad_norm": 6.91155869405154, "learning_rate": 3.4556962025316453e-07, "logits/chosen": -1.02978515625, "logits/rejected": -1.1005859375, "logps/chosen": -301.125, "logps/rejected": -309.0, "loss": 0.6938, "rewards/accuracies": 0.3359375, "rewards/chosen": 0.013141632080078125, "rewards/margins": -0.00186920166015625, "rewards/rejected": 0.0149993896484375, "step": 39 }, { "epoch": 0.0512, "grad_norm": 7.5651212389735845, "learning_rate": 3.5443037974683544e-07, "logits/chosen": -1.11279296875, "logits/rejected": -1.18359375, "logps/chosen": -301.375, "logps/rejected": -328.0, "loss": 0.693, "rewards/accuracies": 0.359375, "rewards/chosen": 0.013530731201171875, "rewards/margins": -0.0004057884216308594, "rewards/rejected": 0.0139312744140625, "step": 40 }, { "epoch": 0.05248, "grad_norm": 7.055346161775772, "learning_rate": 3.632911392405063e-07, "logits/chosen": -1.1689453125, "logits/rejected": -1.19482421875, "logps/chosen": -325.75, "logps/rejected": -307.5, "loss": 0.6915, "rewards/accuracies": 0.40625, "rewards/chosen": 0.017852783203125, "rewards/margins": 0.002711772918701172, "rewards/rejected": 0.01515960693359375, "step": 41 }, { "epoch": 0.05376, "grad_norm": 7.034415420766986, "learning_rate": 3.7215189873417715e-07, "logits/chosen": -1.14501953125, "logits/rejected": -1.1533203125, "logps/chosen": -342.0, "logps/rejected": -316.25, "loss": 0.6912, "rewards/accuracies": 0.40625, "rewards/chosen": 0.019439697265625, "rewards/margins": 0.0026378631591796875, "rewards/rejected": 0.01682281494140625, "step": 42 }, { "epoch": 0.05504, "grad_norm": 7.974604803897025, "learning_rate": 3.810126582278481e-07, "logits/chosen": -1.1513671875, "logits/rejected": -1.1669921875, "logps/chosen": -364.25, "logps/rejected": -370.125, "loss": 0.6921, "rewards/accuracies": 0.421875, "rewards/chosen": 0.01983642578125, "rewards/margins": 0.0025014877319335938, "rewards/rejected": 0.01732635498046875, "step": 43 }, { "epoch": 0.05632, "grad_norm": 7.177077737022184, "learning_rate": 3.8987341772151896e-07, "logits/chosen": -1.0830078125, "logits/rejected": -1.11474609375, "logps/chosen": -337.0, "logps/rejected": -321.375, "loss": 0.6909, "rewards/accuracies": 0.4296875, "rewards/chosen": 0.0222625732421875, "rewards/margins": 0.0036296844482421875, "rewards/rejected": 0.01862335205078125, "step": 44 }, { "epoch": 0.0576, "grad_norm": 7.092550989605114, "learning_rate": 3.9873417721518987e-07, "logits/chosen": -1.208984375, "logits/rejected": -1.1796875, "logps/chosen": -336.75, "logps/rejected": -338.5, "loss": 0.691, "rewards/accuracies": 0.4453125, "rewards/chosen": 0.0211334228515625, "rewards/margins": 0.0034885406494140625, "rewards/rejected": 0.01763916015625, "step": 45 }, { "epoch": 0.05888, "grad_norm": 7.466264419127936, "learning_rate": 4.075949367088607e-07, "logits/chosen": -1.0966796875, "logits/rejected": -1.126953125, "logps/chosen": -321.0, "logps/rejected": -334.75, "loss": 0.6891, "rewards/accuracies": 0.578125, "rewards/chosen": 0.0274810791015625, "rewards/margins": 0.0086822509765625, "rewards/rejected": 0.0187835693359375, "step": 46 }, { "epoch": 0.06016, "grad_norm": 6.985552122485807, "learning_rate": 4.164556962025316e-07, "logits/chosen": -1.12109375, "logits/rejected": -1.1279296875, "logps/chosen": -303.0, "logps/rejected": -297.875, "loss": 0.6915, "rewards/accuracies": 0.421875, "rewards/chosen": 0.0264129638671875, "rewards/margins": 0.003119945526123047, "rewards/rejected": 0.02330780029296875, "step": 47 }, { "epoch": 0.06144, "grad_norm": 7.176358655196464, "learning_rate": 4.2531645569620254e-07, "logits/chosen": -1.1015625, "logits/rejected": -1.11083984375, "logps/chosen": -340.5, "logps/rejected": -318.75, "loss": 0.6901, "rewards/accuracies": 0.5, "rewards/chosen": 0.028717041015625, "rewards/margins": 0.0058269500732421875, "rewards/rejected": 0.0229034423828125, "step": 48 }, { "epoch": 0.06272, "grad_norm": 7.100421828155237, "learning_rate": 4.341772151898734e-07, "logits/chosen": -1.2080078125, "logits/rejected": -1.2109375, "logps/chosen": -339.75, "logps/rejected": -332.25, "loss": 0.6915, "rewards/accuracies": 0.3984375, "rewards/chosen": 0.0296630859375, "rewards/margins": 0.00295257568359375, "rewards/rejected": 0.0267333984375, "step": 49 }, { "epoch": 0.064, "grad_norm": 7.057468496585091, "learning_rate": 4.430379746835443e-07, "logits/chosen": -1.115234375, "logits/rejected": -1.14599609375, "logps/chosen": -306.75, "logps/rejected": -277.0, "loss": 0.6878, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.03302001953125, "rewards/margins": 0.00989532470703125, "rewards/rejected": 0.02313232421875, "step": 50 }, { "epoch": 0.06528, "grad_norm": 7.03326897520388, "learning_rate": 4.5189873417721515e-07, "logits/chosen": -1.1220703125, "logits/rejected": -1.1650390625, "logps/chosen": -292.5, "logps/rejected": -301.625, "loss": 0.6909, "rewards/accuracies": 0.4453125, "rewards/chosen": 0.0316925048828125, "rewards/margins": 0.0028066635131835938, "rewards/rejected": 0.0289306640625, "step": 51 }, { "epoch": 0.06656, "grad_norm": 6.957761014625216, "learning_rate": 4.60759493670886e-07, "logits/chosen": -1.1015625, "logits/rejected": -1.1357421875, "logps/chosen": -309.75, "logps/rejected": -317.75, "loss": 0.6904, "rewards/accuracies": 0.4296875, "rewards/chosen": 0.034149169921875, "rewards/margins": 0.004019737243652344, "rewards/rejected": 0.0301361083984375, "step": 52 }, { "epoch": 0.06784, "grad_norm": 6.866772048798592, "learning_rate": 4.6962025316455697e-07, "logits/chosen": -1.2236328125, "logits/rejected": -1.23828125, "logps/chosen": -313.0, "logps/rejected": -290.625, "loss": 0.6903, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.036773681640625, "rewards/margins": 0.0050182342529296875, "rewards/rejected": 0.0317535400390625, "step": 53 }, { "epoch": 0.06912, "grad_norm": 34.49051291265523, "learning_rate": 4.784810126582278e-07, "logits/chosen": -1.1630859375, "logits/rejected": -1.1513671875, "logps/chosen": -297.375, "logps/rejected": -443.875, "loss": 0.6864, "rewards/accuracies": 0.4765625, "rewards/chosen": 0.0362701416015625, "rewards/margins": 0.02279949188232422, "rewards/rejected": 0.013580322265625, "step": 54 }, { "epoch": 0.0704, "grad_norm": 7.450083821669119, "learning_rate": 4.873417721518987e-07, "logits/chosen": -1.126953125, "logits/rejected": -1.216796875, "logps/chosen": -303.125, "logps/rejected": -338.25, "loss": 0.6942, "rewards/accuracies": 0.3359375, "rewards/chosen": 0.03314208984375, "rewards/margins": -0.002353191375732422, "rewards/rejected": 0.0355072021484375, "step": 55 }, { "epoch": 0.07168, "grad_norm": 7.328024111375669, "learning_rate": 4.962025316455696e-07, "logits/chosen": -1.1318359375, "logits/rejected": -1.12451171875, "logps/chosen": -327.75, "logps/rejected": -300.75, "loss": 0.6884, "rewards/accuracies": 0.546875, "rewards/chosen": 0.0408172607421875, "rewards/margins": 0.008729934692382812, "rewards/rejected": 0.0321197509765625, "step": 56 }, { "epoch": 0.07296, "grad_norm": 6.641731690681104, "learning_rate": 5.050632911392404e-07, "logits/chosen": -1.19482421875, "logits/rejected": -1.208984375, "logps/chosen": -274.0, "logps/rejected": -292.5, "loss": 0.6896, "rewards/accuracies": 0.5, "rewards/chosen": 0.0372772216796875, "rewards/margins": 0.0062408447265625, "rewards/rejected": 0.0310516357421875, "step": 57 }, { "epoch": 0.07424, "grad_norm": 7.328502114098263, "learning_rate": 5.139240506329113e-07, "logits/chosen": -1.2109375, "logits/rejected": -1.220703125, "logps/chosen": -337.25, "logps/rejected": -324.25, "loss": 0.6887, "rewards/accuracies": 0.4453125, "rewards/chosen": 0.0395965576171875, "rewards/margins": 0.008788108825683594, "rewards/rejected": 0.03082275390625, "step": 58 }, { "epoch": 0.07552, "grad_norm": 7.129804240589812, "learning_rate": 5.227848101265822e-07, "logits/chosen": -1.1884765625, "logits/rejected": -1.2099609375, "logps/chosen": -304.5, "logps/rejected": -306.75, "loss": 0.6885, "rewards/accuracies": 0.5078125, "rewards/chosen": 0.0443572998046875, "rewards/margins": 0.008890151977539062, "rewards/rejected": 0.035400390625, "step": 59 }, { "epoch": 0.0768, "grad_norm": 7.1937924938108875, "learning_rate": 5.316455696202532e-07, "logits/chosen": -1.07861328125, "logits/rejected": -1.03759765625, "logps/chosen": -322.25, "logps/rejected": -299.375, "loss": 0.6874, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.046478271484375, "rewards/margins": 0.011920928955078125, "rewards/rejected": 0.0345611572265625, "step": 60 }, { "epoch": 0.07808, "grad_norm": 7.129369707890816, "learning_rate": 5.405063291139241e-07, "logits/chosen": -1.171875, "logits/rejected": -1.15380859375, "logps/chosen": -317.25, "logps/rejected": -275.125, "loss": 0.685, "rewards/accuracies": 0.546875, "rewards/chosen": 0.05120849609375, "rewards/margins": 0.01607513427734375, "rewards/rejected": 0.0351715087890625, "step": 61 }, { "epoch": 0.07936, "grad_norm": 7.302643657053502, "learning_rate": 5.493670886075949e-07, "logits/chosen": -1.189453125, "logits/rejected": -1.2587890625, "logps/chosen": -330.75, "logps/rejected": -345.5, "loss": 0.691, "rewards/accuracies": 0.421875, "rewards/chosen": 0.0424652099609375, "rewards/margins": 0.003490447998046875, "rewards/rejected": 0.03900146484375, "step": 62 }, { "epoch": 0.08064, "grad_norm": 6.84968399915786, "learning_rate": 5.582278481012658e-07, "logits/chosen": -1.087890625, "logits/rejected": -1.119140625, "logps/chosen": -326.125, "logps/rejected": -308.625, "loss": 0.6886, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.0484619140625, "rewards/margins": 0.009546279907226562, "rewards/rejected": 0.038909912109375, "step": 63 }, { "epoch": 0.08192, "grad_norm": 7.274670055824019, "learning_rate": 5.670886075949367e-07, "logits/chosen": -1.1708984375, "logits/rejected": -1.19140625, "logps/chosen": -324.25, "logps/rejected": -328.875, "loss": 0.6859, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.054931640625, "rewards/margins": 0.013892173767089844, "rewards/rejected": 0.04095458984375, "step": 64 }, { "epoch": 0.0832, "grad_norm": 7.043191769175182, "learning_rate": 5.759493670886076e-07, "logits/chosen": -1.1728515625, "logits/rejected": -1.193359375, "logps/chosen": -321.75, "logps/rejected": -315.0, "loss": 0.6874, "rewards/accuracies": 0.53125, "rewards/chosen": 0.050048828125, "rewards/margins": 0.011362075805664062, "rewards/rejected": 0.0386810302734375, "step": 65 }, { "epoch": 0.08448, "grad_norm": 7.1747985320738445, "learning_rate": 5.848101265822785e-07, "logits/chosen": -1.19921875, "logits/rejected": -1.224609375, "logps/chosen": -331.5, "logps/rejected": -333.5, "loss": 0.6893, "rewards/accuracies": 0.4375, "rewards/chosen": 0.05633544921875, "rewards/margins": 0.007636070251464844, "rewards/rejected": 0.04864501953125, "step": 66 }, { "epoch": 0.08576, "grad_norm": 6.706114883500065, "learning_rate": 5.936708860759493e-07, "logits/chosen": -1.1787109375, "logits/rejected": -1.17626953125, "logps/chosen": -327.0, "logps/rejected": -296.875, "loss": 0.6849, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05401611328125, "rewards/margins": 0.016617774963378906, "rewards/rejected": 0.0373687744140625, "step": 67 }, { "epoch": 0.08704, "grad_norm": 6.815414755393516, "learning_rate": 6.025316455696202e-07, "logits/chosen": -1.1162109375, "logits/rejected": -1.1396484375, "logps/chosen": -315.75, "logps/rejected": -303.5, "loss": 0.689, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.0550537109375, "rewards/margins": 0.00931549072265625, "rewards/rejected": 0.0457763671875, "step": 68 }, { "epoch": 0.08832, "grad_norm": 7.17640719707697, "learning_rate": 6.113924050632911e-07, "logits/chosen": -1.169921875, "logits/rejected": -1.22314453125, "logps/chosen": -345.5, "logps/rejected": -337.75, "loss": 0.6857, "rewards/accuracies": 0.53125, "rewards/chosen": 0.05963134765625, "rewards/margins": 0.015005111694335938, "rewards/rejected": 0.0446624755859375, "step": 69 }, { "epoch": 0.0896, "grad_norm": 7.148882260506765, "learning_rate": 6.202531645569619e-07, "logits/chosen": -1.193359375, "logits/rejected": -1.171875, "logps/chosen": -324.25, "logps/rejected": -319.25, "loss": 0.6857, "rewards/accuracies": 0.578125, "rewards/chosen": 0.058624267578125, "rewards/margins": 0.015472412109375, "rewards/rejected": 0.043121337890625, "step": 70 }, { "epoch": 0.09088, "grad_norm": 6.965207472135811, "learning_rate": 6.291139240506329e-07, "logits/chosen": -1.1845703125, "logits/rejected": -1.189453125, "logps/chosen": -320.75, "logps/rejected": -302.625, "loss": 0.6872, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.059600830078125, "rewards/margins": 0.011350154876708984, "rewards/rejected": 0.0481719970703125, "step": 71 }, { "epoch": 0.09216, "grad_norm": 6.690590284756765, "learning_rate": 6.379746835443037e-07, "logits/chosen": -1.2001953125, "logits/rejected": -1.1767578125, "logps/chosen": -311.125, "logps/rejected": -314.25, "loss": 0.6867, "rewards/accuracies": 0.4921875, "rewards/chosen": 0.058197021484375, "rewards/margins": 0.012401580810546875, "rewards/rejected": 0.045806884765625, "step": 72 }, { "epoch": 0.09344, "grad_norm": 6.995206172986862, "learning_rate": 6.468354430379746e-07, "logits/chosen": -1.185546875, "logits/rejected": -1.2392578125, "logps/chosen": -306.125, "logps/rejected": -320.25, "loss": 0.692, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.054534912109375, "rewards/margins": 0.002094268798828125, "rewards/rejected": 0.052459716796875, "step": 73 }, { "epoch": 0.09472, "grad_norm": 7.197486018158177, "learning_rate": 6.556962025316455e-07, "logits/chosen": -1.189453125, "logits/rejected": -1.18701171875, "logps/chosen": -321.75, "logps/rejected": -315.25, "loss": 0.6816, "rewards/accuracies": 0.6328125, "rewards/chosen": 0.06298828125, "rewards/margins": 0.02439117431640625, "rewards/rejected": 0.038543701171875, "step": 74 }, { "epoch": 0.096, "grad_norm": 7.062298755275947, "learning_rate": 6.645569620253163e-07, "logits/chosen": -1.17529296875, "logits/rejected": -1.18359375, "logps/chosen": -337.625, "logps/rejected": -304.875, "loss": 0.6865, "rewards/accuracies": 0.5859375, "rewards/chosen": 0.068359375, "rewards/margins": 0.01427459716796875, "rewards/rejected": 0.0540771484375, "step": 75 }, { "epoch": 0.09728, "grad_norm": 6.9747053464578155, "learning_rate": 6.734177215189874e-07, "logits/chosen": -1.17431640625, "logits/rejected": -1.197265625, "logps/chosen": -320.25, "logps/rejected": -305.0, "loss": 0.6862, "rewards/accuracies": 0.59375, "rewards/chosen": 0.06768798828125, "rewards/margins": 0.014251708984375, "rewards/rejected": 0.053436279296875, "step": 76 }, { "epoch": 0.09856, "grad_norm": 7.0914726140322015, "learning_rate": 6.822784810126582e-07, "logits/chosen": -1.25, "logits/rejected": -1.28515625, "logps/chosen": -302.625, "logps/rejected": -311.5, "loss": 0.6864, "rewards/accuracies": 0.5625, "rewards/chosen": 0.06793212890625, "rewards/margins": 0.014862060546875, "rewards/rejected": 0.0531005859375, "step": 77 }, { "epoch": 0.09984, "grad_norm": 6.924658831045798, "learning_rate": 6.911392405063291e-07, "logits/chosen": -1.1904296875, "logits/rejected": -1.2080078125, "logps/chosen": -312.375, "logps/rejected": -312.75, "loss": 0.6879, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.064971923828125, "rewards/margins": 0.010891914367675781, "rewards/rejected": 0.054107666015625, "step": 78 }, { "epoch": 0.10112, "grad_norm": 6.617651432015416, "learning_rate": 7e-07, "logits/chosen": -1.220703125, "logits/rejected": -1.2080078125, "logps/chosen": -307.5, "logps/rejected": -288.125, "loss": 0.6858, "rewards/accuracies": 0.5703125, "rewards/chosen": 0.06640625, "rewards/margins": 0.01545858383178711, "rewards/rejected": 0.050933837890625, "step": 79 }, { "epoch": 0.1024, "grad_norm": 7.148212562285202, "learning_rate": 6.999964952031891e-07, "logits/chosen": -1.142578125, "logits/rejected": -1.13818359375, "logps/chosen": -340.5, "logps/rejected": -325.375, "loss": 0.6825, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.069793701171875, "rewards/margins": 0.021467208862304688, "rewards/rejected": 0.04840087890625, "step": 80 }, { "epoch": 0.10368, "grad_norm": 6.912997090681548, "learning_rate": 6.999859808829482e-07, "logits/chosen": -1.212890625, "logits/rejected": -1.1943359375, "logps/chosen": -329.125, "logps/rejected": -302.75, "loss": 0.6813, "rewards/accuracies": 0.6171875, "rewards/chosen": 0.071319580078125, "rewards/margins": 0.024139404296875, "rewards/rejected": 0.047149658203125, "step": 81 }, { "epoch": 0.10496, "grad_norm": 6.642590263041201, "learning_rate": 6.999684572498524e-07, "logits/chosen": -1.251953125, "logits/rejected": -1.2294921875, "logps/chosen": -306.5, "logps/rejected": -282.875, "loss": 0.6837, "rewards/accuracies": 0.5625, "rewards/chosen": 0.070587158203125, "rewards/margins": 0.020760536193847656, "rewards/rejected": 0.0496978759765625, "step": 82 }, { "epoch": 0.10624, "grad_norm": 7.156833321052252, "learning_rate": 6.99943924654854e-07, "logits/chosen": -1.20947265625, "logits/rejected": -1.240234375, "logps/chosen": -299.5, "logps/rejected": -319.5, "loss": 0.6876, "rewards/accuracies": 0.484375, "rewards/chosen": 0.07061767578125, "rewards/margins": 0.011600494384765625, "rewards/rejected": 0.058990478515625, "step": 83 }, { "epoch": 0.10752, "grad_norm": 7.161233013169767, "learning_rate": 6.999123835892781e-07, "logits/chosen": -1.2470703125, "logits/rejected": -1.2265625, "logps/chosen": -361.625, "logps/rejected": -346.875, "loss": 0.6774, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.078857421875, "rewards/margins": 0.03279876708984375, "rewards/rejected": 0.04608154296875, "step": 84 }, { "epoch": 0.1088, "grad_norm": 7.380079541175457, "learning_rate": 6.998738346848098e-07, "logits/chosen": -1.169921875, "logits/rejected": -1.171875, "logps/chosen": -321.75, "logps/rejected": -313.0, "loss": 0.6758, "rewards/accuracies": 0.640625, "rewards/chosen": 0.076507568359375, "rewards/margins": 0.0360107421875, "rewards/rejected": 0.04058837890625, "step": 85 }, { "epoch": 0.11008, "grad_norm": 6.787264499218527, "learning_rate": 6.998282787134845e-07, "logits/chosen": -1.2353515625, "logits/rejected": -1.2216796875, "logps/chosen": -307.625, "logps/rejected": -280.625, "loss": 0.681, "rewards/accuracies": 0.640625, "rewards/chosen": 0.07275390625, "rewards/margins": 0.025938034057617188, "rewards/rejected": 0.046783447265625, "step": 86 }, { "epoch": 0.11136, "grad_norm": 7.070691711467475, "learning_rate": 6.997757165876698e-07, "logits/chosen": -1.212890625, "logits/rejected": -1.21484375, "logps/chosen": -333.5, "logps/rejected": -326.5, "loss": 0.681, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.075775146484375, "rewards/margins": 0.024099349975585938, "rewards/rejected": 0.05169677734375, "step": 87 }, { "epoch": 0.11264, "grad_norm": 7.281268067802746, "learning_rate": 6.997161493600493e-07, "logits/chosen": -1.2333984375, "logits/rejected": -1.20751953125, "logps/chosen": -342.375, "logps/rejected": -297.75, "loss": 0.6748, "rewards/accuracies": 0.609375, "rewards/chosen": 0.0849609375, "rewards/margins": 0.038787841796875, "rewards/rejected": 0.0461883544921875, "step": 88 }, { "epoch": 0.11392, "grad_norm": 6.721540641608089, "learning_rate": 6.996495782236003e-07, "logits/chosen": -1.1689453125, "logits/rejected": -1.1826171875, "logps/chosen": -284.5, "logps/rejected": -302.75, "loss": 0.6877, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.063812255859375, "rewards/margins": 0.011150360107421875, "rewards/rejected": 0.052581787109375, "step": 89 }, { "epoch": 0.1152, "grad_norm": 6.658821074174305, "learning_rate": 6.9957600451157e-07, "logits/chosen": -1.2216796875, "logits/rejected": -1.2529296875, "logps/chosen": -289.0, "logps/rejected": -299.5, "loss": 0.6808, "rewards/accuracies": 0.6171875, "rewards/chosen": 0.0616455078125, "rewards/margins": 0.025604248046875, "rewards/rejected": 0.03614044189453125, "step": 90 }, { "epoch": 0.11648, "grad_norm": 6.960320737670953, "learning_rate": 6.994954296974495e-07, "logits/chosen": -1.23388671875, "logits/rejected": -1.263671875, "logps/chosen": -302.5, "logps/rejected": -310.25, "loss": 0.6793, "rewards/accuracies": 0.59375, "rewards/chosen": 0.07379150390625, "rewards/margins": 0.029428482055664062, "rewards/rejected": 0.044342041015625, "step": 91 }, { "epoch": 0.11776, "grad_norm": 7.049238366581128, "learning_rate": 6.994078553949439e-07, "logits/chosen": -1.2294921875, "logits/rejected": -1.267578125, "logps/chosen": -313.625, "logps/rejected": -294.0, "loss": 0.6731, "rewards/accuracies": 0.6328125, "rewards/chosen": 0.0853271484375, "rewards/margins": 0.0420074462890625, "rewards/rejected": 0.04345703125, "step": 92 }, { "epoch": 0.11904, "grad_norm": 6.649581467509272, "learning_rate": 6.993132833579392e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.2236328125, "logps/chosen": -287.5, "logps/rejected": -288.25, "loss": 0.6779, "rewards/accuracies": 0.625, "rewards/chosen": 0.075164794921875, "rewards/margins": 0.03218841552734375, "rewards/rejected": 0.04302978515625, "step": 93 }, { "epoch": 0.12032, "grad_norm": 6.872841249887952, "learning_rate": 6.992117154804688e-07, "logits/chosen": -1.1748046875, "logits/rejected": -1.224609375, "logps/chosen": -314.25, "logps/rejected": -305.0, "loss": 0.6812, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.07373046875, "rewards/margins": 0.026458740234375, "rewards/rejected": 0.0472412109375, "step": 94 }, { "epoch": 0.1216, "grad_norm": 7.327496061016414, "learning_rate": 6.99103153796674e-07, "logits/chosen": -1.173828125, "logits/rejected": -1.1796875, "logps/chosen": -337.25, "logps/rejected": -300.75, "loss": 0.6748, "rewards/accuracies": 0.671875, "rewards/chosen": 0.07318115234375, "rewards/margins": 0.0394134521484375, "rewards/rejected": 0.03388214111328125, "step": 95 }, { "epoch": 0.12288, "grad_norm": 7.333095642704951, "learning_rate": 6.989876004807644e-07, "logits/chosen": -1.2060546875, "logits/rejected": -1.2041015625, "logps/chosen": -344.75, "logps/rejected": -315.0, "loss": 0.6733, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.07806396484375, "rewards/margins": 0.04097175598144531, "rewards/rejected": 0.0370635986328125, "step": 96 }, { "epoch": 0.12416, "grad_norm": 7.054266672465839, "learning_rate": 6.988650578469735e-07, "logits/chosen": -1.2177734375, "logits/rejected": -1.2255859375, "logps/chosen": -326.25, "logps/rejected": -359.0, "loss": 0.6661, "rewards/accuracies": 0.65625, "rewards/chosen": 0.071868896484375, "rewards/margins": 0.05510711669921875, "rewards/rejected": 0.016735076904296875, "step": 97 }, { "epoch": 0.12544, "grad_norm": 7.188205202679432, "learning_rate": 6.98735528349513e-07, "logits/chosen": -1.18212890625, "logits/rejected": -1.2021484375, "logps/chosen": -273.875, "logps/rejected": -304.0, "loss": 0.6746, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.059661865234375, "rewards/margins": 0.03852081298828125, "rewards/rejected": 0.02120208740234375, "step": 98 }, { "epoch": 0.12672, "grad_norm": 7.374652320682456, "learning_rate": 6.985990145825232e-07, "logits/chosen": -1.13818359375, "logits/rejected": -1.1669921875, "logps/chosen": -330.25, "logps/rejected": -322.0, "loss": 0.6701, "rewards/accuracies": 0.703125, "rewards/chosen": 0.074493408203125, "rewards/margins": 0.04855918884277344, "rewards/rejected": 0.02597808837890625, "step": 99 }, { "epoch": 0.128, "grad_norm": 7.010945640873385, "learning_rate": 6.984555192800215e-07, "logits/chosen": -1.244140625, "logits/rejected": -1.296875, "logps/chosen": -314.75, "logps/rejected": -322.5, "loss": 0.6711, "rewards/accuracies": 0.703125, "rewards/chosen": 0.076141357421875, "rewards/margins": 0.0458831787109375, "rewards/rejected": 0.030277252197265625, "step": 100 }, { "epoch": 0.128, "eval_logits/chosen": -1.169921875, "eval_logits/rejected": -1.2216796875, "eval_logps/chosen": -314.125, "eval_logps/rejected": -299.0625, "eval_loss": 0.6733124852180481, "eval_rewards/accuracies": 0.649609386920929, "eval_rewards/chosen": 0.0625, "eval_rewards/margins": 0.04170989990234375, "eval_rewards/rejected": 0.020813941955566406, "eval_runtime": 27.7049, "eval_samples_per_second": 18.047, "eval_steps_per_second": 0.578, "step": 100 }, { "epoch": 0.12928, "grad_norm": 6.562841834429094, "learning_rate": 6.983050453158471e-07, "logits/chosen": -1.18994140625, "logits/rejected": -1.171875, "logps/chosen": -294.0, "logps/rejected": -260.25, "loss": 0.6775, "rewards/accuracies": 0.625, "rewards/chosen": 0.058380126953125, "rewards/margins": 0.0331878662109375, "rewards/rejected": 0.025234222412109375, "step": 101 }, { "epoch": 0.13056, "grad_norm": 7.311489748652196, "learning_rate": 6.981475957036038e-07, "logits/chosen": -1.18359375, "logits/rejected": -1.1904296875, "logps/chosen": -319.0, "logps/rejected": -322.5, "loss": 0.6697, "rewards/accuracies": 0.6171875, "rewards/chosen": 0.061248779296875, "rewards/margins": 0.049468994140625, "rewards/rejected": 0.011905670166015625, "step": 102 }, { "epoch": 0.13184, "grad_norm": 6.969731388418404, "learning_rate": 6.979831735965997e-07, "logits/chosen": -1.19921875, "logits/rejected": -1.2529296875, "logps/chosen": -312.5, "logps/rejected": -329.0, "loss": 0.6769, "rewards/accuracies": 0.609375, "rewards/chosen": 0.057342529296875, "rewards/margins": 0.03508758544921875, "rewards/rejected": 0.02228546142578125, "step": 103 }, { "epoch": 0.13312, "grad_norm": 7.382805496910107, "learning_rate": 6.978117822877838e-07, "logits/chosen": -1.1396484375, "logits/rejected": -1.162109375, "logps/chosen": -346.0, "logps/rejected": -328.25, "loss": 0.6633, "rewards/accuracies": 0.6796875, "rewards/chosen": 0.0740966796875, "rewards/margins": 0.0634307861328125, "rewards/rejected": 0.010618209838867188, "step": 104 }, { "epoch": 0.1344, "grad_norm": 7.529482042313781, "learning_rate": 6.976334252096801e-07, "logits/chosen": -1.2216796875, "logits/rejected": -1.26123046875, "logps/chosen": -304.875, "logps/rejected": -337.625, "loss": 0.6731, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0610198974609375, "rewards/margins": 0.043567657470703125, "rewards/rejected": 0.0174560546875, "step": 105 }, { "epoch": 0.13568, "grad_norm": 7.287196640480614, "learning_rate": 6.974481059343188e-07, "logits/chosen": -1.240234375, "logits/rejected": -1.224609375, "logps/chosen": -338.25, "logps/rejected": -301.125, "loss": 0.6694, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0517730712890625, "rewards/margins": 0.0513458251953125, "rewards/rejected": 0.000377655029296875, "step": 106 }, { "epoch": 0.13696, "grad_norm": 7.457935260315348, "learning_rate": 6.972558281731654e-07, "logits/chosen": -1.18359375, "logits/rejected": -1.2529296875, "logps/chosen": -308.125, "logps/rejected": -343.875, "loss": 0.6727, "rewards/accuracies": 0.6015625, "rewards/chosen": 0.03688812255859375, "rewards/margins": 0.04468536376953125, "rewards/rejected": -0.007733345031738281, "step": 107 }, { "epoch": 0.13824, "grad_norm": 7.110683172273849, "learning_rate": 6.970565957770455e-07, "logits/chosen": -1.2783203125, "logits/rejected": -1.279296875, "logps/chosen": -325.375, "logps/rejected": -300.75, "loss": 0.6628, "rewards/accuracies": 0.7109375, "rewards/chosen": 0.04693603515625, "rewards/margins": 0.06512451171875, "rewards/rejected": -0.01806640625, "step": 108 }, { "epoch": 0.13952, "grad_norm": 7.4393288593299935, "learning_rate": 6.96850412736068e-07, "logits/chosen": -1.1689453125, "logits/rejected": -1.20166015625, "logps/chosen": -293.75, "logps/rejected": -323.25, "loss": 0.6661, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03907012939453125, "rewards/margins": 0.056720733642578125, "rewards/rejected": -0.0177459716796875, "step": 109 }, { "epoch": 0.1408, "grad_norm": 7.027414643492787, "learning_rate": 6.96637283179545e-07, "logits/chosen": -1.1953125, "logits/rejected": -1.2119140625, "logps/chosen": -319.25, "logps/rejected": -312.0, "loss": 0.6673, "rewards/accuracies": 0.625, "rewards/chosen": 0.024829864501953125, "rewards/margins": 0.054996490478515625, "rewards/rejected": -0.030157089233398438, "step": 110 }, { "epoch": 0.14208, "grad_norm": 7.3061024388552065, "learning_rate": 6.9641721137591e-07, "logits/chosen": -1.1865234375, "logits/rejected": -1.1982421875, "logps/chosen": -347.5, "logps/rejected": -333.5, "loss": 0.653, "rewards/accuracies": 0.703125, "rewards/chosen": 0.02840423583984375, "rewards/margins": 0.085723876953125, "rewards/rejected": -0.057373046875, "step": 111 }, { "epoch": 0.14336, "grad_norm": 7.218209771794371, "learning_rate": 6.961902017326311e-07, "logits/chosen": -1.14892578125, "logits/rejected": -1.22119140625, "logps/chosen": -290.5, "logps/rejected": -310.375, "loss": 0.6562, "rewards/accuracies": 0.703125, "rewards/chosen": 0.0299224853515625, "rewards/margins": 0.07830810546875, "rewards/rejected": -0.0483551025390625, "step": 112 }, { "epoch": 0.14464, "grad_norm": 7.635227167652353, "learning_rate": 6.959562587961234e-07, "logits/chosen": -1.14794921875, "logits/rejected": -1.17919921875, "logps/chosen": -305.5, "logps/rejected": -329.25, "loss": 0.658, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0113372802734375, "rewards/margins": 0.076263427734375, "rewards/rejected": -0.06497573852539062, "step": 113 }, { "epoch": 0.14592, "grad_norm": 7.086183935410638, "learning_rate": 6.957153872516586e-07, "logits/chosen": -1.1669921875, "logits/rejected": -1.236328125, "logps/chosen": -334.25, "logps/rejected": -324.25, "loss": 0.667, "rewards/accuracies": 0.671875, "rewards/chosen": 0.0051097869873046875, "rewards/margins": 0.05683135986328125, "rewards/rejected": -0.05169677734375, "step": 114 }, { "epoch": 0.1472, "grad_norm": 6.902888342176391, "learning_rate": 6.954675919232694e-07, "logits/chosen": -1.20703125, "logits/rejected": -1.23828125, "logps/chosen": -307.75, "logps/rejected": -292.125, "loss": 0.6636, "rewards/accuracies": 0.640625, "rewards/chosen": 0.0019683837890625, "rewards/margins": 0.064910888671875, "rewards/rejected": -0.06283187866210938, "step": 115 }, { "epoch": 0.14848, "grad_norm": 6.9983913393596735, "learning_rate": 6.95212877773655e-07, "logits/chosen": -1.1494140625, "logits/rejected": -1.16650390625, "logps/chosen": -316.25, "logps/rejected": -320.25, "loss": 0.6636, "rewards/accuracies": 0.625, "rewards/chosen": -0.01812744140625, "rewards/margins": 0.06414794921875, "rewards/rejected": -0.082275390625, "step": 116 }, { "epoch": 0.14976, "grad_norm": 7.039439125767687, "learning_rate": 6.949512499040799e-07, "logits/chosen": -1.205078125, "logits/rejected": -1.2041015625, "logps/chosen": -314.0, "logps/rejected": -309.75, "loss": 0.6702, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.03998565673828125, "rewards/margins": 0.051842689514160156, "rewards/rejected": -0.091796875, "step": 117 }, { "epoch": 0.15104, "grad_norm": 7.433508336249601, "learning_rate": 6.946827135542728e-07, "logits/chosen": -1.09521484375, "logits/rejected": -1.140625, "logps/chosen": -310.25, "logps/rejected": -334.625, "loss": 0.6686, "rewards/accuracies": 0.640625, "rewards/chosen": -0.051239013671875, "rewards/margins": 0.05425071716308594, "rewards/rejected": -0.105438232421875, "step": 118 }, { "epoch": 0.15232, "grad_norm": 7.340968519858034, "learning_rate": 6.944072741023215e-07, "logits/chosen": -1.1787109375, "logits/rejected": -1.2099609375, "logps/chosen": -325.75, "logps/rejected": -330.5, "loss": 0.6539, "rewards/accuracies": 0.734375, "rewards/chosen": -0.0429840087890625, "rewards/margins": 0.084930419921875, "rewards/rejected": -0.1279296875, "step": 119 }, { "epoch": 0.1536, "grad_norm": 7.136653505773104, "learning_rate": 6.941249370645649e-07, "logits/chosen": -1.1865234375, "logits/rejected": -1.171875, "logps/chosen": -329.25, "logps/rejected": -323.75, "loss": 0.6649, "rewards/accuracies": 0.6484375, "rewards/chosen": -0.05987548828125, "rewards/margins": 0.06435394287109375, "rewards/rejected": -0.12432861328125, "step": 120 }, { "epoch": 0.15488, "grad_norm": 7.355481505262627, "learning_rate": 6.938357080954826e-07, "logits/chosen": -1.05419921875, "logits/rejected": -1.076171875, "logps/chosen": -334.25, "logps/rejected": -338.25, "loss": 0.6473, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.03987884521484375, "rewards/margins": 0.09857177734375, "rewards/rejected": -0.1387939453125, "step": 121 }, { "epoch": 0.15616, "grad_norm": 7.263830648748775, "learning_rate": 6.935395929875821e-07, "logits/chosen": -1.17333984375, "logits/rejected": -1.1708984375, "logps/chosen": -335.75, "logps/rejected": -332.0, "loss": 0.6578, "rewards/accuracies": 0.625, "rewards/chosen": -0.0830535888671875, "rewards/margins": 0.077911376953125, "rewards/rejected": -0.16094970703125, "step": 122 }, { "epoch": 0.15744, "grad_norm": 8.404772457823935, "learning_rate": 6.932365976712819e-07, "logits/chosen": -1.09716796875, "logits/rejected": -1.13525390625, "logps/chosen": -315.875, "logps/rejected": -359.0, "loss": 0.6414, "rewards/accuracies": 0.71875, "rewards/chosen": -0.076934814453125, "rewards/margins": 0.115936279296875, "rewards/rejected": -0.19287109375, "step": 123 }, { "epoch": 0.15872, "grad_norm": 7.621326463499134, "learning_rate": 6.929267282147936e-07, "logits/chosen": -1.07373046875, "logits/rejected": -1.11767578125, "logps/chosen": -344.125, "logps/rejected": -355.0, "loss": 0.6457, "rewards/accuracies": 0.7421875, "rewards/chosen": -0.08642578125, "rewards/margins": 0.1038818359375, "rewards/rejected": -0.1903076171875, "step": 124 }, { "epoch": 0.16, "grad_norm": 7.788922626286128, "learning_rate": 6.926099908240002e-07, "logits/chosen": -1.142578125, "logits/rejected": -1.1640625, "logps/chosen": -328.5, "logps/rejected": -356.75, "loss": 0.6583, "rewards/accuracies": 0.65625, "rewards/chosen": -0.102294921875, "rewards/margins": 0.0783233642578125, "rewards/rejected": -0.18048095703125, "step": 125 }, { "epoch": 0.16128, "grad_norm": 7.473164766096368, "learning_rate": 6.922863918423311e-07, "logits/chosen": -1.12744140625, "logits/rejected": -1.1103515625, "logps/chosen": -343.25, "logps/rejected": -337.75, "loss": 0.6575, "rewards/accuracies": 0.703125, "rewards/chosen": -0.12322998046875, "rewards/margins": 0.080108642578125, "rewards/rejected": -0.2030029296875, "step": 126 }, { "epoch": 0.16256, "grad_norm": 7.64697192150379, "learning_rate": 6.919559377506359e-07, "logits/chosen": -1.177734375, "logits/rejected": -1.21484375, "logps/chosen": -333.5, "logps/rejected": -354.75, "loss": 0.6485, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.13275146484375, "rewards/margins": 0.10205078125, "rewards/rejected": -0.2347412109375, "step": 127 }, { "epoch": 0.16384, "grad_norm": 7.253778836602147, "learning_rate": 6.916186351670546e-07, "logits/chosen": -1.12353515625, "logits/rejected": -1.16650390625, "logps/chosen": -318.625, "logps/rejected": -313.5, "loss": 0.6458, "rewards/accuracies": 0.6875, "rewards/chosen": -0.16357421875, "rewards/margins": 0.106414794921875, "rewards/rejected": -0.2698974609375, "step": 128 }, { "epoch": 0.16512, "grad_norm": 7.447630373123696, "learning_rate": 6.91274490846884e-07, "logits/chosen": -1.12158203125, "logits/rejected": -1.10791015625, "logps/chosen": -333.75, "logps/rejected": -306.625, "loss": 0.6538, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.17535400390625, "rewards/margins": 0.088592529296875, "rewards/rejected": -0.263916015625, "step": 129 }, { "epoch": 0.1664, "grad_norm": 7.4679011809033815, "learning_rate": 6.90923511682444e-07, "logits/chosen": -1.1396484375, "logits/rejected": -1.171875, "logps/chosen": -331.75, "logps/rejected": -361.5, "loss": 0.6447, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.19451904296875, "rewards/margins": 0.112060546875, "rewards/rejected": -0.3065185546875, "step": 130 }, { "epoch": 0.16768, "grad_norm": 7.362515597925838, "learning_rate": 6.905657047029384e-07, "logits/chosen": -1.087890625, "logits/rejected": -1.1240234375, "logps/chosen": -324.75, "logps/rejected": -308.875, "loss": 0.6387, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.2166748046875, "rewards/margins": 0.126708984375, "rewards/rejected": -0.34326171875, "step": 131 }, { "epoch": 0.16896, "grad_norm": 7.770937607646877, "learning_rate": 6.90201077074314e-07, "logits/chosen": -1.1044921875, "logits/rejected": -1.1640625, "logps/chosen": -329.25, "logps/rejected": -347.25, "loss": 0.6436, "rewards/accuracies": 0.671875, "rewards/chosen": -0.2451171875, "rewards/margins": 0.11639404296875, "rewards/rejected": -0.36181640625, "step": 132 }, { "epoch": 0.17024, "grad_norm": 7.943995279771987, "learning_rate": 6.898296360991182e-07, "logits/chosen": -1.10205078125, "logits/rejected": -1.13330078125, "logps/chosen": -356.25, "logps/rejected": -357.25, "loss": 0.6337, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.2764892578125, "rewards/margins": 0.13845062255859375, "rewards/rejected": -0.414794921875, "step": 133 }, { "epoch": 0.17152, "grad_norm": 8.15643307389567, "learning_rate": 6.894513892163518e-07, "logits/chosen": -1.0361328125, "logits/rejected": -1.05517578125, "logps/chosen": -372.0, "logps/rejected": -356.75, "loss": 0.6553, "rewards/accuracies": 0.640625, "rewards/chosen": -0.291015625, "rewards/margins": 0.0922698974609375, "rewards/rejected": -0.38330078125, "step": 134 }, { "epoch": 0.1728, "grad_norm": 8.402149118568568, "learning_rate": 6.890663440013204e-07, "logits/chosen": -1.07666015625, "logits/rejected": -1.1103515625, "logps/chosen": -367.25, "logps/rejected": -380.5, "loss": 0.636, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3759765625, "rewards/margins": 0.14380645751953125, "rewards/rejected": -0.519287109375, "step": 135 }, { "epoch": 0.17408, "grad_norm": 7.676163099722455, "learning_rate": 6.886745081654823e-07, "logits/chosen": -1.06005859375, "logits/rejected": -1.076171875, "logps/chosen": -365.25, "logps/rejected": -364.125, "loss": 0.6389, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.3699951171875, "rewards/margins": 0.129241943359375, "rewards/rejected": -0.498779296875, "step": 136 }, { "epoch": 0.17536, "grad_norm": 7.792721927404631, "learning_rate": 6.882758895562949e-07, "logits/chosen": -0.97998046875, "logits/rejected": -1.05126953125, "logps/chosen": -360.25, "logps/rejected": -366.25, "loss": 0.6345, "rewards/accuracies": 0.7109375, "rewards/chosen": -0.42626953125, "rewards/margins": 0.145538330078125, "rewards/rejected": -0.572021484375, "step": 137 }, { "epoch": 0.17664, "grad_norm": 13.220416758370622, "learning_rate": 6.878704961570564e-07, "logits/chosen": -0.9208984375, "logits/rejected": -0.9052734375, "logps/chosen": -427.75, "logps/rejected": -412.75, "loss": 0.6566, "rewards/accuracies": 0.65625, "rewards/chosen": -0.642822265625, "rewards/margins": 0.10161972045898438, "rewards/rejected": -0.744140625, "step": 138 }, { "epoch": 0.17792, "grad_norm": 8.66931437809148, "learning_rate": 6.874583360867468e-07, "logits/chosen": -0.8505859375, "logits/rejected": -0.88330078125, "logps/chosen": -399.0, "logps/rejected": -415.0, "loss": 0.6271, "rewards/accuracies": 0.703125, "rewards/chosen": -0.6767578125, "rewards/margins": 0.1732177734375, "rewards/rejected": -0.849609375, "step": 139 }, { "epoch": 0.1792, "grad_norm": 8.526440571507557, "learning_rate": 6.87039417599865e-07, "logits/chosen": -0.875, "logits/rejected": -0.91796875, "logps/chosen": -376.25, "logps/rejected": -390.25, "loss": 0.6212, "rewards/accuracies": 0.6796875, "rewards/chosen": -0.6337890625, "rewards/margins": 0.18011474609375, "rewards/rejected": -0.8134765625, "step": 140 }, { "epoch": 0.18048, "grad_norm": 9.392422913412458, "learning_rate": 6.866137490862636e-07, "logits/chosen": -0.78857421875, "logits/rejected": -0.8232421875, "logps/chosen": -392.25, "logps/rejected": -399.25, "loss": 0.6189, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.6796875, "rewards/margins": 0.19610595703125, "rewards/rejected": -0.87646484375, "step": 141 }, { "epoch": 0.18176, "grad_norm": 9.326150506259697, "learning_rate": 6.861813390709803e-07, "logits/chosen": -0.71630859375, "logits/rejected": -0.78125, "logps/chosen": -369.5, "logps/rejected": -404.25, "loss": 0.6118, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7138671875, "rewards/margins": 0.21263885498046875, "rewards/rejected": -0.92529296875, "step": 142 }, { "epoch": 0.18304, "grad_norm": 8.384403124201002, "learning_rate": 6.857421962140681e-07, "logits/chosen": -0.8466796875, "logits/rejected": -0.87451171875, "logps/chosen": -408.5, "logps/rejected": -402.0, "loss": 0.6247, "rewards/accuracies": 0.6640625, "rewards/chosen": -0.72265625, "rewards/margins": 0.19293212890625, "rewards/rejected": -0.91552734375, "step": 143 }, { "epoch": 0.18432, "grad_norm": 9.388640196159205, "learning_rate": 6.852963293104211e-07, "logits/chosen": -0.94677734375, "logits/rejected": -0.9345703125, "logps/chosen": -389.25, "logps/rejected": -378.0, "loss": 0.621, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6689453125, "rewards/margins": 0.1993408203125, "rewards/rejected": -0.86865234375, "step": 144 }, { "epoch": 0.1856, "grad_norm": 11.03326645275443, "learning_rate": 6.848437472895988e-07, "logits/chosen": -0.9150390625, "logits/rejected": -1.00146484375, "logps/chosen": -386.75, "logps/rejected": -431.25, "loss": 0.5984, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.642578125, "rewards/margins": 0.234375, "rewards/rejected": -0.876953125, "step": 145 }, { "epoch": 0.18688, "grad_norm": 61.60360857590808, "learning_rate": 6.843844592156471e-07, "logits/chosen": -0.83984375, "logits/rejected": -0.96044921875, "logps/chosen": -348.5, "logps/rejected": -454.75, "loss": 0.6328, "rewards/accuracies": 0.671875, "rewards/chosen": -0.7197265625, "rewards/margins": 0.17962646484375, "rewards/rejected": -0.89892578125, "step": 146 }, { "epoch": 0.18816, "grad_norm": 15.422965574908865, "learning_rate": 6.839184742869166e-07, "logits/chosen": -0.86328125, "logits/rejected": -0.85986328125, "logps/chosen": -401.25, "logps/rejected": -429.5, "loss": 0.5783, "rewards/accuracies": 0.6953125, "rewards/chosen": -0.79638671875, "rewards/margins": 0.29449462890625, "rewards/rejected": -1.08984375, "step": 147 }, { "epoch": 0.18944, "grad_norm": 13.374494698966174, "learning_rate": 6.834458018358787e-07, "logits/chosen": -0.8583984375, "logits/rejected": -0.83056640625, "logps/chosen": -438.25, "logps/rejected": -396.25, "loss": 0.6247, "rewards/accuracies": 0.671875, "rewards/chosen": -0.9375, "rewards/margins": 0.221160888671875, "rewards/rejected": -1.15869140625, "step": 148 }, { "epoch": 0.19072, "grad_norm": 10.50579381133434, "learning_rate": 6.829664513289386e-07, "logits/chosen": -0.6806640625, "logits/rejected": -0.696533203125, "logps/chosen": -429.25, "logps/rejected": -445.0, "loss": 0.6204, "rewards/accuracies": 0.6875, "rewards/chosen": -1.02392578125, "rewards/margins": 0.21826171875, "rewards/rejected": -1.24072265625, "step": 149 }, { "epoch": 0.192, "grad_norm": 8.912036503648771, "learning_rate": 6.824804323662456e-07, "logits/chosen": -0.833984375, "logits/rejected": -0.85009765625, "logps/chosen": -450.5, "logps/rejected": -462.75, "loss": 0.6157, "rewards/accuracies": 0.6640625, "rewards/chosen": -1.15869140625, "rewards/margins": 0.23162841796875, "rewards/rejected": -1.3916015625, "step": 150 }, { "epoch": 0.19328, "grad_norm": 8.776101733274905, "learning_rate": 6.819877546815008e-07, "logits/chosen": -0.8564453125, "logits/rejected": -0.875, "logps/chosen": -438.25, "logps/rejected": -447.25, "loss": 0.6202, "rewards/accuracies": 0.6953125, "rewards/chosen": -1.07470703125, "rewards/margins": 0.23724365234375, "rewards/rejected": -1.3115234375, "step": 151 }, { "epoch": 0.19456, "grad_norm": 10.578778147443705, "learning_rate": 6.814884281417626e-07, "logits/chosen": -0.8427734375, "logits/rejected": -0.8701171875, "logps/chosen": -434.75, "logps/rejected": -448.25, "loss": 0.591, "rewards/accuracies": 0.71875, "rewards/chosen": -1.03515625, "rewards/margins": 0.2916259765625, "rewards/rejected": -1.3251953125, "step": 152 }, { "epoch": 0.19584, "grad_norm": 8.049518104374286, "learning_rate": 6.809824627472483e-07, "logits/chosen": -0.8037109375, "logits/rejected": -0.8603515625, "logps/chosen": -419.5, "logps/rejected": -428.5, "loss": 0.5812, "rewards/accuracies": 0.75, "rewards/chosen": -1.02392578125, "rewards/margins": 0.31982421875, "rewards/rejected": -1.34375, "step": 153 }, { "epoch": 0.19712, "grad_norm": 8.793213481082436, "learning_rate": 6.804698686311346e-07, "logits/chosen": -0.87255859375, "logits/rejected": -0.85888671875, "logps/chosen": -437.75, "logps/rejected": -433.5, "loss": 0.6043, "rewards/accuracies": 0.734375, "rewards/chosen": -1.11376953125, "rewards/margins": 0.26483154296875, "rewards/rejected": -1.37939453125, "step": 154 }, { "epoch": 0.1984, "grad_norm": 17.16055737002578, "learning_rate": 6.79950656059354e-07, "logits/chosen": -0.888427734375, "logits/rejected": -0.934326171875, "logps/chosen": -470.75, "logps/rejected": -460.25, "loss": 0.5672, "rewards/accuracies": 0.7578125, "rewards/chosen": -1.1201171875, "rewards/margins": 0.353271484375, "rewards/rejected": -1.474609375, "step": 155 }, { "epoch": 0.19968, "grad_norm": 8.561528637211719, "learning_rate": 6.794248354303899e-07, "logits/chosen": -0.80615234375, "logits/rejected": -0.83203125, "logps/chosen": -441.75, "logps/rejected": -466.25, "loss": 0.6081, "rewards/accuracies": 0.6484375, "rewards/chosen": -1.259765625, "rewards/margins": 0.260406494140625, "rewards/rejected": -1.5205078125, "step": 156 }, { "epoch": 0.20096, "grad_norm": 8.69093146968424, "learning_rate": 6.788924172750679e-07, "logits/chosen": -0.87646484375, "logits/rejected": -0.9169921875, "logps/chosen": -437.5, "logps/rejected": -462.25, "loss": 0.5753, "rewards/accuracies": 0.6484375, "rewards/chosen": -1.1572265625, "rewards/margins": 0.3331298828125, "rewards/rejected": -1.48828125, "step": 157 }, { "epoch": 0.20224, "grad_norm": 8.31733029290186, "learning_rate": 6.783534122563447e-07, "logits/chosen": -0.7666015625, "logits/rejected": -0.8388671875, "logps/chosen": -427.5, "logps/rejected": -477.0, "loss": 0.5649, "rewards/accuracies": 0.75, "rewards/chosen": -1.2607421875, "rewards/margins": 0.38958740234375, "rewards/rejected": -1.6484375, "step": 158 }, { "epoch": 0.20352, "grad_norm": 11.171939635720042, "learning_rate": 6.77807831169095e-07, "logits/chosen": -0.802734375, "logits/rejected": -0.86181640625, "logps/chosen": -476.75, "logps/rejected": -498.0, "loss": 0.5918, "rewards/accuracies": 0.671875, "rewards/chosen": -1.4375, "rewards/margins": 0.305908203125, "rewards/rejected": -1.7431640625, "step": 159 }, { "epoch": 0.2048, "grad_norm": 8.281097141476181, "learning_rate": 6.772556849398952e-07, "logits/chosen": -0.843994140625, "logits/rejected": -0.884765625, "logps/chosen": -505.75, "logps/rejected": -551.75, "loss": 0.5537, "rewards/accuracies": 0.7265625, "rewards/chosen": -1.4814453125, "rewards/margins": 0.419677734375, "rewards/rejected": -1.9033203125, "step": 160 }, { "epoch": 0.20608, "grad_norm": 10.239198232842558, "learning_rate": 6.766969846268044e-07, "logits/chosen": -0.7734375, "logits/rejected": -0.830078125, "logps/chosen": -459.5, "logps/rejected": -488.75, "loss": 0.6061, "rewards/accuracies": 0.671875, "rewards/chosen": -1.57373046875, "rewards/margins": 0.25689697265625, "rewards/rejected": -1.830078125, "step": 161 }, { "epoch": 0.20736, "grad_norm": 9.747342835599794, "learning_rate": 6.761317414191428e-07, "logits/chosen": -0.8076171875, "logits/rejected": -0.8447265625, "logps/chosen": -483.75, "logps/rejected": -545.25, "loss": 0.5981, "rewards/accuracies": 0.6796875, "rewards/chosen": -1.779296875, "rewards/margins": 0.2919921875, "rewards/rejected": -2.072265625, "step": 162 }, { "epoch": 0.20864, "grad_norm": 9.525902367060457, "learning_rate": 6.755599666372684e-07, "logits/chosen": -0.80859375, "logits/rejected": -0.8984375, "logps/chosen": -454.5, "logps/rejected": -502.25, "loss": 0.582, "rewards/accuracies": 0.703125, "rewards/chosen": -1.671875, "rewards/margins": 0.3372802734375, "rewards/rejected": -2.0078125, "step": 163 }, { "epoch": 0.20992, "grad_norm": 10.81029592784359, "learning_rate": 6.749816717323492e-07, "logits/chosen": -0.7578125, "logits/rejected": -0.8056640625, "logps/chosen": -518.75, "logps/rejected": -571.0, "loss": 0.5573, "rewards/accuracies": 0.7578125, "rewards/chosen": -1.8740234375, "rewards/margins": 0.4305419921875, "rewards/rejected": -2.3046875, "step": 164 }, { "epoch": 0.2112, "grad_norm": 8.72875920033335, "learning_rate": 6.743968682861345e-07, "logits/chosen": -0.75830078125, "logits/rejected": -0.82470703125, "logps/chosen": -456.5, "logps/rejected": -518.25, "loss": 0.5457, "rewards/accuracies": 0.7421875, "rewards/chosen": -1.642578125, "rewards/margins": 0.4796142578125, "rewards/rejected": -2.123046875, "step": 165 }, { "epoch": 0.21248, "grad_norm": 12.52887713385305, "learning_rate": 6.738055680107232e-07, "logits/chosen": -0.8310546875, "logits/rejected": -0.83740234375, "logps/chosen": -502.75, "logps/rejected": -525.25, "loss": 0.6151, "rewards/accuracies": 0.6015625, "rewards/chosen": -1.794921875, "rewards/margins": 0.2906494140625, "rewards/rejected": -2.083984375, "step": 166 }, { "epoch": 0.21376, "grad_norm": 12.573683619458626, "learning_rate": 6.732077827483283e-07, "logits/chosen": -0.79052734375, "logits/rejected": -0.7919921875, "logps/chosen": -510.75, "logps/rejected": -537.5, "loss": 0.6154, "rewards/accuracies": 0.6796875, "rewards/chosen": -1.8447265625, "rewards/margins": 0.3406219482421875, "rewards/rejected": -2.185546875, "step": 167 }, { "epoch": 0.21504, "grad_norm": 8.275606690031259, "learning_rate": 6.726035244710405e-07, "logits/chosen": -0.796142578125, "logits/rejected": -0.814453125, "logps/chosen": -465.5, "logps/rejected": -497.0, "loss": 0.5702, "rewards/accuracies": 0.6640625, "rewards/chosen": -1.51953125, "rewards/margins": 0.4149169921875, "rewards/rejected": -1.93359375, "step": 168 }, { "epoch": 0.21632, "grad_norm": 10.111116876816661, "learning_rate": 6.719928052805885e-07, "logits/chosen": -0.83642578125, "logits/rejected": -0.841796875, "logps/chosen": -464.0, "logps/rejected": -512.75, "loss": 0.5484, "rewards/accuracies": 0.78125, "rewards/chosen": -1.36328125, "rewards/margins": 0.4599609375, "rewards/rejected": -1.82421875, "step": 169 }, { "epoch": 0.2176, "grad_norm": 14.001498683826862, "learning_rate": 6.713756374080959e-07, "logits/chosen": -0.8994140625, "logits/rejected": -0.9580078125, "logps/chosen": -458.75, "logps/rejected": -498.5, "loss": 0.5747, "rewards/accuracies": 0.7265625, "rewards/chosen": -1.2998046875, "rewards/margins": 0.36627197265625, "rewards/rejected": -1.6669921875, "step": 170 }, { "epoch": 0.21888, "grad_norm": 13.154660100000475, "learning_rate": 6.70752033213837e-07, "logits/chosen": -0.88525390625, "logits/rejected": -0.93603515625, "logps/chosen": -467.0, "logps/rejected": -471.5, "loss": 0.5537, "rewards/accuracies": 0.765625, "rewards/chosen": -1.3125, "rewards/margins": 0.42266845703125, "rewards/rejected": -1.7353515625, "step": 171 }, { "epoch": 0.22016, "grad_norm": 11.432368493570536, "learning_rate": 6.70122005186989e-07, "logits/chosen": -0.79443359375, "logits/rejected": -0.85107421875, "logps/chosen": -481.5, "logps/rejected": -537.0, "loss": 0.5738, "rewards/accuracies": 0.703125, "rewards/chosen": -1.56640625, "rewards/margins": 0.3822021484375, "rewards/rejected": -1.9443359375, "step": 172 }, { "epoch": 0.22144, "grad_norm": 9.822098330205291, "learning_rate": 6.694855659453818e-07, "logits/chosen": -0.84765625, "logits/rejected": -0.84912109375, "logps/chosen": -511.75, "logps/rejected": -511.5, "loss": 0.5899, "rewards/accuracies": 0.6640625, "rewards/chosen": -1.70703125, "rewards/margins": 0.35321044921875, "rewards/rejected": -2.0595703125, "step": 173 }, { "epoch": 0.22272, "grad_norm": 15.790896268251576, "learning_rate": 6.688427282352449e-07, "logits/chosen": -0.755859375, "logits/rejected": -0.80126953125, "logps/chosen": -492.25, "logps/rejected": -520.5, "loss": 0.5308, "rewards/accuracies": 0.796875, "rewards/chosen": -1.7265625, "rewards/margins": 0.4969482421875, "rewards/rejected": -2.2255859375, "step": 174 }, { "epoch": 0.224, "grad_norm": 10.091430688703294, "learning_rate": 6.681935049309533e-07, "logits/chosen": -0.601318359375, "logits/rejected": -0.64306640625, "logps/chosen": -560.0, "logps/rejected": -612.5, "loss": 0.5713, "rewards/accuracies": 0.7265625, "rewards/chosen": -2.1953125, "rewards/margins": 0.515625, "rewards/rejected": -2.7109375, "step": 175 }, { "epoch": 0.22528, "grad_norm": 9.197272514290859, "learning_rate": 6.675379090347682e-07, "logits/chosen": -0.64501953125, "logits/rejected": -0.674560546875, "logps/chosen": -608.75, "logps/rejected": -654.0, "loss": 0.5305, "rewards/accuracies": 0.7421875, "rewards/chosen": -2.6953125, "rewards/margins": 0.660400390625, "rewards/rejected": -3.35546875, "step": 176 }, { "epoch": 0.22656, "grad_norm": 9.460572041512634, "learning_rate": 6.668759536765779e-07, "logits/chosen": -0.6484375, "logits/rejected": -0.671630859375, "logps/chosen": -600.5, "logps/rejected": -642.0, "loss": 0.567, "rewards/accuracies": 0.671875, "rewards/chosen": -2.6953125, "rewards/margins": 0.5928955078125, "rewards/rejected": -3.291015625, "step": 177 }, { "epoch": 0.22784, "grad_norm": 40.385948070557, "learning_rate": 6.662076521136337e-07, "logits/chosen": -0.5048828125, "logits/rejected": -0.554443359375, "logps/chosen": -604.0, "logps/rejected": -661.75, "loss": 0.5893, "rewards/accuracies": 0.7109375, "rewards/chosen": -2.9296875, "rewards/margins": 0.66015625, "rewards/rejected": -3.58984375, "step": 178 }, { "epoch": 0.22912, "grad_norm": 22.98261625779329, "learning_rate": 6.655330177302857e-07, "logits/chosen": -0.63525390625, "logits/rejected": -0.71435546875, "logps/chosen": -649.5, "logps/rejected": -725.0, "loss": 0.5748, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.234375, "rewards/margins": 0.593994140625, "rewards/rejected": -3.830078125, "step": 179 }, { "epoch": 0.2304, "grad_norm": 8.847574852157168, "learning_rate": 6.64852064037713e-07, "logits/chosen": -0.6103515625, "logits/rejected": -0.654296875, "logps/chosen": -607.0, "logps/rejected": -687.0, "loss": 0.5131, "rewards/accuracies": 0.75, "rewards/chosen": -2.97265625, "rewards/margins": 0.7249755859375, "rewards/rejected": -3.6953125, "step": 180 }, { "epoch": 0.23168, "grad_norm": 9.669090706723097, "learning_rate": 6.641648046736549e-07, "logits/chosen": -0.62060546875, "logits/rejected": -0.67236328125, "logps/chosen": -644.5, "logps/rejected": -689.5, "loss": 0.5648, "rewards/accuracies": 0.6796875, "rewards/chosen": -3.06640625, "rewards/margins": 0.6990966796875, "rewards/rejected": -3.765625, "step": 181 }, { "epoch": 0.23296, "grad_norm": 9.666414731608148, "learning_rate": 6.634712534021367e-07, "logits/chosen": -0.584716796875, "logits/rejected": -0.6240234375, "logps/chosen": -589.0, "logps/rejected": -635.5, "loss": 0.5225, "rewards/accuracies": 0.796875, "rewards/chosen": -2.626953125, "rewards/margins": 0.66064453125, "rewards/rejected": -3.291015625, "step": 182 }, { "epoch": 0.23424, "grad_norm": 9.206989504196308, "learning_rate": 6.627714241131942e-07, "logits/chosen": -0.568115234375, "logits/rejected": -0.587158203125, "logps/chosen": -609.25, "logps/rejected": -633.5, "loss": 0.5513, "rewards/accuracies": 0.7109375, "rewards/chosen": -2.73828125, "rewards/margins": 0.598876953125, "rewards/rejected": -3.333984375, "step": 183 }, { "epoch": 0.23552, "grad_norm": 39.055918050847936, "learning_rate": 6.620653308225959e-07, "logits/chosen": -0.563232421875, "logits/rejected": -0.642578125, "logps/chosen": -568.5, "logps/rejected": -614.5, "loss": 0.6409, "rewards/accuracies": 0.671875, "rewards/chosen": -2.60546875, "rewards/margins": 0.454833984375, "rewards/rejected": -3.060546875, "step": 184 }, { "epoch": 0.2368, "grad_norm": 15.55460973147395, "learning_rate": 6.613529876715619e-07, "logits/chosen": -0.669189453125, "logits/rejected": -0.71875, "logps/chosen": -591.5, "logps/rejected": -631.0, "loss": 0.529, "rewards/accuracies": 0.7578125, "rewards/chosen": -2.4326171875, "rewards/margins": 0.650634765625, "rewards/rejected": -3.080078125, "step": 185 }, { "epoch": 0.23808, "grad_norm": 8.09681757649616, "learning_rate": 6.606344089264805e-07, "logits/chosen": -0.70849609375, "logits/rejected": -0.7412109375, "logps/chosen": -568.25, "logps/rejected": -573.5, "loss": 0.5365, "rewards/accuracies": 0.703125, "rewards/chosen": -2.1474609375, "rewards/margins": 0.56982421875, "rewards/rejected": -2.716796875, "step": 186 }, { "epoch": 0.23936, "grad_norm": 8.312763295596206, "learning_rate": 6.599096089786234e-07, "logits/chosen": -0.8388671875, "logits/rejected": -0.873046875, "logps/chosen": -519.5, "logps/rejected": -573.25, "loss": 0.5779, "rewards/accuracies": 0.7421875, "rewards/chosen": -2.048828125, "rewards/margins": 0.4176025390625, "rewards/rejected": -2.4677734375, "step": 187 }, { "epoch": 0.24064, "grad_norm": 8.587134768192069, "learning_rate": 6.591786023438564e-07, "logits/chosen": -0.638519287109375, "logits/rejected": -0.7158203125, "logps/chosen": -489.75, "logps/rejected": -564.75, "loss": 0.5731, "rewards/accuracies": 0.7421875, "rewards/chosen": -1.8544921875, "rewards/margins": 0.45361328125, "rewards/rejected": -2.3056640625, "step": 188 }, { "epoch": 0.24192, "grad_norm": 13.116768076581428, "learning_rate": 6.584414036623496e-07, "logits/chosen": -0.89697265625, "logits/rejected": -0.98388671875, "logps/chosen": -494.0, "logps/rejected": -540.0, "loss": 0.5309, "rewards/accuracies": 0.75, "rewards/chosen": -1.63671875, "rewards/margins": 0.5660400390625, "rewards/rejected": -2.203125, "step": 189 }, { "epoch": 0.2432, "grad_norm": 9.963354836403468, "learning_rate": 6.576980276982832e-07, "logits/chosen": -0.82666015625, "logits/rejected": -0.91162109375, "logps/chosen": -469.5, "logps/rejected": -498.25, "loss": 0.5733, "rewards/accuracies": 0.703125, "rewards/chosen": -1.67578125, "rewards/margins": 0.40509033203125, "rewards/rejected": -2.08203125, "step": 190 }, { "epoch": 0.24448, "grad_norm": 9.679252853242984, "learning_rate": 6.569484893395527e-07, "logits/chosen": -0.92138671875, "logits/rejected": -0.974609375, "logps/chosen": -504.75, "logps/rejected": -556.5, "loss": 0.559, "rewards/accuracies": 0.71875, "rewards/chosen": -1.8671875, "rewards/margins": 0.467041015625, "rewards/rejected": -2.333984375, "step": 191 }, { "epoch": 0.24576, "grad_norm": 11.180357368507915, "learning_rate": 6.561928035974705e-07, "logits/chosen": -0.875, "logits/rejected": -0.89599609375, "logps/chosen": -547.25, "logps/rejected": -599.25, "loss": 0.5339, "rewards/accuracies": 0.765625, "rewards/chosen": -2.015625, "rewards/margins": 0.6204833984375, "rewards/rejected": -2.634765625, "step": 192 }, { "epoch": 0.24704, "grad_norm": 11.453973983300482, "learning_rate": 6.55430985606465e-07, "logits/chosen": -0.84228515625, "logits/rejected": -0.87890625, "logps/chosen": -550.25, "logps/rejected": -584.5, "loss": 0.5265, "rewards/accuracies": 0.734375, "rewards/chosen": -2.18359375, "rewards/margins": 0.672607421875, "rewards/rejected": -2.857421875, "step": 193 }, { "epoch": 0.24832, "grad_norm": 19.413502192585955, "learning_rate": 6.546630506237778e-07, "logits/chosen": -0.7080078125, "logits/rejected": -0.7685546875, "logps/chosen": -565.0, "logps/rejected": -626.75, "loss": 0.5264, "rewards/accuracies": 0.75, "rewards/chosen": -2.2333984375, "rewards/margins": 0.642578125, "rewards/rejected": -2.873046875, "step": 194 }, { "epoch": 0.2496, "grad_norm": 11.99148623186324, "learning_rate": 6.538890140291578e-07, "logits/chosen": -0.73046875, "logits/rejected": -0.75048828125, "logps/chosen": -602.25, "logps/rejected": -628.75, "loss": 0.5333, "rewards/accuracies": 0.75, "rewards/chosen": -2.5947265625, "rewards/margins": 0.683349609375, "rewards/rejected": -3.27734375, "step": 195 }, { "epoch": 0.25088, "grad_norm": 10.31797612543643, "learning_rate": 6.531088913245536e-07, "logits/chosen": -0.64794921875, "logits/rejected": -0.716796875, "logps/chosen": -635.75, "logps/rejected": -716.0, "loss": 0.4923, "rewards/accuracies": 0.734375, "rewards/chosen": -3.15625, "rewards/margins": 0.8427734375, "rewards/rejected": -3.99609375, "step": 196 }, { "epoch": 0.25216, "grad_norm": 87.52079256871755, "learning_rate": 6.523226981338026e-07, "logits/chosen": -0.68359375, "logits/rejected": -0.70458984375, "logps/chosen": -738.0, "logps/rejected": -774.0, "loss": 0.6523, "rewards/accuracies": 0.6875, "rewards/chosen": -4.09765625, "rewards/margins": 0.696044921875, "rewards/rejected": -4.794921875, "step": 197 }, { "epoch": 0.25344, "grad_norm": 72.5293740432998, "learning_rate": 6.515304502023185e-07, "logits/chosen": -0.5673828125, "logits/rejected": -0.601318359375, "logps/chosen": -806.5, "logps/rejected": -903.0, "loss": 0.5506, "rewards/accuracies": 0.78125, "rewards/chosen": -4.96484375, "rewards/margins": 1.0079345703125, "rewards/rejected": -5.97265625, "step": 198 }, { "epoch": 0.25472, "grad_norm": 44.679991429690986, "learning_rate": 6.507321633967758e-07, "logits/chosen": -0.649658203125, "logits/rejected": -0.6904296875, "logps/chosen": -837.0, "logps/rejected": -915.0, "loss": 0.499, "rewards/accuracies": 0.7421875, "rewards/chosen": -4.8671875, "rewards/margins": 1.01953125, "rewards/rejected": -5.88671875, "step": 199 }, { "epoch": 0.256, "grad_norm": 59.397830971941254, "learning_rate": 6.499278537047919e-07, "logits/chosen": -0.69677734375, "logits/rejected": -0.72998046875, "logps/chosen": -831.0, "logps/rejected": -920.0, "loss": 0.5369, "rewards/accuracies": 0.7109375, "rewards/chosen": -5.015625, "rewards/margins": 0.87841796875, "rewards/rejected": -5.8828125, "step": 200 }, { "epoch": 0.256, "eval_logits/chosen": -0.6611328125, "eval_logits/rejected": -0.733642578125, "eval_logps/chosen": -767.75, "eval_logps/rejected": -825.0, "eval_loss": 0.6021875143051147, "eval_rewards/accuracies": 0.69921875, "eval_rewards/chosen": -4.474609375, "eval_rewards/margins": 0.75811767578125, "eval_rewards/rejected": -5.23046875, "eval_runtime": 27.6962, "eval_samples_per_second": 18.053, "eval_steps_per_second": 0.578, "step": 200 }, { "epoch": 0.25728, "grad_norm": 53.24908848058674, "learning_rate": 6.491175372346071e-07, "logits/chosen": -0.671875, "logits/rejected": -0.701171875, "logps/chosen": -777.5, "logps/rejected": -874.5, "loss": 0.5742, "rewards/accuracies": 0.7109375, "rewards/chosen": -4.5703125, "rewards/margins": 0.83984375, "rewards/rejected": -5.41015625, "step": 201 }, { "epoch": 0.25856, "grad_norm": 96.55947068044792, "learning_rate": 6.483012302147617e-07, "logits/chosen": -0.691162109375, "logits/rejected": -0.73388671875, "logps/chosen": -718.0, "logps/rejected": -742.0, "loss": 0.715, "rewards/accuracies": 0.6171875, "rewards/chosen": -3.8984375, "rewards/margins": 0.52960205078125, "rewards/rejected": -4.42578125, "step": 202 }, { "epoch": 0.25984, "grad_norm": 11.651199302119567, "learning_rate": 6.474789489937715e-07, "logits/chosen": -0.7294921875, "logits/rejected": -0.79931640625, "logps/chosen": -654.0, "logps/rejected": -739.0, "loss": 0.4917, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.275390625, "rewards/margins": 0.88525390625, "rewards/rejected": -4.16015625, "step": 203 }, { "epoch": 0.26112, "grad_norm": 22.98647680803771, "learning_rate": 6.466507100397998e-07, "logits/chosen": -0.763671875, "logits/rejected": -0.79931640625, "logps/chosen": -605.0, "logps/rejected": -657.5, "loss": 0.5948, "rewards/accuracies": 0.6484375, "rewards/chosen": -2.662109375, "rewards/margins": 0.5657958984375, "rewards/rejected": -3.23046875, "step": 204 }, { "epoch": 0.2624, "grad_norm": 8.334836762281965, "learning_rate": 6.458165299403282e-07, "logits/chosen": -0.702880859375, "logits/rejected": -0.7275390625, "logps/chosen": -576.0, "logps/rejected": -620.5, "loss": 0.5457, "rewards/accuracies": 0.78125, "rewards/chosen": -2.412109375, "rewards/margins": 0.678466796875, "rewards/rejected": -3.08984375, "step": 205 }, { "epoch": 0.26368, "grad_norm": 8.43068729699347, "learning_rate": 6.449764254018236e-07, "logits/chosen": -0.8203125, "logits/rejected": -0.89306640625, "logps/chosen": -557.5, "logps/rejected": -619.5, "loss": 0.5399, "rewards/accuracies": 0.703125, "rewards/chosen": -2.326171875, "rewards/margins": 0.57373046875, "rewards/rejected": -2.8984375, "step": 206 }, { "epoch": 0.26496, "grad_norm": 15.681934350374608, "learning_rate": 6.441304132494045e-07, "logits/chosen": -0.89111328125, "logits/rejected": -0.939453125, "logps/chosen": -506.5, "logps/rejected": -534.75, "loss": 0.527, "rewards/accuracies": 0.7734375, "rewards/chosen": -1.857421875, "rewards/margins": 0.568603515625, "rewards/rejected": -2.427734375, "step": 207 }, { "epoch": 0.26624, "grad_norm": 9.604421155152727, "learning_rate": 6.432785104265033e-07, "logits/chosen": -0.9912109375, "logits/rejected": -1.02783203125, "logps/chosen": -475.25, "logps/rejected": -503.25, "loss": 0.5645, "rewards/accuracies": 0.6875, "rewards/chosen": -1.501953125, "rewards/margins": 0.432373046875, "rewards/rejected": -1.9345703125, "step": 208 }, { "epoch": 0.26752, "grad_norm": 14.360891651779323, "learning_rate": 6.424207339945278e-07, "logits/chosen": -0.982421875, "logits/rejected": -1.02734375, "logps/chosen": -465.75, "logps/rejected": -502.25, "loss": 0.5318, "rewards/accuracies": 0.734375, "rewards/chosen": -1.5576171875, "rewards/margins": 0.544677734375, "rewards/rejected": -2.1025390625, "step": 209 }, { "epoch": 0.2688, "grad_norm": 13.445955498385377, "learning_rate": 6.41557101132518e-07, "logits/chosen": -0.91162109375, "logits/rejected": -0.9501953125, "logps/chosen": -472.5, "logps/rejected": -518.5, "loss": 0.5033, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5302734375, "rewards/margins": 0.611328125, "rewards/rejected": -2.142578125, "step": 210 }, { "epoch": 0.27008, "grad_norm": 12.723099981494066, "learning_rate": 6.406876291368041e-07, "logits/chosen": -0.9912109375, "logits/rejected": -1.0634765625, "logps/chosen": -497.0, "logps/rejected": -556.5, "loss": 0.4932, "rewards/accuracies": 0.78125, "rewards/chosen": -1.623046875, "rewards/margins": 0.616455078125, "rewards/rejected": -2.2373046875, "step": 211 }, { "epoch": 0.27136, "grad_norm": 9.020187932187099, "learning_rate": 6.398123354206582e-07, "logits/chosen": -0.91845703125, "logits/rejected": -0.98974609375, "logps/chosen": -500.75, "logps/rejected": -560.0, "loss": 0.4941, "rewards/accuracies": 0.7734375, "rewards/chosen": -1.8056640625, "rewards/margins": 0.711669921875, "rewards/rejected": -2.517578125, "step": 212 }, { "epoch": 0.27264, "grad_norm": 8.556186928836153, "learning_rate": 6.389312375139469e-07, "logits/chosen": -0.9033203125, "logits/rejected": -0.96142578125, "logps/chosen": -594.25, "logps/rejected": -617.5, "loss": 0.555, "rewards/accuracies": 0.7421875, "rewards/chosen": -2.0859375, "rewards/margins": 0.615234375, "rewards/rejected": -2.69921875, "step": 213 }, { "epoch": 0.27392, "grad_norm": 13.39244807934873, "learning_rate": 6.380443530627797e-07, "logits/chosen": -0.85986328125, "logits/rejected": -0.91552734375, "logps/chosen": -502.25, "logps/rejected": -560.75, "loss": 0.4989, "rewards/accuracies": 0.75, "rewards/chosen": -1.9384765625, "rewards/margins": 0.70263671875, "rewards/rejected": -2.63671875, "step": 214 }, { "epoch": 0.2752, "grad_norm": 8.777635188116983, "learning_rate": 6.371516998291552e-07, "logits/chosen": -0.80126953125, "logits/rejected": -0.912109375, "logps/chosen": -518.5, "logps/rejected": -607.5, "loss": 0.4933, "rewards/accuracies": 0.7578125, "rewards/chosen": -2.1640625, "rewards/margins": 0.726806640625, "rewards/rejected": -2.890625, "step": 215 }, { "epoch": 0.27648, "grad_norm": 16.262313588962954, "learning_rate": 6.362532956906059e-07, "logits/chosen": -0.7509765625, "logits/rejected": -0.8251953125, "logps/chosen": -543.0, "logps/rejected": -592.0, "loss": 0.4902, "rewards/accuracies": 0.8125, "rewards/chosen": -2.212890625, "rewards/margins": 0.656982421875, "rewards/rejected": -2.87109375, "step": 216 }, { "epoch": 0.27776, "grad_norm": 11.250156251667049, "learning_rate": 6.353491586398404e-07, "logits/chosen": -0.720703125, "logits/rejected": -0.779296875, "logps/chosen": -568.0, "logps/rejected": -639.5, "loss": 0.5265, "rewards/accuracies": 0.7265625, "rewards/chosen": -2.626953125, "rewards/margins": 0.7470703125, "rewards/rejected": -3.373046875, "step": 217 }, { "epoch": 0.27904, "grad_norm": 9.749001446309522, "learning_rate": 6.344393067843826e-07, "logits/chosen": -0.6494140625, "logits/rejected": -0.721923828125, "logps/chosen": -622.25, "logps/rejected": -703.5, "loss": 0.5042, "rewards/accuracies": 0.75, "rewards/chosen": -3.078125, "rewards/margins": 0.8087158203125, "rewards/rejected": -3.888671875, "step": 218 }, { "epoch": 0.28032, "grad_norm": 13.338732962845118, "learning_rate": 6.335237583462083e-07, "logits/chosen": -0.67822265625, "logits/rejected": -0.67333984375, "logps/chosen": -700.0, "logps/rejected": -834.5, "loss": 0.5743, "rewards/accuracies": 0.6875, "rewards/chosen": -3.390625, "rewards/margins": 1.11883544921875, "rewards/rejected": -4.517578125, "step": 219 }, { "epoch": 0.2816, "grad_norm": 23.59756440330084, "learning_rate": 6.326025316613823e-07, "logits/chosen": -0.515380859375, "logits/rejected": -0.543701171875, "logps/chosen": -666.0, "logps/rejected": -739.5, "loss": 0.5342, "rewards/accuracies": 0.703125, "rewards/chosen": -3.46484375, "rewards/margins": 0.9072265625, "rewards/rejected": -4.375, "step": 220 }, { "epoch": 0.28288, "grad_norm": 38.39849197226403, "learning_rate": 6.316756451796894e-07, "logits/chosen": -0.611572265625, "logits/rejected": -0.69287109375, "logps/chosen": -653.0, "logps/rejected": -741.5, "loss": 0.5483, "rewards/accuracies": 0.734375, "rewards/chosen": -3.568359375, "rewards/margins": 0.7652587890625, "rewards/rejected": -4.328125, "step": 221 }, { "epoch": 0.28416, "grad_norm": 15.339038520285099, "learning_rate": 6.307431174642653e-07, "logits/chosen": -0.654296875, "logits/rejected": -0.69580078125, "logps/chosen": -693.0, "logps/rejected": -754.5, "loss": 0.5019, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.5, "rewards/margins": 0.99560546875, "rewards/rejected": -4.494140625, "step": 222 }, { "epoch": 0.28544, "grad_norm": 8.374650607981456, "learning_rate": 6.298049671912254e-07, "logits/chosen": -0.57861328125, "logits/rejected": -0.62353515625, "logps/chosen": -627.0, "logps/rejected": -704.5, "loss": 0.4972, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.31640625, "rewards/margins": 0.92919921875, "rewards/rejected": -4.244140625, "step": 223 }, { "epoch": 0.28672, "grad_norm": 45.963116084804696, "learning_rate": 6.2886121314929e-07, "logits/chosen": -0.609619140625, "logits/rejected": -0.6494140625, "logps/chosen": -639.5, "logps/rejected": -666.25, "loss": 0.7107, "rewards/accuracies": 0.59375, "rewards/chosen": -3.390625, "rewards/margins": 0.4444580078125, "rewards/rejected": -3.8359375, "step": 224 }, { "epoch": 0.288, "grad_norm": 7.958096573222809, "learning_rate": 6.279118742394089e-07, "logits/chosen": -0.588623046875, "logits/rejected": -0.64306640625, "logps/chosen": -624.5, "logps/rejected": -695.5, "loss": 0.4767, "rewards/accuracies": 0.7890625, "rewards/chosen": -2.857421875, "rewards/margins": 0.83056640625, "rewards/rejected": -3.689453125, "step": 225 }, { "epoch": 0.28928, "grad_norm": 9.611100210392358, "learning_rate": 6.269569694743816e-07, "logits/chosen": -0.6396484375, "logits/rejected": -0.68701171875, "logps/chosen": -564.5, "logps/rejected": -650.0, "loss": 0.4786, "rewards/accuracies": 0.8203125, "rewards/chosen": -2.638671875, "rewards/margins": 0.86083984375, "rewards/rejected": -3.49609375, "step": 226 }, { "epoch": 0.29056, "grad_norm": 9.097155318325184, "learning_rate": 6.259965179784779e-07, "logits/chosen": -0.71484375, "logits/rejected": -0.76611328125, "logps/chosen": -612.0, "logps/rejected": -672.5, "loss": 0.4834, "rewards/accuracies": 0.734375, "rewards/chosen": -2.845703125, "rewards/margins": 0.808837890625, "rewards/rejected": -3.658203125, "step": 227 }, { "epoch": 0.29184, "grad_norm": 11.493217517064283, "learning_rate": 6.250305389870541e-07, "logits/chosen": -0.7099609375, "logits/rejected": -0.77001953125, "logps/chosen": -602.5, "logps/rejected": -667.5, "loss": 0.5058, "rewards/accuracies": 0.734375, "rewards/chosen": -2.642578125, "rewards/margins": 0.72119140625, "rewards/rejected": -3.36328125, "step": 228 }, { "epoch": 0.29312, "grad_norm": 9.437777289578113, "learning_rate": 6.240590518461678e-07, "logits/chosen": -0.62939453125, "logits/rejected": -0.7001953125, "logps/chosen": -529.75, "logps/rejected": -617.0, "loss": 0.4882, "rewards/accuracies": 0.8046875, "rewards/chosen": -2.3828125, "rewards/margins": 0.7900390625, "rewards/rejected": -3.171875, "step": 229 }, { "epoch": 0.2944, "grad_norm": 8.312134657078936, "learning_rate": 6.230820760121904e-07, "logits/chosen": -0.578369140625, "logits/rejected": -0.627197265625, "logps/chosen": -559.75, "logps/rejected": -656.5, "loss": 0.4768, "rewards/accuracies": 0.765625, "rewards/chosen": -2.5625, "rewards/margins": 0.951416015625, "rewards/rejected": -3.51171875, "step": 230 }, { "epoch": 0.29568, "grad_norm": 11.986917072887415, "learning_rate": 6.220996310514181e-07, "logits/chosen": -0.5537109375, "logits/rejected": -0.60888671875, "logps/chosen": -616.0, "logps/rejected": -668.5, "loss": 0.576, "rewards/accuracies": 0.703125, "rewards/chosen": -2.873046875, "rewards/margins": 0.666015625, "rewards/rejected": -3.537109375, "step": 231 }, { "epoch": 0.29696, "grad_norm": 12.172332354528226, "learning_rate": 6.21111736639679e-07, "logits/chosen": -0.610595703125, "logits/rejected": -0.695068359375, "logps/chosen": -535.0, "logps/rejected": -651.0, "loss": 0.4534, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4560546875, "rewards/margins": 0.952392578125, "rewards/rejected": -3.408203125, "step": 232 }, { "epoch": 0.29824, "grad_norm": 14.200772198441506, "learning_rate": 6.201184125619403e-07, "logits/chosen": -0.520263671875, "logits/rejected": -0.56201171875, "logps/chosen": -595.0, "logps/rejected": -654.0, "loss": 0.4812, "rewards/accuracies": 0.765625, "rewards/chosen": -2.56640625, "rewards/margins": 0.9599609375, "rewards/rejected": -3.52734375, "step": 233 }, { "epoch": 0.29952, "grad_norm": 13.457335334304465, "learning_rate": 6.191196787119104e-07, "logits/chosen": -0.5048828125, "logits/rejected": -0.588134765625, "logps/chosen": -642.5, "logps/rejected": -732.5, "loss": 0.4973, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.150390625, "rewards/margins": 0.856689453125, "rewards/rejected": -4.005859375, "step": 234 }, { "epoch": 0.3008, "grad_norm": 10.321622368203395, "learning_rate": 6.181155550916422e-07, "logits/chosen": -0.4512939453125, "logits/rejected": -0.501708984375, "logps/chosen": -647.5, "logps/rejected": -769.5, "loss": 0.4746, "rewards/accuracies": 0.75, "rewards/chosen": -3.408203125, "rewards/margins": 0.9677734375, "rewards/rejected": -4.37109375, "step": 235 }, { "epoch": 0.30208, "grad_norm": 8.858095075192827, "learning_rate": 6.171060618111317e-07, "logits/chosen": -0.44775390625, "logits/rejected": -0.4814453125, "logps/chosen": -666.0, "logps/rejected": -779.0, "loss": 0.4962, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.6328125, "rewards/margins": 0.91748046875, "rewards/rejected": -4.556640625, "step": 236 }, { "epoch": 0.30336, "grad_norm": 18.70168630039342, "learning_rate": 6.160912190879145e-07, "logits/chosen": -0.5400390625, "logits/rejected": -0.544921875, "logps/chosen": -651.0, "logps/rejected": -721.5, "loss": 0.5413, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.35546875, "rewards/margins": 0.91790771484375, "rewards/rejected": -4.275390625, "step": 237 }, { "epoch": 0.30464, "grad_norm": 9.749493306506885, "learning_rate": 6.150710472466629e-07, "logits/chosen": -0.45068359375, "logits/rejected": -0.485595703125, "logps/chosen": -638.0, "logps/rejected": -719.5, "loss": 0.4751, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.2265625, "rewards/margins": 0.86181640625, "rewards/rejected": -4.0859375, "step": 238 }, { "epoch": 0.30592, "grad_norm": 20.80698350471745, "learning_rate": 6.140455667187765e-07, "logits/chosen": -0.46551513671875, "logits/rejected": -0.50634765625, "logps/chosen": -761.0, "logps/rejected": -825.0, "loss": 0.5784, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.84375, "rewards/margins": 0.728759765625, "rewards/rejected": -4.578125, "step": 239 }, { "epoch": 0.3072, "grad_norm": 8.953639678478353, "learning_rate": 6.13014798041975e-07, "logits/chosen": -0.417236328125, "logits/rejected": -0.4698486328125, "logps/chosen": -658.0, "logps/rejected": -751.0, "loss": 0.5047, "rewards/accuracies": 0.6953125, "rewards/chosen": -3.318359375, "rewards/margins": 0.96484375, "rewards/rejected": -4.283203125, "step": 240 }, { "epoch": 0.30848, "grad_norm": 10.845686942793368, "learning_rate": 6.119787618598854e-07, "logits/chosen": -0.4853668212890625, "logits/rejected": -0.5223388671875, "logps/chosen": -655.75, "logps/rejected": -701.0, "loss": 0.5384, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.103515625, "rewards/margins": 0.777099609375, "rewards/rejected": -3.880859375, "step": 241 }, { "epoch": 0.30976, "grad_norm": 9.846249269177688, "learning_rate": 6.109374789216295e-07, "logits/chosen": -0.4921875, "logits/rejected": -0.587646484375, "logps/chosen": -609.0, "logps/rejected": -722.0, "loss": 0.4894, "rewards/accuracies": 0.7265625, "rewards/chosen": -2.994140625, "rewards/margins": 0.99072265625, "rewards/rejected": -3.9765625, "step": 242 }, { "epoch": 0.31104, "grad_norm": 8.39816655306362, "learning_rate": 6.098909700814082e-07, "logits/chosen": -0.564208984375, "logits/rejected": -0.618408203125, "logps/chosen": -498.25, "logps/rejected": -594.25, "loss": 0.523, "rewards/accuracies": 0.75, "rewards/chosen": -2.3525390625, "rewards/margins": 0.72216796875, "rewards/rejected": -3.078125, "step": 243 }, { "epoch": 0.31232, "grad_norm": 15.315342312273753, "learning_rate": 6.08839256298083e-07, "logits/chosen": -0.510986328125, "logits/rejected": -0.580322265625, "logps/chosen": -574.75, "logps/rejected": -668.0, "loss": 0.4818, "rewards/accuracies": 0.765625, "rewards/chosen": -2.73046875, "rewards/margins": 0.8583984375, "rewards/rejected": -3.591796875, "step": 244 }, { "epoch": 0.3136, "grad_norm": 11.767718051696765, "learning_rate": 6.077823586347579e-07, "logits/chosen": -0.48681640625, "logits/rejected": -0.5322265625, "logps/chosen": -561.0, "logps/rejected": -622.0, "loss": 0.5154, "rewards/accuracies": 0.734375, "rewards/chosen": -2.603515625, "rewards/margins": 0.7664794921875, "rewards/rejected": -3.3671875, "step": 245 }, { "epoch": 0.31488, "grad_norm": 9.13068372680758, "learning_rate": 6.067202982583559e-07, "logits/chosen": -0.5447998046875, "logits/rejected": -0.6015625, "logps/chosen": -654.5, "logps/rejected": -729.0, "loss": 0.5201, "rewards/accuracies": 0.765625, "rewards/chosen": -2.890625, "rewards/margins": 0.822998046875, "rewards/rejected": -3.7109375, "step": 246 }, { "epoch": 0.31616, "grad_norm": 8.828236079236284, "learning_rate": 6.056530964391961e-07, "logits/chosen": -0.55419921875, "logits/rejected": -0.6162109375, "logps/chosen": -605.75, "logps/rejected": -679.5, "loss": 0.4972, "rewards/accuracies": 0.7421875, "rewards/chosen": -2.8447265625, "rewards/margins": 0.84423828125, "rewards/rejected": -3.6875, "step": 247 }, { "epoch": 0.31744, "grad_norm": 9.350860751825014, "learning_rate": 6.04580774550567e-07, "logits/chosen": -0.501220703125, "logits/rejected": -0.592529296875, "logps/chosen": -632.5, "logps/rejected": -708.0, "loss": 0.4995, "rewards/accuracies": 0.7890625, "rewards/chosen": -2.921875, "rewards/margins": 0.94384765625, "rewards/rejected": -3.8671875, "step": 248 }, { "epoch": 0.31872, "grad_norm": 9.838783187292746, "learning_rate": 6.035033540682994e-07, "logits/chosen": -0.5537109375, "logits/rejected": -0.60009765625, "logps/chosen": -606.0, "logps/rejected": -660.5, "loss": 0.5366, "rewards/accuracies": 0.6796875, "rewards/chosen": -2.8203125, "rewards/margins": 0.7607421875, "rewards/rejected": -3.580078125, "step": 249 }, { "epoch": 0.32, "grad_norm": 10.098149943213269, "learning_rate": 6.02420856570335e-07, "logits/chosen": -0.512939453125, "logits/rejected": -0.553955078125, "logps/chosen": -584.0, "logps/rejected": -677.5, "loss": 0.4851, "rewards/accuracies": 0.765625, "rewards/chosen": -2.765625, "rewards/margins": 0.787109375, "rewards/rejected": -3.5546875, "step": 250 }, { "epoch": 0.32128, "grad_norm": 11.938073471759608, "learning_rate": 6.013333037362958e-07, "logits/chosen": -0.53125, "logits/rejected": -0.58447265625, "logps/chosen": -637.0, "logps/rejected": -707.0, "loss": 0.5481, "rewards/accuracies": 0.671875, "rewards/chosen": -2.98046875, "rewards/margins": 0.731689453125, "rewards/rejected": -3.71484375, "step": 251 }, { "epoch": 0.32256, "grad_norm": 8.629915885629975, "learning_rate": 6.002407173470485e-07, "logits/chosen": -0.52685546875, "logits/rejected": -0.5693359375, "logps/chosen": -647.0, "logps/rejected": -739.5, "loss": 0.4979, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.0390625, "rewards/margins": 0.815185546875, "rewards/rejected": -3.85546875, "step": 252 }, { "epoch": 0.32384, "grad_norm": 15.164274031135337, "learning_rate": 5.991431192842692e-07, "logits/chosen": -0.47216796875, "logits/rejected": -0.52001953125, "logps/chosen": -640.5, "logps/rejected": -726.0, "loss": 0.455, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.05859375, "rewards/margins": 0.89501953125, "rewards/rejected": -3.951171875, "step": 253 }, { "epoch": 0.32512, "grad_norm": 13.467159822471947, "learning_rate": 5.980405315300045e-07, "logits/chosen": -0.395751953125, "logits/rejected": -0.4453125, "logps/chosen": -634.5, "logps/rejected": -718.0, "loss": 0.5203, "rewards/accuracies": 0.7109375, "rewards/chosen": -3.119140625, "rewards/margins": 0.7841796875, "rewards/rejected": -3.90625, "step": 254 }, { "epoch": 0.3264, "grad_norm": 9.015056279995315, "learning_rate": 5.969329761662318e-07, "logits/chosen": -0.38385009765625, "logits/rejected": -0.4581298828125, "logps/chosen": -625.0, "logps/rejected": -722.5, "loss": 0.5035, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.091796875, "rewards/margins": 0.9658203125, "rewards/rejected": -4.05859375, "step": 255 }, { "epoch": 0.32768, "grad_norm": 9.520970646405912, "learning_rate": 5.958204753744171e-07, "logits/chosen": -0.4287109375, "logits/rejected": -0.486572265625, "logps/chosen": -615.5, "logps/rejected": -700.0, "loss": 0.5045, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.001953125, "rewards/margins": 0.9034423828125, "rewards/rejected": -3.90625, "step": 256 }, { "epoch": 0.32896, "grad_norm": 22.42105812552605, "learning_rate": 5.9470305143507e-07, "logits/chosen": -0.405029296875, "logits/rejected": -0.456787109375, "logps/chosen": -684.0, "logps/rejected": -746.0, "loss": 0.6086, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.5625, "rewards/margins": 0.7578125, "rewards/rejected": -4.3203125, "step": 257 }, { "epoch": 0.33024, "grad_norm": 20.00218984655251, "learning_rate": 5.935807267272985e-07, "logits/chosen": -0.4755859375, "logits/rejected": -0.497802734375, "logps/chosen": -668.0, "logps/rejected": -725.5, "loss": 0.5487, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.1875, "rewards/margins": 0.792236328125, "rewards/rejected": -3.978515625, "step": 258 }, { "epoch": 0.33152, "grad_norm": 10.403435140293421, "learning_rate": 5.924535237283598e-07, "logits/chosen": -0.47998046875, "logits/rejected": -0.528076171875, "logps/chosen": -617.5, "logps/rejected": -688.0, "loss": 0.4796, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8671875, "rewards/margins": 0.8173828125, "rewards/rejected": -3.68359375, "step": 259 }, { "epoch": 0.3328, "grad_norm": 8.359013118624633, "learning_rate": 5.913214650132112e-07, "logits/chosen": -0.43115234375, "logits/rejected": -0.514892578125, "logps/chosen": -580.75, "logps/rejected": -687.0, "loss": 0.4962, "rewards/accuracies": 0.734375, "rewards/chosen": -2.859375, "rewards/margins": 0.9130859375, "rewards/rejected": -3.76953125, "step": 260 }, { "epoch": 0.33408, "grad_norm": 9.989053690781105, "learning_rate": 5.901845732540568e-07, "logits/chosen": -0.488037109375, "logits/rejected": -0.51708984375, "logps/chosen": -659.0, "logps/rejected": -727.0, "loss": 0.4965, "rewards/accuracies": 0.75, "rewards/chosen": -3.03125, "rewards/margins": 1.017578125, "rewards/rejected": -4.044921875, "step": 261 }, { "epoch": 0.33536, "grad_norm": 9.229777464931137, "learning_rate": 5.890428712198945e-07, "logits/chosen": -0.4755859375, "logits/rejected": -0.4970703125, "logps/chosen": -673.5, "logps/rejected": -766.0, "loss": 0.4583, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.19921875, "rewards/margins": 1.03955078125, "rewards/rejected": -4.2421875, "step": 262 }, { "epoch": 0.33664, "grad_norm": 11.937311630757574, "learning_rate": 5.878963817760597e-07, "logits/chosen": -0.454833984375, "logits/rejected": -0.519775390625, "logps/chosen": -642.5, "logps/rejected": -735.5, "loss": 0.5139, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.205078125, "rewards/margins": 0.8829345703125, "rewards/rejected": -4.0859375, "step": 263 }, { "epoch": 0.33792, "grad_norm": 10.230639726258993, "learning_rate": 5.867451278837666e-07, "logits/chosen": -0.355499267578125, "logits/rejected": -0.408599853515625, "logps/chosen": -642.5, "logps/rejected": -704.0, "loss": 0.5486, "rewards/accuracies": 0.7109375, "rewards/chosen": -3.228515625, "rewards/margins": 0.77197265625, "rewards/rejected": -3.998046875, "step": 264 }, { "epoch": 0.3392, "grad_norm": 13.416289127212499, "learning_rate": 5.855891325996495e-07, "logits/chosen": -0.41424560546875, "logits/rejected": -0.42974853515625, "logps/chosen": -658.5, "logps/rejected": -730.0, "loss": 0.5526, "rewards/accuracies": 0.6953125, "rewards/chosen": -3.15625, "rewards/margins": 0.790771484375, "rewards/rejected": -3.943359375, "step": 265 }, { "epoch": 0.34048, "grad_norm": 12.108152563268916, "learning_rate": 5.844284190753003e-07, "logits/chosen": -0.4765625, "logits/rejected": -0.50439453125, "logps/chosen": -636.5, "logps/rejected": -707.0, "loss": 0.5176, "rewards/accuracies": 0.7734375, "rewards/chosen": -2.982421875, "rewards/margins": 0.932373046875, "rewards/rejected": -3.916015625, "step": 266 }, { "epoch": 0.34176, "grad_norm": 9.58986611812255, "learning_rate": 5.83263010556805e-07, "logits/chosen": -0.508544921875, "logits/rejected": -0.589599609375, "logps/chosen": -634.0, "logps/rejected": -738.5, "loss": 0.5, "rewards/accuracies": 0.71875, "rewards/chosen": -3.123046875, "rewards/margins": 0.942626953125, "rewards/rejected": -4.068359375, "step": 267 }, { "epoch": 0.34304, "grad_norm": 10.292439732995158, "learning_rate": 5.820929303842783e-07, "logits/chosen": -0.5439453125, "logits/rejected": -0.611572265625, "logps/chosen": -568.25, "logps/rejected": -657.5, "loss": 0.4522, "rewards/accuracies": 0.7734375, "rewards/chosen": -2.576171875, "rewards/margins": 1.00390625, "rewards/rejected": -3.58203125, "step": 268 }, { "epoch": 0.34432, "grad_norm": 8.040921361038157, "learning_rate": 5.809182019913959e-07, "logits/chosen": -0.57275390625, "logits/rejected": -0.607666015625, "logps/chosen": -569.75, "logps/rejected": -623.0, "loss": 0.5395, "rewards/accuracies": 0.703125, "rewards/chosen": -2.421875, "rewards/margins": 0.765380859375, "rewards/rejected": -3.1875, "step": 269 }, { "epoch": 0.3456, "grad_norm": 12.336782262534681, "learning_rate": 5.797388489049254e-07, "logits/chosen": -0.59423828125, "logits/rejected": -0.62109375, "logps/chosen": -616.0, "logps/rejected": -659.5, "loss": 0.5252, "rewards/accuracies": 0.75, "rewards/chosen": -2.69140625, "rewards/margins": 0.767822265625, "rewards/rejected": -3.45703125, "step": 270 }, { "epoch": 0.34688, "grad_norm": 14.10037159945176, "learning_rate": 5.785548947442547e-07, "logits/chosen": -0.578125, "logits/rejected": -0.60888671875, "logps/chosen": -575.5, "logps/rejected": -677.5, "loss": 0.444, "rewards/accuracies": 0.78125, "rewards/chosen": -2.5703125, "rewards/margins": 0.984375, "rewards/rejected": -3.55859375, "step": 271 }, { "epoch": 0.34816, "grad_norm": 10.631299174291128, "learning_rate": 5.773663632209201e-07, "logits/chosen": -0.559814453125, "logits/rejected": -0.641357421875, "logps/chosen": -580.5, "logps/rejected": -681.5, "loss": 0.4758, "rewards/accuracies": 0.7734375, "rewards/chosen": -2.64453125, "rewards/margins": 0.9033203125, "rewards/rejected": -3.546875, "step": 272 }, { "epoch": 0.34944, "grad_norm": 9.172576313947784, "learning_rate": 5.7617327813813e-07, "logits/chosen": -0.492431640625, "logits/rejected": -0.562744140625, "logps/chosen": -618.0, "logps/rejected": -672.5, "loss": 0.5197, "rewards/accuracies": 0.78125, "rewards/chosen": -2.921875, "rewards/margins": 0.7423095703125, "rewards/rejected": -3.666015625, "step": 273 }, { "epoch": 0.35072, "grad_norm": 10.958422585425877, "learning_rate": 5.749756633902887e-07, "logits/chosen": -0.531494140625, "logits/rejected": -0.56201171875, "logps/chosen": -585.75, "logps/rejected": -664.5, "loss": 0.4871, "rewards/accuracies": 0.765625, "rewards/chosen": -2.65625, "rewards/margins": 0.888671875, "rewards/rejected": -3.548828125, "step": 274 }, { "epoch": 0.352, "grad_norm": 11.430707520592867, "learning_rate": 5.737735429625186e-07, "logits/chosen": -0.546142578125, "logits/rejected": -0.597900390625, "logps/chosen": -664.0, "logps/rejected": -724.5, "loss": 0.5227, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.134765625, "rewards/margins": 0.956787109375, "rewards/rejected": -4.08984375, "step": 275 }, { "epoch": 0.35328, "grad_norm": 16.483350734581908, "learning_rate": 5.725669409301782e-07, "logits/chosen": -0.44775390625, "logits/rejected": -0.451416015625, "logps/chosen": -642.0, "logps/rejected": -748.5, "loss": 0.5278, "rewards/accuracies": 0.7109375, "rewards/chosen": -3.234375, "rewards/margins": 0.9853515625, "rewards/rejected": -4.224609375, "step": 276 }, { "epoch": 0.35456, "grad_norm": 9.95398312986462, "learning_rate": 5.71355881458382e-07, "logits/chosen": -0.474609375, "logits/rejected": -0.506591796875, "logps/chosen": -635.0, "logps/rejected": -716.5, "loss": 0.4857, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.126953125, "rewards/margins": 0.912109375, "rewards/rejected": -4.033203125, "step": 277 }, { "epoch": 0.35584, "grad_norm": 18.865132248923636, "learning_rate": 5.701403888015149e-07, "logits/chosen": -0.446044921875, "logits/rejected": -0.47314453125, "logps/chosen": -639.25, "logps/rejected": -730.0, "loss": 0.4663, "rewards/accuracies": 0.765625, "rewards/chosen": -3.19921875, "rewards/margins": 0.96240234375, "rewards/rejected": -4.16796875, "step": 278 }, { "epoch": 0.35712, "grad_norm": 9.213729341091112, "learning_rate": 5.689204873027471e-07, "logits/chosen": -0.38330078125, "logits/rejected": -0.398681640625, "logps/chosen": -657.0, "logps/rejected": -725.0, "loss": 0.502, "rewards/accuracies": 0.796875, "rewards/chosen": -3.232421875, "rewards/margins": 0.822998046875, "rewards/rejected": -4.0546875, "step": 279 }, { "epoch": 0.3584, "grad_norm": 19.70630286015966, "learning_rate": 5.676962013935464e-07, "logits/chosen": -0.4217529296875, "logits/rejected": -0.4654541015625, "logps/chosen": -679.5, "logps/rejected": -734.5, "loss": 0.5831, "rewards/accuracies": 0.6953125, "rewards/chosen": -3.66015625, "rewards/margins": 0.718505859375, "rewards/rejected": -4.37890625, "step": 280 }, { "epoch": 0.35968, "grad_norm": 22.464253350390536, "learning_rate": 5.664675555931892e-07, "logits/chosen": -0.401611328125, "logits/rejected": -0.404296875, "logps/chosen": -713.0, "logps/rejected": -825.0, "loss": 0.572, "rewards/accuracies": 0.6953125, "rewards/chosen": -4.025390625, "rewards/margins": 0.773193359375, "rewards/rejected": -4.796875, "step": 281 }, { "epoch": 0.36096, "grad_norm": 10.334427726270604, "learning_rate": 5.652345745082692e-07, "logits/chosen": -0.4014892578125, "logits/rejected": -0.439208984375, "logps/chosen": -702.0, "logps/rejected": -764.0, "loss": 0.4943, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.63671875, "rewards/margins": 0.854736328125, "rewards/rejected": -4.494140625, "step": 282 }, { "epoch": 0.36224, "grad_norm": 8.836351837837654, "learning_rate": 5.639972828322043e-07, "logits/chosen": -0.3765869140625, "logits/rejected": -0.436767578125, "logps/chosen": -676.5, "logps/rejected": -752.0, "loss": 0.4468, "rewards/accuracies": 0.84375, "rewards/chosen": -3.45703125, "rewards/margins": 1.09326171875, "rewards/rejected": -4.55078125, "step": 283 }, { "epoch": 0.36352, "grad_norm": 36.01906152382893, "learning_rate": 5.627557053447426e-07, "logits/chosen": -0.3876953125, "logits/rejected": -0.413818359375, "logps/chosen": -689.0, "logps/rejected": -736.5, "loss": 0.6353, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.5625, "rewards/margins": 0.6876678466796875, "rewards/rejected": -4.24609375, "step": 284 }, { "epoch": 0.3648, "grad_norm": 10.074359287488177, "learning_rate": 5.615098669114664e-07, "logits/chosen": -0.3740234375, "logits/rejected": -0.421630859375, "logps/chosen": -666.0, "logps/rejected": -800.5, "loss": 0.4379, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.51953125, "rewards/margins": 1.00341796875, "rewards/rejected": -4.52734375, "step": 285 }, { "epoch": 0.36608, "grad_norm": 8.877186370667038, "learning_rate": 5.602597924832926e-07, "logits/chosen": -0.508056640625, "logits/rejected": -0.53857421875, "logps/chosen": -638.0, "logps/rejected": -740.5, "loss": 0.4867, "rewards/accuracies": 0.75, "rewards/chosen": -3.2109375, "rewards/margins": 1.00146484375, "rewards/rejected": -4.212890625, "step": 286 }, { "epoch": 0.36736, "grad_norm": 9.967314448363597, "learning_rate": 5.590055070959751e-07, "logits/chosen": -0.4375, "logits/rejected": -0.4688720703125, "logps/chosen": -672.0, "logps/rejected": -734.0, "loss": 0.4381, "rewards/accuracies": 0.8125, "rewards/chosen": -3.158203125, "rewards/margins": 1.140625, "rewards/rejected": -4.298828125, "step": 287 }, { "epoch": 0.36864, "grad_norm": 33.10004352946362, "learning_rate": 5.577470358696021e-07, "logits/chosen": -0.389404296875, "logits/rejected": -0.446044921875, "logps/chosen": -646.5, "logps/rejected": -739.5, "loss": 0.6066, "rewards/accuracies": 0.6953125, "rewards/chosen": -3.439453125, "rewards/margins": 0.7655029296875, "rewards/rejected": -4.203125, "step": 288 }, { "epoch": 0.36992, "grad_norm": 10.915408242599547, "learning_rate": 5.56484404008093e-07, "logits/chosen": -0.506103515625, "logits/rejected": -0.5328369140625, "logps/chosen": -618.5, "logps/rejected": -691.5, "loss": 0.4948, "rewards/accuracies": 0.75, "rewards/chosen": -2.96484375, "rewards/margins": 1.02490234375, "rewards/rejected": -3.990234375, "step": 289 }, { "epoch": 0.3712, "grad_norm": 10.911477941525613, "learning_rate": 5.552176367986944e-07, "logits/chosen": -0.579345703125, "logits/rejected": -0.63330078125, "logps/chosen": -630.5, "logps/rejected": -765.5, "loss": 0.501, "rewards/accuracies": 0.75, "rewards/chosen": -2.904296875, "rewards/margins": 0.831298828125, "rewards/rejected": -3.734375, "step": 290 }, { "epoch": 0.37248, "grad_norm": 13.643638150351943, "learning_rate": 5.539467596114729e-07, "logits/chosen": -0.488525390625, "logits/rejected": -0.54443359375, "logps/chosen": -586.5, "logps/rejected": -697.5, "loss": 0.4345, "rewards/accuracies": 0.796875, "rewards/chosen": -2.744140625, "rewards/margins": 1.025390625, "rewards/rejected": -3.76953125, "step": 291 }, { "epoch": 0.37376, "grad_norm": 10.440042665143498, "learning_rate": 5.526717978988076e-07, "logits/chosen": -0.615234375, "logits/rejected": -0.625, "logps/chosen": -635.0, "logps/rejected": -655.0, "loss": 0.5673, "rewards/accuracies": 0.7265625, "rewards/chosen": -2.779296875, "rewards/margins": 0.68798828125, "rewards/rejected": -3.470703125, "step": 292 }, { "epoch": 0.37504, "grad_norm": 13.718400637119657, "learning_rate": 5.513927771948797e-07, "logits/chosen": -0.5810546875, "logits/rejected": -0.61767578125, "logps/chosen": -562.5, "logps/rejected": -622.25, "loss": 0.5058, "rewards/accuracies": 0.796875, "rewards/chosen": -2.4091796875, "rewards/margins": 0.7276611328125, "rewards/rejected": -3.138671875, "step": 293 }, { "epoch": 0.37632, "grad_norm": 12.273740462313567, "learning_rate": 5.501097231151619e-07, "logits/chosen": -0.6591796875, "logits/rejected": -0.68896484375, "logps/chosen": -548.25, "logps/rejected": -623.0, "loss": 0.4711, "rewards/accuracies": 0.7734375, "rewards/chosen": -2.212890625, "rewards/margins": 0.90234375, "rewards/rejected": -3.115234375, "step": 294 }, { "epoch": 0.3776, "grad_norm": 12.632600487373818, "learning_rate": 5.488226613559045e-07, "logits/chosen": -0.608154296875, "logits/rejected": -0.649169921875, "logps/chosen": -562.75, "logps/rejected": -593.75, "loss": 0.4987, "rewards/accuracies": 0.796875, "rewards/chosen": -2.2451171875, "rewards/margins": 0.77099609375, "rewards/rejected": -3.017578125, "step": 295 }, { "epoch": 0.37888, "grad_norm": 8.796089070402305, "learning_rate": 5.475316176936217e-07, "logits/chosen": -0.593505859375, "logits/rejected": -0.647705078125, "logps/chosen": -598.5, "logps/rejected": -641.5, "loss": 0.5126, "rewards/accuracies": 0.734375, "rewards/chosen": -2.521484375, "rewards/margins": 0.88037109375, "rewards/rejected": -3.40234375, "step": 296 }, { "epoch": 0.38016, "grad_norm": 8.538475094142157, "learning_rate": 5.462366179845746e-07, "logits/chosen": -0.6328125, "logits/rejected": -0.70556640625, "logps/chosen": -560.25, "logps/rejected": -608.5, "loss": 0.551, "rewards/accuracies": 0.7109375, "rewards/chosen": -2.3828125, "rewards/margins": 0.629150390625, "rewards/rejected": -3.009765625, "step": 297 }, { "epoch": 0.38144, "grad_norm": 8.516105169564174, "learning_rate": 5.449376881642538e-07, "logits/chosen": -0.524658203125, "logits/rejected": -0.607666015625, "logps/chosen": -572.75, "logps/rejected": -645.0, "loss": 0.5534, "rewards/accuracies": 0.7421875, "rewards/chosen": -2.763671875, "rewards/margins": 0.6611328125, "rewards/rejected": -3.423828125, "step": 298 }, { "epoch": 0.38272, "grad_norm": 8.520830137173526, "learning_rate": 5.436348542468598e-07, "logits/chosen": -0.6298828125, "logits/rejected": -0.643310546875, "logps/chosen": -637.0, "logps/rejected": -700.5, "loss": 0.5124, "rewards/accuracies": 0.7734375, "rewards/chosen": -2.669921875, "rewards/margins": 0.77044677734375, "rewards/rejected": -3.4375, "step": 299 }, { "epoch": 0.384, "grad_norm": 11.383934405773742, "learning_rate": 5.423281423247821e-07, "logits/chosen": -0.62939453125, "logits/rejected": -0.669921875, "logps/chosen": -626.5, "logps/rejected": -694.0, "loss": 0.4308, "rewards/accuracies": 0.8359375, "rewards/chosen": -2.6640625, "rewards/margins": 0.98193359375, "rewards/rejected": -3.646484375, "step": 300 }, { "epoch": 0.384, "eval_logits/chosen": -0.5311279296875, "eval_logits/rejected": -0.6102294921875, "eval_logps/chosen": -589.25, "eval_logps/rejected": -647.5, "eval_loss": 0.5219140648841858, "eval_rewards/accuracies": 0.7308593988418579, "eval_rewards/chosen": -2.68359375, "eval_rewards/margins": 0.7745361328125, "eval_rewards/rejected": -3.458984375, "eval_runtime": 27.4735, "eval_samples_per_second": 18.199, "eval_steps_per_second": 0.582, "step": 300 }, { "epoch": 0.38528, "grad_norm": 9.685684454302386, "learning_rate": 5.410175785680765e-07, "logits/chosen": -0.60009765625, "logits/rejected": -0.627197265625, "logps/chosen": -610.25, "logps/rejected": -658.5, "loss": 0.5635, "rewards/accuracies": 0.703125, "rewards/chosen": -2.712890625, "rewards/margins": 0.729248046875, "rewards/rejected": -3.44140625, "step": 301 }, { "epoch": 0.38656, "grad_norm": 15.00512977681773, "learning_rate": 5.397031892239415e-07, "logits/chosen": -0.61962890625, "logits/rejected": -0.6162109375, "logps/chosen": -609.5, "logps/rejected": -658.0, "loss": 0.4694, "rewards/accuracies": 0.78125, "rewards/chosen": -2.783203125, "rewards/margins": 0.82666015625, "rewards/rejected": -3.61328125, "step": 302 }, { "epoch": 0.38784, "grad_norm": 13.021657646201135, "learning_rate": 5.383850006161913e-07, "logits/chosen": -0.453125, "logits/rejected": -0.511962890625, "logps/chosen": -610.0, "logps/rejected": -676.0, "loss": 0.5271, "rewards/accuracies": 0.7421875, "rewards/chosen": -2.849609375, "rewards/margins": 0.75146484375, "rewards/rejected": -3.60546875, "step": 303 }, { "epoch": 0.38912, "grad_norm": 13.16735841607135, "learning_rate": 5.370630391447304e-07, "logits/chosen": -0.53125, "logits/rejected": -0.5673828125, "logps/chosen": -589.25, "logps/rejected": -693.0, "loss": 0.4679, "rewards/accuracies": 0.7578125, "rewards/chosen": -2.7890625, "rewards/margins": 0.91796875, "rewards/rejected": -3.712890625, "step": 304 }, { "epoch": 0.3904, "grad_norm": 8.650434954949796, "learning_rate": 5.357373312850235e-07, "logits/chosen": -0.49755859375, "logits/rejected": -0.5546875, "logps/chosen": -628.5, "logps/rejected": -716.0, "loss": 0.5205, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.193359375, "rewards/margins": 0.80926513671875, "rewards/rejected": -4.001953125, "step": 305 }, { "epoch": 0.39168, "grad_norm": 12.03283706604588, "learning_rate": 5.344079035875661e-07, "logits/chosen": -0.468505859375, "logits/rejected": -0.535400390625, "logps/chosen": -658.0, "logps/rejected": -745.0, "loss": 0.5055, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.0546875, "rewards/margins": 0.9290771484375, "rewards/rejected": -3.98828125, "step": 306 }, { "epoch": 0.39296, "grad_norm": 21.421330253934375, "learning_rate": 5.330747826773522e-07, "logits/chosen": -0.4796142578125, "logits/rejected": -0.5126953125, "logps/chosen": -613.5, "logps/rejected": -671.0, "loss": 0.5362, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.009765625, "rewards/margins": 0.86328125, "rewards/rejected": -3.873046875, "step": 307 }, { "epoch": 0.39424, "grad_norm": 11.41199587027057, "learning_rate": 5.317379952533411e-07, "logits/chosen": -0.56982421875, "logits/rejected": -0.62548828125, "logps/chosen": -628.0, "logps/rejected": -727.5, "loss": 0.4724, "rewards/accuracies": 0.7578125, "rewards/chosen": -2.916015625, "rewards/margins": 1.119140625, "rewards/rejected": -4.033203125, "step": 308 }, { "epoch": 0.39552, "grad_norm": 13.469179182772214, "learning_rate": 5.303975680879232e-07, "logits/chosen": -0.52392578125, "logits/rejected": -0.54833984375, "logps/chosen": -615.5, "logps/rejected": -669.0, "loss": 0.5421, "rewards/accuracies": 0.7578125, "rewards/chosen": -2.951171875, "rewards/margins": 0.742431640625, "rewards/rejected": -3.6953125, "step": 309 }, { "epoch": 0.3968, "grad_norm": 16.733814154590878, "learning_rate": 5.290535280263835e-07, "logits/chosen": -0.39404296875, "logits/rejected": -0.3988037109375, "logps/chosen": -630.0, "logps/rejected": -687.5, "loss": 0.4677, "rewards/accuracies": 0.765625, "rewards/chosen": -2.814453125, "rewards/margins": 0.91748046875, "rewards/rejected": -3.740234375, "step": 310 }, { "epoch": 0.39808, "grad_norm": 8.75301225998322, "learning_rate": 5.277059019863637e-07, "logits/chosen": -0.43408203125, "logits/rejected": -0.5146484375, "logps/chosen": -606.0, "logps/rejected": -685.25, "loss": 0.5019, "rewards/accuracies": 0.78125, "rewards/chosen": -2.65625, "rewards/margins": 0.901123046875, "rewards/rejected": -3.560546875, "step": 311 }, { "epoch": 0.39936, "grad_norm": 8.024297545260174, "learning_rate": 5.263547169573235e-07, "logits/chosen": -0.55029296875, "logits/rejected": -0.591064453125, "logps/chosen": -567.75, "logps/rejected": -655.0, "loss": 0.5004, "rewards/accuracies": 0.75, "rewards/chosen": -2.51953125, "rewards/margins": 0.76953125, "rewards/rejected": -3.2890625, "step": 312 }, { "epoch": 0.40064, "grad_norm": 9.38829913769337, "learning_rate": 5.25e-07, "logits/chosen": -0.4615478515625, "logits/rejected": -0.5108642578125, "logps/chosen": -597.75, "logps/rejected": -676.0, "loss": 0.4893, "rewards/accuracies": 0.7421875, "rewards/chosen": -2.59375, "rewards/margins": 0.8583984375, "rewards/rejected": -3.455078125, "step": 313 }, { "epoch": 0.40192, "grad_norm": 8.678985799279634, "learning_rate": 5.236417782458656e-07, "logits/chosen": -0.572021484375, "logits/rejected": -0.590576171875, "logps/chosen": -636.5, "logps/rejected": -693.5, "loss": 0.5174, "rewards/accuracies": 0.7421875, "rewards/chosen": -2.84375, "rewards/margins": 0.8515625, "rewards/rejected": -3.6953125, "step": 314 }, { "epoch": 0.4032, "grad_norm": 12.718593158199928, "learning_rate": 5.222800788965847e-07, "logits/chosen": -0.526611328125, "logits/rejected": -0.5595703125, "logps/chosen": -596.0, "logps/rejected": -672.5, "loss": 0.4431, "rewards/accuracies": 0.828125, "rewards/chosen": -2.623046875, "rewards/margins": 0.919677734375, "rewards/rejected": -3.546875, "step": 315 }, { "epoch": 0.40448, "grad_norm": 7.7191674664296075, "learning_rate": 5.209149292234689e-07, "logits/chosen": -0.491455078125, "logits/rejected": -0.586669921875, "logps/chosen": -589.0, "logps/rejected": -668.0, "loss": 0.4413, "rewards/accuracies": 0.796875, "rewards/chosen": -2.6796875, "rewards/margins": 0.98486328125, "rewards/rejected": -3.662109375, "step": 316 }, { "epoch": 0.40576, "grad_norm": 8.070223960301346, "learning_rate": 5.195463565669309e-07, "logits/chosen": -0.482421875, "logits/rejected": -0.54638671875, "logps/chosen": -555.5, "logps/rejected": -614.5, "loss": 0.5058, "rewards/accuracies": 0.7421875, "rewards/chosen": -2.587890625, "rewards/margins": 0.83489990234375, "rewards/rejected": -3.419921875, "step": 317 }, { "epoch": 0.40704, "grad_norm": 10.840771905557805, "learning_rate": 5.18174388335937e-07, "logits/chosen": -0.4683837890625, "logits/rejected": -0.53369140625, "logps/chosen": -602.5, "logps/rejected": -684.5, "loss": 0.4845, "rewards/accuracies": 0.78125, "rewards/chosen": -2.919921875, "rewards/margins": 0.88427734375, "rewards/rejected": -3.798828125, "step": 318 }, { "epoch": 0.40832, "grad_norm": 10.638532746515596, "learning_rate": 5.167990520074577e-07, "logits/chosen": -0.458251953125, "logits/rejected": -0.510498046875, "logps/chosen": -632.5, "logps/rejected": -718.5, "loss": 0.4626, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.06640625, "rewards/margins": 1.0732421875, "rewards/rejected": -4.142578125, "step": 319 }, { "epoch": 0.4096, "grad_norm": 19.063401240095594, "learning_rate": 5.154203751259183e-07, "logits/chosen": -0.400634765625, "logits/rejected": -0.4285888671875, "logps/chosen": -671.5, "logps/rejected": -744.5, "loss": 0.53, "rewards/accuracies": 0.734375, "rewards/chosen": -3.439453125, "rewards/margins": 1.008056640625, "rewards/rejected": -4.447265625, "step": 320 }, { "epoch": 0.41088, "grad_norm": 11.58200685748918, "learning_rate": 5.140383853026462e-07, "logits/chosen": -0.2918701171875, "logits/rejected": -0.3492431640625, "logps/chosen": -709.5, "logps/rejected": -846.0, "loss": 0.4238, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.49609375, "rewards/margins": 1.197265625, "rewards/rejected": -4.6953125, "step": 321 }, { "epoch": 0.41216, "grad_norm": 10.060507343077607, "learning_rate": 5.12653110215319e-07, "logits/chosen": -0.3533172607421875, "logits/rejected": -0.39697265625, "logps/chosen": -668.0, "logps/rejected": -751.0, "loss": 0.4708, "rewards/accuracies": 0.78125, "rewards/chosen": -3.43359375, "rewards/margins": 0.975830078125, "rewards/rejected": -4.40625, "step": 322 }, { "epoch": 0.41344, "grad_norm": 20.29118301920304, "learning_rate": 5.112645776074089e-07, "logits/chosen": -0.33740234375, "logits/rejected": -0.3800048828125, "logps/chosen": -679.5, "logps/rejected": -731.0, "loss": 0.529, "rewards/accuracies": 0.75, "rewards/chosen": -3.548828125, "rewards/margins": 0.9560546875, "rewards/rejected": -4.50390625, "step": 323 }, { "epoch": 0.41472, "grad_norm": 10.344847321745984, "learning_rate": 5.098728152876287e-07, "logits/chosen": -0.3602294921875, "logits/rejected": -0.4044189453125, "logps/chosen": -676.5, "logps/rejected": -783.5, "loss": 0.4689, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.681640625, "rewards/margins": 1.164306640625, "rewards/rejected": -4.84765625, "step": 324 }, { "epoch": 0.416, "grad_norm": 53.68636513984977, "learning_rate": 5.084778511293732e-07, "logits/chosen": -0.37158203125, "logits/rejected": -0.422607421875, "logps/chosen": -695.0, "logps/rejected": -795.5, "loss": 0.5701, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.08203125, "rewards/margins": 0.8125, "rewards/rejected": -4.8984375, "step": 325 }, { "epoch": 0.41728, "grad_norm": 8.17017463161669, "learning_rate": 5.070797130701617e-07, "logits/chosen": -0.4307861328125, "logits/rejected": -0.479736328125, "logps/chosen": -671.0, "logps/rejected": -785.5, "loss": 0.45, "rewards/accuracies": 0.765625, "rewards/chosen": -3.505859375, "rewards/margins": 1.221435546875, "rewards/rejected": -4.73046875, "step": 326 }, { "epoch": 0.41856, "grad_norm": 21.790110998965517, "learning_rate": 5.056784291110795e-07, "logits/chosen": -0.364990234375, "logits/rejected": -0.401123046875, "logps/chosen": -700.5, "logps/rejected": -780.0, "loss": 0.5295, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.9375, "rewards/margins": 0.91845703125, "rewards/rejected": -4.85546875, "step": 327 }, { "epoch": 0.41984, "grad_norm": 12.801819009552196, "learning_rate": 5.04274027316215e-07, "logits/chosen": -0.3697509765625, "logits/rejected": -0.385498046875, "logps/chosen": -688.5, "logps/rejected": -746.5, "loss": 0.573, "rewards/accuracies": 0.75, "rewards/chosen": -3.572265625, "rewards/margins": 0.7158203125, "rewards/rejected": -4.29296875, "step": 328 }, { "epoch": 0.42112, "grad_norm": 9.406211305601078, "learning_rate": 5.028665358120994e-07, "logits/chosen": -0.3780517578125, "logits/rejected": -0.391845703125, "logps/chosen": -657.0, "logps/rejected": -727.0, "loss": 0.4427, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.337890625, "rewards/margins": 0.92919921875, "rewards/rejected": -4.259765625, "step": 329 }, { "epoch": 0.4224, "grad_norm": 10.033069082942374, "learning_rate": 5.014559827871426e-07, "logits/chosen": -0.425048828125, "logits/rejected": -0.448486328125, "logps/chosen": -656.0, "logps/rejected": -715.0, "loss": 0.5266, "rewards/accuracies": 0.78125, "rewards/chosen": -3.21484375, "rewards/margins": 0.91552734375, "rewards/rejected": -4.12890625, "step": 330 }, { "epoch": 0.42368, "grad_norm": 9.652663655572574, "learning_rate": 5.00042396491069e-07, "logits/chosen": -0.506103515625, "logits/rejected": -0.539794921875, "logps/chosen": -658.0, "logps/rejected": -751.0, "loss": 0.4558, "rewards/accuracies": 0.796875, "rewards/chosen": -3.150390625, "rewards/margins": 1.1103515625, "rewards/rejected": -4.26171875, "step": 331 }, { "epoch": 0.42496, "grad_norm": 12.260458134069255, "learning_rate": 4.986258052343511e-07, "logits/chosen": -0.4840087890625, "logits/rejected": -0.509521484375, "logps/chosen": -655.5, "logps/rejected": -725.0, "loss": 0.5506, "rewards/accuracies": 0.71875, "rewards/chosen": -3.10546875, "rewards/margins": 0.929931640625, "rewards/rejected": -4.033203125, "step": 332 }, { "epoch": 0.42624, "grad_norm": 8.767742475858029, "learning_rate": 4.972062373876435e-07, "logits/chosen": -0.4326171875, "logits/rejected": -0.478515625, "logps/chosen": -643.5, "logps/rejected": -712.5, "loss": 0.5134, "rewards/accuracies": 0.75, "rewards/chosen": -3.208984375, "rewards/margins": 0.804443359375, "rewards/rejected": -4.01171875, "step": 333 }, { "epoch": 0.42752, "grad_norm": 8.183910291192875, "learning_rate": 4.95783721381214e-07, "logits/chosen": -0.564453125, "logits/rejected": -0.620361328125, "logps/chosen": -615.5, "logps/rejected": -733.5, "loss": 0.4354, "rewards/accuracies": 0.765625, "rewards/chosen": -2.94921875, "rewards/margins": 1.19970703125, "rewards/rejected": -4.150390625, "step": 334 }, { "epoch": 0.4288, "grad_norm": 12.662549180581198, "learning_rate": 4.943582857043742e-07, "logits/chosen": -0.531005859375, "logits/rejected": -0.53173828125, "logps/chosen": -628.0, "logps/rejected": -703.0, "loss": 0.4677, "rewards/accuracies": 0.7890625, "rewards/chosen": -2.96484375, "rewards/margins": 0.94482421875, "rewards/rejected": -3.9140625, "step": 335 }, { "epoch": 0.43008, "grad_norm": 11.452856436676567, "learning_rate": 4.929299589049095e-07, "logits/chosen": -0.5340576171875, "logits/rejected": -0.6025390625, "logps/chosen": -600.0, "logps/rejected": -719.0, "loss": 0.4012, "rewards/accuracies": 0.828125, "rewards/chosen": -2.669921875, "rewards/margins": 1.154296875, "rewards/rejected": -3.826171875, "step": 336 }, { "epoch": 0.43136, "grad_norm": 8.22048523281164, "learning_rate": 4.914987695885067e-07, "logits/chosen": -0.60498046875, "logits/rejected": -0.62744140625, "logps/chosen": -667.0, "logps/rejected": -755.0, "loss": 0.483, "rewards/accuracies": 0.7890625, "rewards/chosen": -2.97265625, "rewards/margins": 1.0361328125, "rewards/rejected": -4.005859375, "step": 337 }, { "epoch": 0.43264, "grad_norm": 12.225224739430297, "learning_rate": 4.900647464181817e-07, "logits/chosen": -0.485107421875, "logits/rejected": -0.52783203125, "logps/chosen": -680.5, "logps/rejected": -767.5, "loss": 0.5177, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.234375, "rewards/margins": 1.04296875, "rewards/rejected": -4.27734375, "step": 338 }, { "epoch": 0.43392, "grad_norm": 8.757850519576271, "learning_rate": 4.886279181137049e-07, "logits/chosen": -0.552490234375, "logits/rejected": -0.578369140625, "logps/chosen": -653.0, "logps/rejected": -717.5, "loss": 0.5448, "rewards/accuracies": 0.71875, "rewards/chosen": -3.142578125, "rewards/margins": 0.8095703125, "rewards/rejected": -3.955078125, "step": 339 }, { "epoch": 0.4352, "grad_norm": 9.695868477167725, "learning_rate": 4.871883134510262e-07, "logits/chosen": -0.494384765625, "logits/rejected": -0.51611328125, "logps/chosen": -625.0, "logps/rejected": -719.5, "loss": 0.4424, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.021484375, "rewards/margins": 1.091796875, "rewards/rejected": -4.111328125, "step": 340 }, { "epoch": 0.43648, "grad_norm": 8.243809920140835, "learning_rate": 4.857459612616992e-07, "logits/chosen": -0.506591796875, "logits/rejected": -0.54443359375, "logps/chosen": -581.0, "logps/rejected": -647.0, "loss": 0.5348, "rewards/accuracies": 0.7265625, "rewards/chosen": -2.822265625, "rewards/margins": 0.7535400390625, "rewards/rejected": -3.580078125, "step": 341 }, { "epoch": 0.43776, "grad_norm": 18.55929104936941, "learning_rate": 4.843008904323029e-07, "logits/chosen": -0.501708984375, "logits/rejected": -0.51806640625, "logps/chosen": -652.5, "logps/rejected": -742.0, "loss": 0.4282, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1953125, "rewards/margins": 1.115234375, "rewards/rejected": -4.31640625, "step": 342 }, { "epoch": 0.43904, "grad_norm": 10.539994924615243, "learning_rate": 4.828531299038638e-07, "logits/chosen": -0.484375, "logits/rejected": -0.52587890625, "logps/chosen": -627.0, "logps/rejected": -710.5, "loss": 0.5168, "rewards/accuracies": 0.796875, "rewards/chosen": -3.1171875, "rewards/margins": 0.879150390625, "rewards/rejected": -3.99609375, "step": 343 }, { "epoch": 0.44032, "grad_norm": 14.239765085999206, "learning_rate": 4.81402708671276e-07, "logits/chosen": -0.473876953125, "logits/rejected": -0.54345703125, "logps/chosen": -616.0, "logps/rejected": -745.5, "loss": 0.3896, "rewards/accuracies": 0.859375, "rewards/chosen": -3.083984375, "rewards/margins": 1.267822265625, "rewards/rejected": -4.3515625, "step": 344 }, { "epoch": 0.4416, "grad_norm": 11.762433092658819, "learning_rate": 4.799496557827208e-07, "logits/chosen": -0.507568359375, "logits/rejected": -0.552001953125, "logps/chosen": -673.0, "logps/rejected": -818.5, "loss": 0.4055, "rewards/accuracies": 0.8125, "rewards/chosen": -3.478515625, "rewards/margins": 1.3271484375, "rewards/rejected": -4.80859375, "step": 345 }, { "epoch": 0.44288, "grad_norm": 13.3714208302887, "learning_rate": 4.784940003390846e-07, "logits/chosen": -0.47705078125, "logits/rejected": -0.4874267578125, "logps/chosen": -651.0, "logps/rejected": -722.5, "loss": 0.484, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.33203125, "rewards/margins": 0.9072265625, "rewards/rejected": -4.244140625, "step": 346 }, { "epoch": 0.44416, "grad_norm": 8.419739675084086, "learning_rate": 4.770357714933765e-07, "logits/chosen": -0.3489990234375, "logits/rejected": -0.399169921875, "logps/chosen": -715.5, "logps/rejected": -804.0, "loss": 0.4466, "rewards/accuracies": 0.796875, "rewards/chosen": -3.701171875, "rewards/margins": 1.22509765625, "rewards/rejected": -4.9296875, "step": 347 }, { "epoch": 0.44544, "grad_norm": 28.282829493788125, "learning_rate": 4.7557499845014363e-07, "logits/chosen": -0.34809112548828125, "logits/rejected": -0.3986968994140625, "logps/chosen": -692.0, "logps/rejected": -785.5, "loss": 0.5644, "rewards/accuracies": 0.765625, "rewards/chosen": -3.8125, "rewards/margins": 1.04931640625, "rewards/rejected": -4.859375, "step": 348 }, { "epoch": 0.44672, "grad_norm": 11.217438269564665, "learning_rate": 4.741117104648874e-07, "logits/chosen": -0.368408203125, "logits/rejected": -0.39208984375, "logps/chosen": -736.5, "logps/rejected": -841.0, "loss": 0.4152, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.939453125, "rewards/margins": 1.28564453125, "rewards/rejected": -5.2265625, "step": 349 }, { "epoch": 0.448, "grad_norm": 9.988120972029757, "learning_rate": 4.726459368434768e-07, "logits/chosen": -0.345458984375, "logits/rejected": -0.3746337890625, "logps/chosen": -782.5, "logps/rejected": -898.5, "loss": 0.4168, "rewards/accuracies": 0.8359375, "rewards/chosen": -4.28125, "rewards/margins": 1.29931640625, "rewards/rejected": -5.578125, "step": 350 }, { "epoch": 0.44928, "grad_norm": 27.1774086371912, "learning_rate": 4.7117770694156146e-07, "logits/chosen": -0.35791015625, "logits/rejected": -0.396728515625, "logps/chosen": -795.0, "logps/rejected": -910.0, "loss": 0.5071, "rewards/accuracies": 0.75, "rewards/chosen": -4.61328125, "rewards/margins": 1.2353515625, "rewards/rejected": -5.84765625, "step": 351 }, { "epoch": 0.45056, "grad_norm": 18.51059077469931, "learning_rate": 4.697070501639841e-07, "logits/chosen": -0.3016357421875, "logits/rejected": -0.3563232421875, "logps/chosen": -777.0, "logps/rejected": -924.0, "loss": 0.4595, "rewards/accuracies": 0.8203125, "rewards/chosen": -4.75390625, "rewards/margins": 1.1455078125, "rewards/rejected": -5.90234375, "step": 352 }, { "epoch": 0.45184, "grad_norm": 33.97653228823091, "learning_rate": 4.682339959641915e-07, "logits/chosen": -0.3653564453125, "logits/rejected": -0.433349609375, "logps/chosen": -755.0, "logps/rejected": -877.0, "loss": 0.505, "rewards/accuracies": 0.734375, "rewards/chosen": -4.537109375, "rewards/margins": 1.25146484375, "rewards/rejected": -5.7890625, "step": 353 }, { "epoch": 0.45312, "grad_norm": 38.200517048662974, "learning_rate": 4.6675857384364475e-07, "logits/chosen": -0.3599853515625, "logits/rejected": -0.429931640625, "logps/chosen": -783.0, "logps/rejected": -925.0, "loss": 0.5449, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.583984375, "rewards/margins": 1.157470703125, "rewards/rejected": -5.7421875, "step": 354 }, { "epoch": 0.4544, "grad_norm": 24.734123344344813, "learning_rate": 4.6528081335122786e-07, "logits/chosen": -0.34062957763671875, "logits/rejected": -0.3896484375, "logps/chosen": -734.5, "logps/rejected": -849.0, "loss": 0.522, "rewards/accuracies": 0.765625, "rewards/chosen": -4.123046875, "rewards/margins": 1.119140625, "rewards/rejected": -5.2421875, "step": 355 }, { "epoch": 0.45568, "grad_norm": 8.65649758872943, "learning_rate": 4.6380074408265677e-07, "logits/chosen": -0.3984375, "logits/rejected": -0.3931884765625, "logps/chosen": -743.0, "logps/rejected": -840.0, "loss": 0.4604, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.982421875, "rewards/margins": 1.145751953125, "rewards/rejected": -5.12890625, "step": 356 }, { "epoch": 0.45696, "grad_norm": 13.642027050493772, "learning_rate": 4.62318395679886e-07, "logits/chosen": -0.3692626953125, "logits/rejected": -0.432861328125, "logps/chosen": -708.25, "logps/rejected": -823.5, "loss": 0.4635, "rewards/accuracies": 0.734375, "rewards/chosen": -3.76171875, "rewards/margins": 1.205078125, "rewards/rejected": -4.9609375, "step": 357 }, { "epoch": 0.45824, "grad_norm": 8.597706559419494, "learning_rate": 4.608337978305154e-07, "logits/chosen": -0.404541015625, "logits/rejected": -0.471435546875, "logps/chosen": -671.5, "logps/rejected": -778.5, "loss": 0.465, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.478515625, "rewards/margins": 1.0322265625, "rewards/rejected": -4.515625, "step": 358 }, { "epoch": 0.45952, "grad_norm": 16.144497938016837, "learning_rate": 4.593469802671951e-07, "logits/chosen": -0.454833984375, "logits/rejected": -0.503173828125, "logps/chosen": -623.0, "logps/rejected": -733.5, "loss": 0.427, "rewards/accuracies": 0.828125, "rewards/chosen": -3.236328125, "rewards/margins": 1.34521484375, "rewards/rejected": -4.58203125, "step": 359 }, { "epoch": 0.4608, "grad_norm": 15.765499984405258, "learning_rate": 4.5785797276703074e-07, "logits/chosen": -0.46826171875, "logits/rejected": -0.511474609375, "logps/chosen": -686.5, "logps/rejected": -804.0, "loss": 0.5152, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.60546875, "rewards/margins": 1.046875, "rewards/rejected": -4.65625, "step": 360 }, { "epoch": 0.46208, "grad_norm": 8.245124791845544, "learning_rate": 4.563668051509864e-07, "logits/chosen": -0.49560546875, "logits/rejected": -0.5390625, "logps/chosen": -630.5, "logps/rejected": -747.5, "loss": 0.4031, "rewards/accuracies": 0.8125, "rewards/chosen": -3.111328125, "rewards/margins": 1.2119140625, "rewards/rejected": -4.326171875, "step": 361 }, { "epoch": 0.46336, "grad_norm": 8.559688059984063, "learning_rate": 4.5487350728328796e-07, "logits/chosen": -0.4332275390625, "logits/rejected": -0.4951171875, "logps/chosen": -630.0, "logps/rejected": -726.0, "loss": 0.4651, "rewards/accuracies": 0.78125, "rewards/chosen": -3.236328125, "rewards/margins": 1.109375, "rewards/rejected": -4.34765625, "step": 362 }, { "epoch": 0.46464, "grad_norm": 10.160592154999383, "learning_rate": 4.533781090708244e-07, "logits/chosen": -0.454345703125, "logits/rejected": -0.46337890625, "logps/chosen": -671.5, "logps/rejected": -767.0, "loss": 0.495, "rewards/accuracies": 0.75, "rewards/chosen": -3.326171875, "rewards/margins": 0.933349609375, "rewards/rejected": -4.259765625, "step": 363 }, { "epoch": 0.46592, "grad_norm": 8.364222921089874, "learning_rate": 4.518806404625495e-07, "logits/chosen": -0.494140625, "logits/rejected": -0.537109375, "logps/chosen": -641.0, "logps/rejected": -734.5, "loss": 0.4365, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.1640625, "rewards/margins": 1.05615234375, "rewards/rejected": -4.220703125, "step": 364 }, { "epoch": 0.4672, "grad_norm": 9.713400168567283, "learning_rate": 4.503811314488816e-07, "logits/chosen": -0.428955078125, "logits/rejected": -0.471923828125, "logps/chosen": -652.5, "logps/rejected": -697.0, "loss": 0.5314, "rewards/accuracies": 0.703125, "rewards/chosen": -3.28125, "rewards/margins": 0.78515625, "rewards/rejected": -4.06640625, "step": 365 }, { "epoch": 0.46848, "grad_norm": 8.9296942425344, "learning_rate": 4.488796120611029e-07, "logits/chosen": -0.44140625, "logits/rejected": -0.49609375, "logps/chosen": -610.25, "logps/rejected": -724.5, "loss": 0.5128, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.134765625, "rewards/margins": 1.03662109375, "rewards/rejected": -4.169921875, "step": 366 }, { "epoch": 0.46976, "grad_norm": 8.165959460812232, "learning_rate": 4.4737611237075845e-07, "logits/chosen": -0.47265625, "logits/rejected": -0.504150390625, "logps/chosen": -656.0, "logps/rejected": -809.5, "loss": 0.4489, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.306640625, "rewards/margins": 1.1591796875, "rewards/rejected": -4.46484375, "step": 367 }, { "epoch": 0.47104, "grad_norm": 15.893475743625158, "learning_rate": 4.4587066248905335e-07, "logits/chosen": -0.407470703125, "logits/rejected": -0.41796875, "logps/chosen": -636.0, "logps/rejected": -754.5, "loss": 0.4304, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.076171875, "rewards/margins": 1.13623046875, "rewards/rejected": -4.212890625, "step": 368 }, { "epoch": 0.47232, "grad_norm": 12.204585822720064, "learning_rate": 4.443632925662504e-07, "logits/chosen": -0.416015625, "logits/rejected": -0.44873046875, "logps/chosen": -596.5, "logps/rejected": -670.0, "loss": 0.491, "rewards/accuracies": 0.734375, "rewards/chosen": -2.927734375, "rewards/margins": 0.92626953125, "rewards/rejected": -3.8515625, "step": 369 }, { "epoch": 0.4736, "grad_norm": 11.003968270027663, "learning_rate": 4.4285403279106523e-07, "logits/chosen": -0.3580322265625, "logits/rejected": -0.4219970703125, "logps/chosen": -611.5, "logps/rejected": -725.5, "loss": 0.4313, "rewards/accuracies": 0.796875, "rewards/chosen": -3.126953125, "rewards/margins": 1.04248046875, "rewards/rejected": -4.162109375, "step": 370 }, { "epoch": 0.47488, "grad_norm": 199.6202974665465, "learning_rate": 4.4134291339006305e-07, "logits/chosen": -0.3883056640625, "logits/rejected": -0.38299560546875, "logps/chosen": -646.5, "logps/rejected": -762.0, "loss": 0.5204, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.2890625, "rewards/margins": 1.00927734375, "rewards/rejected": -4.296875, "step": 371 }, { "epoch": 0.47616, "grad_norm": 11.144489957125527, "learning_rate": 4.3982996462705184e-07, "logits/chosen": -0.398193359375, "logits/rejected": -0.4439697265625, "logps/chosen": -675.0, "logps/rejected": -765.5, "loss": 0.4755, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.494140625, "rewards/margins": 1.0732421875, "rewards/rejected": -4.5703125, "step": 372 }, { "epoch": 0.47744, "grad_norm": 12.09850803118018, "learning_rate": 4.383152168024776e-07, "logits/chosen": -0.3857421875, "logits/rejected": -0.430419921875, "logps/chosen": -658.0, "logps/rejected": -753.0, "loss": 0.5137, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.525390625, "rewards/margins": 0.9542236328125, "rewards/rejected": -4.482421875, "step": 373 }, { "epoch": 0.47872, "grad_norm": 10.440049833545846, "learning_rate": 4.3679870025281645e-07, "logits/chosen": -0.3140869140625, "logits/rejected": -0.341064453125, "logps/chosen": -693.0, "logps/rejected": -782.0, "loss": 0.5013, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.654296875, "rewards/margins": 0.970458984375, "rewards/rejected": -4.626953125, "step": 374 }, { "epoch": 0.48, "grad_norm": 18.264633064403437, "learning_rate": 4.3528044534996764e-07, "logits/chosen": -0.296142578125, "logits/rejected": -0.359619140625, "logps/chosen": -638.0, "logps/rejected": -704.5, "loss": 0.5782, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.236328125, "rewards/margins": 0.8702392578125, "rewards/rejected": -4.109375, "step": 375 }, { "epoch": 0.48128, "grad_norm": 8.417915455120893, "learning_rate": 4.337604825006452e-07, "logits/chosen": -0.3731689453125, "logits/rejected": -0.4154052734375, "logps/chosen": -638.0, "logps/rejected": -742.5, "loss": 0.4726, "rewards/accuracies": 0.78125, "rewards/chosen": -3.23828125, "rewards/margins": 1.09521484375, "rewards/rejected": -4.33203125, "step": 376 }, { "epoch": 0.48256, "grad_norm": 8.947246273156576, "learning_rate": 4.3223884214576875e-07, "logits/chosen": -0.369049072265625, "logits/rejected": -0.3988037109375, "logps/chosen": -686.5, "logps/rejected": -759.5, "loss": 0.5215, "rewards/accuracies": 0.78125, "rewards/chosen": -3.53515625, "rewards/margins": 0.819091796875, "rewards/rejected": -4.35546875, "step": 377 }, { "epoch": 0.48384, "grad_norm": 9.885820986822363, "learning_rate": 4.3071555475985404e-07, "logits/chosen": -0.347412109375, "logits/rejected": -0.433349609375, "logps/chosen": -605.5, "logps/rejected": -716.5, "loss": 0.4847, "rewards/accuracies": 0.734375, "rewards/chosen": -3.021484375, "rewards/margins": 1.01611328125, "rewards/rejected": -4.0390625, "step": 378 }, { "epoch": 0.48512, "grad_norm": 8.772239493272059, "learning_rate": 4.2919065085040284e-07, "logits/chosen": -0.371337890625, "logits/rejected": -0.4169921875, "logps/chosen": -646.0, "logps/rejected": -746.0, "loss": 0.4578, "rewards/accuracies": 0.78125, "rewards/chosen": -3.228515625, "rewards/margins": 1.074462890625, "rewards/rejected": -4.298828125, "step": 379 }, { "epoch": 0.4864, "grad_norm": 10.720670863122297, "learning_rate": 4.2766416095729113e-07, "logits/chosen": -0.37646484375, "logits/rejected": -0.43701171875, "logps/chosen": -662.5, "logps/rejected": -751.5, "loss": 0.4435, "rewards/accuracies": 0.828125, "rewards/chosen": -3.1328125, "rewards/margins": 1.14599609375, "rewards/rejected": -4.275390625, "step": 380 }, { "epoch": 0.48768, "grad_norm": 11.03467333595442, "learning_rate": 4.261361156521586e-07, "logits/chosen": -0.458740234375, "logits/rejected": -0.530517578125, "logps/chosen": -665.5, "logps/rejected": -731.0, "loss": 0.4372, "rewards/accuracies": 0.828125, "rewards/chosen": -3.1015625, "rewards/margins": 1.2353515625, "rewards/rejected": -4.33984375, "step": 381 }, { "epoch": 0.48896, "grad_norm": 9.450090321989622, "learning_rate": 4.2460654553779557e-07, "logits/chosen": -0.43212890625, "logits/rejected": -0.47119140625, "logps/chosen": -619.75, "logps/rejected": -714.0, "loss": 0.5313, "rewards/accuracies": 0.734375, "rewards/chosen": -3.197265625, "rewards/margins": 0.80224609375, "rewards/rejected": -3.99609375, "step": 382 }, { "epoch": 0.49024, "grad_norm": 14.256426075381693, "learning_rate": 4.230754812475305e-07, "logits/chosen": -0.40234375, "logits/rejected": -0.401611328125, "logps/chosen": -587.25, "logps/rejected": -672.0, "loss": 0.4617, "rewards/accuracies": 0.8203125, "rewards/chosen": -2.955078125, "rewards/margins": 0.952392578125, "rewards/rejected": -3.90234375, "step": 383 }, { "epoch": 0.49152, "grad_norm": 16.353106293708784, "learning_rate": 4.2154295344461614e-07, "logits/chosen": -0.456787109375, "logits/rejected": -0.4775390625, "logps/chosen": -661.0, "logps/rejected": -743.5, "loss": 0.4423, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.201171875, "rewards/margins": 1.04833984375, "rewards/rejected": -4.251953125, "step": 384 }, { "epoch": 0.4928, "grad_norm": 11.021886637647597, "learning_rate": 4.2000899282161556e-07, "logits/chosen": -0.51171875, "logits/rejected": -0.52880859375, "logps/chosen": -652.5, "logps/rejected": -727.5, "loss": 0.4629, "rewards/accuracies": 0.78125, "rewards/chosen": -3.22265625, "rewards/margins": 1.03076171875, "rewards/rejected": -4.251953125, "step": 385 }, { "epoch": 0.49408, "grad_norm": 12.461963298467701, "learning_rate": 4.1847363009978773e-07, "logits/chosen": -0.3712158203125, "logits/rejected": -0.412353515625, "logps/chosen": -608.0, "logps/rejected": -697.5, "loss": 0.4655, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.07421875, "rewards/margins": 1.073974609375, "rewards/rejected": -4.1484375, "step": 386 }, { "epoch": 0.49536, "grad_norm": 11.664191303420534, "learning_rate": 4.169368960284718e-07, "logits/chosen": -0.2545166015625, "logits/rejected": -0.374755859375, "logps/chosen": -602.5, "logps/rejected": -677.0, "loss": 0.5172, "rewards/accuracies": 0.75, "rewards/chosen": -2.953125, "rewards/margins": 0.90625, "rewards/rejected": -3.857421875, "step": 387 }, { "epoch": 0.49664, "grad_norm": 9.22000344583139, "learning_rate": 4.1539882138447173e-07, "logits/chosen": -0.439453125, "logits/rejected": -0.486328125, "logps/chosen": -675.0, "logps/rejected": -789.5, "loss": 0.437, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.564453125, "rewards/margins": 1.2265625, "rewards/rejected": -4.796875, "step": 388 }, { "epoch": 0.49792, "grad_norm": 10.388005400752823, "learning_rate": 4.138594369714394e-07, "logits/chosen": -0.369140625, "logits/rejected": -0.390625, "logps/chosen": -634.0, "logps/rejected": -700.5, "loss": 0.515, "rewards/accuracies": 0.75, "rewards/chosen": -3.412109375, "rewards/margins": 0.984130859375, "rewards/rejected": -4.400390625, "step": 389 }, { "epoch": 0.4992, "grad_norm": 19.773927339862066, "learning_rate": 4.1231877361925835e-07, "logits/chosen": -0.27435302734375, "logits/rejected": -0.29425048828125, "logps/chosen": -664.0, "logps/rejected": -725.5, "loss": 0.5527, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.544921875, "rewards/margins": 0.83642578125, "rewards/rejected": -4.37890625, "step": 390 }, { "epoch": 0.50048, "grad_norm": 26.019664464089015, "learning_rate": 4.1077686218342565e-07, "logits/chosen": -0.300048828125, "logits/rejected": -0.372314453125, "logps/chosen": -690.5, "logps/rejected": -777.0, "loss": 0.6148, "rewards/accuracies": 0.6953125, "rewards/chosen": -3.828125, "rewards/margins": 0.8681640625, "rewards/rejected": -4.69921875, "step": 391 }, { "epoch": 0.50176, "grad_norm": 9.566179596760582, "learning_rate": 4.0923373354443425e-07, "logits/chosen": -0.34130859375, "logits/rejected": -0.38330078125, "logps/chosen": -703.5, "logps/rejected": -784.0, "loss": 0.5242, "rewards/accuracies": 0.78125, "rewards/chosen": -3.611328125, "rewards/margins": 0.9840087890625, "rewards/rejected": -4.595703125, "step": 392 }, { "epoch": 0.50304, "grad_norm": 9.68452388640175, "learning_rate": 4.076894186071548e-07, "logits/chosen": -0.363037109375, "logits/rejected": -0.41796875, "logps/chosen": -660.0, "logps/rejected": -764.0, "loss": 0.4657, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.33984375, "rewards/margins": 1.11669921875, "rewards/rejected": -4.45703125, "step": 393 }, { "epoch": 0.50432, "grad_norm": 12.917842813325114, "learning_rate": 4.0614394830021604e-07, "logits/chosen": -0.3944091796875, "logits/rejected": -0.46484375, "logps/chosen": -638.5, "logps/rejected": -742.0, "loss": 0.4348, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.1484375, "rewards/margins": 1.259033203125, "rewards/rejected": -4.41015625, "step": 394 }, { "epoch": 0.5056, "grad_norm": 9.314495046466147, "learning_rate": 4.0459735357538624e-07, "logits/chosen": -0.392578125, "logits/rejected": -0.4384765625, "logps/chosen": -630.5, "logps/rejected": -722.0, "loss": 0.4576, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.1796875, "rewards/margins": 1.0947265625, "rewards/rejected": -4.26953125, "step": 395 }, { "epoch": 0.50688, "grad_norm": 10.547291120299455, "learning_rate": 4.030496654069524e-07, "logits/chosen": -0.4755859375, "logits/rejected": -0.510009765625, "logps/chosen": -657.0, "logps/rejected": -736.5, "loss": 0.4343, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.126953125, "rewards/margins": 1.056640625, "rewards/rejected": -4.181640625, "step": 396 }, { "epoch": 0.50816, "grad_norm": 8.647735781009173, "learning_rate": 4.0150091479110063e-07, "logits/chosen": -0.362060546875, "logits/rejected": -0.4324951171875, "logps/chosen": -639.0, "logps/rejected": -751.5, "loss": 0.4441, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1640625, "rewards/margins": 1.125, "rewards/rejected": -4.291015625, "step": 397 }, { "epoch": 0.50944, "grad_norm": 8.598126228517861, "learning_rate": 3.99951132745295e-07, "logits/chosen": -0.4058837890625, "logits/rejected": -0.457275390625, "logps/chosen": -732.0, "logps/rejected": -845.0, "loss": 0.4575, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.447265625, "rewards/margins": 1.125, "rewards/rejected": -4.57421875, "step": 398 }, { "epoch": 0.51072, "grad_norm": 8.013248595167934, "learning_rate": 3.984003503076566e-07, "logits/chosen": -0.36474609375, "logits/rejected": -0.416748046875, "logps/chosen": -674.5, "logps/rejected": -795.0, "loss": 0.424, "rewards/accuracies": 0.828125, "rewards/chosen": -3.474609375, "rewards/margins": 1.25830078125, "rewards/rejected": -4.73046875, "step": 399 }, { "epoch": 0.512, "grad_norm": 9.871007801580504, "learning_rate": 3.968485985363416e-07, "logits/chosen": -0.343994140625, "logits/rejected": -0.3740234375, "logps/chosen": -646.5, "logps/rejected": -742.5, "loss": 0.4432, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.3203125, "rewards/margins": 1.1904296875, "rewards/rejected": -4.5078125, "step": 400 }, { "epoch": 0.512, "eval_logits/chosen": -0.31884765625, "eval_logits/rejected": -0.39837646484375, "eval_logps/chosen": -663.0, "eval_logps/rejected": -743.5, "eval_loss": 0.49900001287460327, "eval_rewards/accuracies": 0.741406261920929, "eval_rewards/chosen": -3.4296875, "eval_rewards/margins": 0.991943359375, "eval_rewards/rejected": -4.4228515625, "eval_runtime": 27.5022, "eval_samples_per_second": 18.18, "eval_steps_per_second": 0.582, "step": 400 }, { "epoch": 0.51328, "grad_norm": 9.918254110325114, "learning_rate": 3.9529590850891934e-07, "logits/chosen": -0.31048583984375, "logits/rejected": -0.33154296875, "logps/chosen": -675.0, "logps/rejected": -770.0, "loss": 0.4547, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.498046875, "rewards/margins": 1.178955078125, "rewards/rejected": -4.67578125, "step": 401 }, { "epoch": 0.51456, "grad_norm": 10.592955302658622, "learning_rate": 3.9374231132175044e-07, "logits/chosen": -0.3048095703125, "logits/rejected": -0.3560791015625, "logps/chosen": -670.0, "logps/rejected": -794.5, "loss": 0.4097, "rewards/accuracies": 0.765625, "rewards/chosen": -3.53515625, "rewards/margins": 1.30126953125, "rewards/rejected": -4.84375, "step": 402 }, { "epoch": 0.51584, "grad_norm": 9.608106146275887, "learning_rate": 3.92187838089363e-07, "logits/chosen": -0.361328125, "logits/rejected": -0.39404296875, "logps/chosen": -707.5, "logps/rejected": -818.5, "loss": 0.3828, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.662109375, "rewards/margins": 1.3974609375, "rewards/rejected": -5.0546875, "step": 403 }, { "epoch": 0.51712, "grad_norm": 14.34108279812411, "learning_rate": 3.906325199438306e-07, "logits/chosen": -0.335693359375, "logits/rejected": -0.384765625, "logps/chosen": -711.5, "logps/rejected": -813.0, "loss": 0.4382, "rewards/accuracies": 0.8125, "rewards/chosen": -3.736328125, "rewards/margins": 1.1923828125, "rewards/rejected": -4.93359375, "step": 404 }, { "epoch": 0.5184, "grad_norm": 11.139031446712131, "learning_rate": 3.890763880341477e-07, "logits/chosen": -0.3443603515625, "logits/rejected": -0.3773193359375, "logps/chosen": -662.75, "logps/rejected": -766.0, "loss": 0.5007, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.744140625, "rewards/margins": 1.0185546875, "rewards/rejected": -4.76953125, "step": 405 }, { "epoch": 0.51968, "grad_norm": 8.55091082093681, "learning_rate": 3.875194735256067e-07, "logits/chosen": -0.3232421875, "logits/rejected": -0.3831787109375, "logps/chosen": -675.0, "logps/rejected": -830.5, "loss": 0.4086, "rewards/accuracies": 0.796875, "rewards/chosen": -3.86328125, "rewards/margins": 1.25634765625, "rewards/rejected": -5.119140625, "step": 406 }, { "epoch": 0.52096, "grad_norm": 11.406620827960465, "learning_rate": 3.859618075991735e-07, "logits/chosen": -0.28350830078125, "logits/rejected": -0.31494140625, "logps/chosen": -730.0, "logps/rejected": -811.0, "loss": 0.5632, "rewards/accuracies": 0.6953125, "rewards/chosen": -3.947265625, "rewards/margins": 0.9580078125, "rewards/rejected": -4.90625, "step": 407 }, { "epoch": 0.52224, "grad_norm": 11.55895260093868, "learning_rate": 3.8440342145086245e-07, "logits/chosen": -0.2462158203125, "logits/rejected": -0.32208251953125, "logps/chosen": -710.5, "logps/rejected": -829.0, "loss": 0.465, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.90234375, "rewards/margins": 1.115234375, "rewards/rejected": -5.01953125, "step": 408 }, { "epoch": 0.52352, "grad_norm": 9.334623486544036, "learning_rate": 3.828443462911127e-07, "logits/chosen": -0.235107421875, "logits/rejected": -0.2991943359375, "logps/chosen": -705.0, "logps/rejected": -859.0, "loss": 0.4231, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.93359375, "rewards/margins": 1.3603515625, "rewards/rejected": -5.2890625, "step": 409 }, { "epoch": 0.5248, "grad_norm": 9.304030036813096, "learning_rate": 3.8128461334416223e-07, "logits/chosen": -0.2989501953125, "logits/rejected": -0.375732421875, "logps/chosen": -671.0, "logps/rejected": -773.0, "loss": 0.4455, "rewards/accuracies": 0.734375, "rewards/chosen": -3.55078125, "rewards/margins": 1.16357421875, "rewards/rejected": -4.71484375, "step": 410 }, { "epoch": 0.52608, "grad_norm": 21.163435172332214, "learning_rate": 3.7972425384742267e-07, "logits/chosen": -0.275146484375, "logits/rejected": -0.3277587890625, "logps/chosen": -727.5, "logps/rejected": -806.0, "loss": 0.5246, "rewards/accuracies": 0.71875, "rewards/chosen": -3.919921875, "rewards/margins": 0.976318359375, "rewards/rejected": -4.89453125, "step": 411 }, { "epoch": 0.52736, "grad_norm": 15.093249035393466, "learning_rate": 3.781632990508541e-07, "logits/chosen": -0.327880859375, "logits/rejected": -0.344482421875, "logps/chosen": -738.5, "logps/rejected": -821.5, "loss": 0.5157, "rewards/accuracies": 0.796875, "rewards/chosen": -3.953125, "rewards/margins": 1.029296875, "rewards/rejected": -4.98046875, "step": 412 }, { "epoch": 0.52864, "grad_norm": 8.914765884163025, "learning_rate": 3.766017802163386e-07, "logits/chosen": -0.361083984375, "logits/rejected": -0.388916015625, "logps/chosen": -689.5, "logps/rejected": -740.5, "loss": 0.4922, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.609375, "rewards/margins": 1.026611328125, "rewards/rejected": -4.6328125, "step": 413 }, { "epoch": 0.52992, "grad_norm": 13.842987260589894, "learning_rate": 3.750397286170548e-07, "logits/chosen": -0.40478515625, "logits/rejected": -0.459716796875, "logps/chosen": -705.5, "logps/rejected": -816.0, "loss": 0.4627, "rewards/accuracies": 0.765625, "rewards/chosen": -3.615234375, "rewards/margins": 1.1142578125, "rewards/rejected": -4.734375, "step": 414 }, { "epoch": 0.5312, "grad_norm": 10.861145048772103, "learning_rate": 3.734771755368508e-07, "logits/chosen": -0.413818359375, "logits/rejected": -0.449462890625, "logps/chosen": -667.0, "logps/rejected": -737.0, "loss": 0.4652, "rewards/accuracies": 0.796875, "rewards/chosen": -3.353515625, "rewards/margins": 0.997314453125, "rewards/rejected": -4.3515625, "step": 415 }, { "epoch": 0.53248, "grad_norm": 10.048437842384944, "learning_rate": 3.7191415226961866e-07, "logits/chosen": -0.4249267578125, "logits/rejected": -0.449951171875, "logps/chosen": -663.5, "logps/rejected": -741.0, "loss": 0.5003, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.177734375, "rewards/margins": 1.00439453125, "rewards/rejected": -4.17578125, "step": 416 }, { "epoch": 0.53376, "grad_norm": 7.560247582517948, "learning_rate": 3.703506901186665e-07, "logits/chosen": -0.41876220703125, "logits/rejected": -0.4765625, "logps/chosen": -679.0, "logps/rejected": -779.5, "loss": 0.4335, "rewards/accuracies": 0.8125, "rewards/chosen": -3.23046875, "rewards/margins": 1.267578125, "rewards/rejected": -4.5, "step": 417 }, { "epoch": 0.53504, "grad_norm": 10.537078564076927, "learning_rate": 3.687868203960925e-07, "logits/chosen": -0.408935546875, "logits/rejected": -0.434326171875, "logps/chosen": -634.0, "logps/rejected": -747.5, "loss": 0.4401, "rewards/accuracies": 0.8125, "rewards/chosen": -2.978515625, "rewards/margins": 1.08984375, "rewards/rejected": -4.06640625, "step": 418 }, { "epoch": 0.53632, "grad_norm": 13.668617572855705, "learning_rate": 3.6722257442215735e-07, "logits/chosen": -0.4404296875, "logits/rejected": -0.4970703125, "logps/chosen": -678.0, "logps/rejected": -792.5, "loss": 0.4615, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.37890625, "rewards/margins": 1.1044921875, "rewards/rejected": -4.48828125, "step": 419 }, { "epoch": 0.5376, "grad_norm": 12.228990359857354, "learning_rate": 3.6565798352465697e-07, "logits/chosen": -0.485107421875, "logits/rejected": -0.503173828125, "logps/chosen": -628.0, "logps/rejected": -704.5, "loss": 0.4871, "rewards/accuracies": 0.75, "rewards/chosen": -3.03125, "rewards/margins": 0.918701171875, "rewards/rejected": -3.947265625, "step": 420 }, { "epoch": 0.53888, "grad_norm": 18.18437340639748, "learning_rate": 3.640930790382953e-07, "logits/chosen": -0.3985595703125, "logits/rejected": -0.4619140625, "logps/chosen": -645.0, "logps/rejected": -726.5, "loss": 0.4444, "rewards/accuracies": 0.796875, "rewards/chosen": -3.162109375, "rewards/margins": 0.968994140625, "rewards/rejected": -4.130859375, "step": 421 }, { "epoch": 0.54016, "grad_norm": 11.370178937016053, "learning_rate": 3.625278923040567e-07, "logits/chosen": -0.40625, "logits/rejected": -0.40673828125, "logps/chosen": -689.5, "logps/rejected": -748.0, "loss": 0.519, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4765625, "rewards/margins": 0.92724609375, "rewards/rejected": -4.40234375, "step": 422 }, { "epoch": 0.54144, "grad_norm": 14.61661184515439, "learning_rate": 3.6096245466857807e-07, "logits/chosen": -0.3848876953125, "logits/rejected": -0.416748046875, "logps/chosen": -681.0, "logps/rejected": -737.5, "loss": 0.4409, "rewards/accuracies": 0.78125, "rewards/chosen": -3.384765625, "rewards/margins": 1.01220703125, "rewards/rejected": -4.396484375, "step": 423 }, { "epoch": 0.54272, "grad_norm": 12.818023117655002, "learning_rate": 3.5939679748352143e-07, "logits/chosen": -0.3614501953125, "logits/rejected": -0.43359375, "logps/chosen": -664.0, "logps/rejected": -786.5, "loss": 0.4274, "rewards/accuracies": 0.828125, "rewards/chosen": -3.3046875, "rewards/margins": 1.12353515625, "rewards/rejected": -4.431640625, "step": 424 }, { "epoch": 0.544, "grad_norm": 12.339337246571736, "learning_rate": 3.578309521049456e-07, "logits/chosen": -0.3673095703125, "logits/rejected": -0.4122314453125, "logps/chosen": -664.0, "logps/rejected": -762.5, "loss": 0.4744, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.564453125, "rewards/margins": 1.170166015625, "rewards/rejected": -4.736328125, "step": 425 }, { "epoch": 0.54528, "grad_norm": 8.658934278329045, "learning_rate": 3.562649498926785e-07, "logits/chosen": -0.2666015625, "logits/rejected": -0.2794189453125, "logps/chosen": -670.0, "logps/rejected": -773.5, "loss": 0.4346, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.388671875, "rewards/margins": 1.10498046875, "rewards/rejected": -4.490234375, "step": 426 }, { "epoch": 0.54656, "grad_norm": 16.97395118117586, "learning_rate": 3.5469882220968913e-07, "logits/chosen": -0.334716796875, "logits/rejected": -0.3743896484375, "logps/chosen": -627.5, "logps/rejected": -758.0, "loss": 0.5011, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.55078125, "rewards/margins": 1.0, "rewards/rejected": -4.5546875, "step": 427 }, { "epoch": 0.54784, "grad_norm": 10.000428326458785, "learning_rate": 3.531326004214592e-07, "logits/chosen": -0.234283447265625, "logits/rejected": -0.29119873046875, "logps/chosen": -709.0, "logps/rejected": -835.5, "loss": 0.4368, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.861328125, "rewards/margins": 1.24755859375, "rewards/rejected": -5.10546875, "step": 428 }, { "epoch": 0.54912, "grad_norm": 17.876551163341734, "learning_rate": 3.5156631589535516e-07, "logits/chosen": -0.305419921875, "logits/rejected": -0.3280029296875, "logps/chosen": -721.5, "logps/rejected": -828.0, "loss": 0.5295, "rewards/accuracies": 0.7421875, "rewards/chosen": -4.10546875, "rewards/margins": 1.10888671875, "rewards/rejected": -5.21875, "step": 429 }, { "epoch": 0.5504, "grad_norm": 22.498342188721203, "learning_rate": 3.5e-07, "logits/chosen": -0.380859375, "logits/rejected": -0.43505859375, "logps/chosen": -739.0, "logps/rejected": -841.0, "loss": 0.533, "rewards/accuracies": 0.71875, "rewards/chosen": -4.107421875, "rewards/margins": 0.91064453125, "rewards/rejected": -5.015625, "step": 430 }, { "epoch": 0.55168, "grad_norm": 12.69840485691294, "learning_rate": 3.484336841046448e-07, "logits/chosen": -0.28363037109375, "logits/rejected": -0.329345703125, "logps/chosen": -718.5, "logps/rejected": -844.0, "loss": 0.4312, "rewards/accuracies": 0.796875, "rewards/chosen": -3.939453125, "rewards/margins": 1.17919921875, "rewards/rejected": -5.11328125, "step": 431 }, { "epoch": 0.55296, "grad_norm": 15.269189564746817, "learning_rate": 3.468673995785409e-07, "logits/chosen": -0.3050537109375, "logits/rejected": -0.2933349609375, "logps/chosen": -757.5, "logps/rejected": -809.5, "loss": 0.5201, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.078125, "rewards/margins": 1.0419921875, "rewards/rejected": -5.1171875, "step": 432 }, { "epoch": 0.55424, "grad_norm": 10.916565840639656, "learning_rate": 3.4530117779031095e-07, "logits/chosen": -0.28759765625, "logits/rejected": -0.30859375, "logps/chosen": -741.0, "logps/rejected": -798.5, "loss": 0.5074, "rewards/accuracies": 0.796875, "rewards/chosen": -3.970703125, "rewards/margins": 0.98486328125, "rewards/rejected": -4.95703125, "step": 433 }, { "epoch": 0.55552, "grad_norm": 8.956894754018172, "learning_rate": 3.4373505010732153e-07, "logits/chosen": -0.336181640625, "logits/rejected": -0.366455078125, "logps/chosen": -745.0, "logps/rejected": -831.0, "loss": 0.4109, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.966796875, "rewards/margins": 1.26806640625, "rewards/rejected": -5.23046875, "step": 434 }, { "epoch": 0.5568, "grad_norm": 10.251791767078773, "learning_rate": 3.4216904789505446e-07, "logits/chosen": -0.2631034851074219, "logits/rejected": -0.291534423828125, "logps/chosen": -701.5, "logps/rejected": -824.0, "loss": 0.4353, "rewards/accuracies": 0.796875, "rewards/chosen": -3.8203125, "rewards/margins": 1.3251953125, "rewards/rejected": -5.14453125, "step": 435 }, { "epoch": 0.55808, "grad_norm": 17.42176040192877, "learning_rate": 3.4060320251647864e-07, "logits/chosen": -0.309478759765625, "logits/rejected": -0.3680419921875, "logps/chosen": -712.5, "logps/rejected": -840.5, "loss": 0.4913, "rewards/accuracies": 0.765625, "rewards/chosen": -3.96484375, "rewards/margins": 1.13525390625, "rewards/rejected": -5.10546875, "step": 436 }, { "epoch": 0.55936, "grad_norm": 8.54035065196297, "learning_rate": 3.3903754533142195e-07, "logits/chosen": -0.256591796875, "logits/rejected": -0.332275390625, "logps/chosen": -719.0, "logps/rejected": -851.0, "loss": 0.4279, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.943359375, "rewards/margins": 1.35791015625, "rewards/rejected": -5.30078125, "step": 437 }, { "epoch": 0.56064, "grad_norm": 8.537378854753193, "learning_rate": 3.3747210769594327e-07, "logits/chosen": -0.25079345703125, "logits/rejected": -0.331298828125, "logps/chosen": -699.0, "logps/rejected": -829.0, "loss": 0.4387, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.8828125, "rewards/margins": 1.21044921875, "rewards/rejected": -5.08984375, "step": 438 }, { "epoch": 0.56192, "grad_norm": 9.144351590625533, "learning_rate": 3.359069209617048e-07, "logits/chosen": -0.314208984375, "logits/rejected": -0.3914337158203125, "logps/chosen": -697.0, "logps/rejected": -799.0, "loss": 0.4947, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.8203125, "rewards/margins": 1.109375, "rewards/rejected": -4.9296875, "step": 439 }, { "epoch": 0.5632, "grad_norm": 14.655918977438441, "learning_rate": 3.3434201647534305e-07, "logits/chosen": -0.3173828125, "logits/rejected": -0.363525390625, "logps/chosen": -686.0, "logps/rejected": -800.5, "loss": 0.4074, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.587890625, "rewards/margins": 1.189453125, "rewards/rejected": -4.76953125, "step": 440 }, { "epoch": 0.56448, "grad_norm": 9.37268098222402, "learning_rate": 3.327774255778426e-07, "logits/chosen": -0.35009765625, "logits/rejected": -0.3543701171875, "logps/chosen": -706.0, "logps/rejected": -837.0, "loss": 0.4304, "rewards/accuracies": 0.796875, "rewards/chosen": -3.71875, "rewards/margins": 1.23095703125, "rewards/rejected": -4.94921875, "step": 441 }, { "epoch": 0.56576, "grad_norm": 8.893240235048026, "learning_rate": 3.312131796039074e-07, "logits/chosen": -0.3359375, "logits/rejected": -0.35595703125, "logps/chosen": -720.0, "logps/rejected": -820.5, "loss": 0.4758, "rewards/accuracies": 0.8125, "rewards/chosen": -3.982421875, "rewards/margins": 1.11669921875, "rewards/rejected": -5.10546875, "step": 442 }, { "epoch": 0.56704, "grad_norm": 9.905678959597498, "learning_rate": 3.2964930988133347e-07, "logits/chosen": -0.31103515625, "logits/rejected": -0.3660888671875, "logps/chosen": -677.5, "logps/rejected": -814.5, "loss": 0.498, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.58984375, "rewards/margins": 1.150390625, "rewards/rejected": -4.73828125, "step": 443 }, { "epoch": 0.56832, "grad_norm": 9.08992784313469, "learning_rate": 3.280858477303813e-07, "logits/chosen": -0.39501953125, "logits/rejected": -0.420166015625, "logps/chosen": -693.0, "logps/rejected": -835.5, "loss": 0.4553, "rewards/accuracies": 0.84375, "rewards/chosen": -3.744140625, "rewards/margins": 1.2099609375, "rewards/rejected": -4.94921875, "step": 444 }, { "epoch": 0.5696, "grad_norm": 11.048886602570247, "learning_rate": 3.265228244631491e-07, "logits/chosen": -0.3572998046875, "logits/rejected": -0.3760986328125, "logps/chosen": -655.0, "logps/rejected": -766.5, "loss": 0.4808, "rewards/accuracies": 0.796875, "rewards/chosen": -3.58203125, "rewards/margins": 0.9876708984375, "rewards/rejected": -4.57421875, "step": 445 }, { "epoch": 0.57088, "grad_norm": 16.49283952772338, "learning_rate": 3.2496027138294534e-07, "logits/chosen": -0.3392333984375, "logits/rejected": -0.38818359375, "logps/chosen": -695.5, "logps/rejected": -786.0, "loss": 0.5221, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.708984375, "rewards/margins": 1.129638671875, "rewards/rejected": -4.84375, "step": 446 }, { "epoch": 0.57216, "grad_norm": 13.865402250549591, "learning_rate": 3.2339821978366143e-07, "logits/chosen": -0.3529052734375, "logits/rejected": -0.415283203125, "logps/chosen": -705.5, "logps/rejected": -801.0, "loss": 0.414, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.7890625, "rewards/margins": 1.2333984375, "rewards/rejected": -5.015625, "step": 447 }, { "epoch": 0.57344, "grad_norm": 9.732935361331597, "learning_rate": 3.218367009491459e-07, "logits/chosen": -0.30615234375, "logits/rejected": -0.3333740234375, "logps/chosen": -690.0, "logps/rejected": -790.5, "loss": 0.5015, "rewards/accuracies": 0.78125, "rewards/chosen": -3.7265625, "rewards/margins": 1.05517578125, "rewards/rejected": -4.78125, "step": 448 }, { "epoch": 0.57472, "grad_norm": 9.154715837042305, "learning_rate": 3.2027574615257724e-07, "logits/chosen": -0.3072509765625, "logits/rejected": -0.3399658203125, "logps/chosen": -722.5, "logps/rejected": -808.0, "loss": 0.4555, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.783203125, "rewards/margins": 1.1298828125, "rewards/rejected": -4.91015625, "step": 449 }, { "epoch": 0.576, "grad_norm": 8.928902784266844, "learning_rate": 3.1871538665583784e-07, "logits/chosen": -0.33447265625, "logits/rejected": -0.3792724609375, "logps/chosen": -701.0, "logps/rejected": -844.5, "loss": 0.439, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.796875, "rewards/margins": 1.2041015625, "rewards/rejected": -5.00390625, "step": 450 }, { "epoch": 0.57728, "grad_norm": 12.00098250688527, "learning_rate": 3.1715565370888724e-07, "logits/chosen": -0.29534912109375, "logits/rejected": -0.3519287109375, "logps/chosen": -685.0, "logps/rejected": -775.0, "loss": 0.5097, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.541015625, "rewards/margins": 1.02294921875, "rewards/rejected": -4.5625, "step": 451 }, { "epoch": 0.57856, "grad_norm": 12.6273655314952, "learning_rate": 3.155965785491375e-07, "logits/chosen": -0.2778053283691406, "logits/rejected": -0.3127593994140625, "logps/chosen": -666.5, "logps/rejected": -733.0, "loss": 0.5567, "rewards/accuracies": 0.6171875, "rewards/chosen": -3.607421875, "rewards/margins": 0.78857421875, "rewards/rejected": -4.392578125, "step": 452 }, { "epoch": 0.57984, "grad_norm": 15.49253693464398, "learning_rate": 3.140381924008266e-07, "logits/chosen": -0.3681640625, "logits/rejected": -0.415771484375, "logps/chosen": -679.5, "logps/rejected": -758.5, "loss": 0.5302, "rewards/accuracies": 0.75, "rewards/chosen": -3.544921875, "rewards/margins": 1.06982421875, "rewards/rejected": -4.61328125, "step": 453 }, { "epoch": 0.58112, "grad_norm": 14.072242757167011, "learning_rate": 3.1248052647439325e-07, "logits/chosen": -0.314361572265625, "logits/rejected": -0.35498046875, "logps/chosen": -683.0, "logps/rejected": -766.5, "loss": 0.4328, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.375, "rewards/margins": 1.099609375, "rewards/rejected": -4.47265625, "step": 454 }, { "epoch": 0.5824, "grad_norm": 8.998262402466967, "learning_rate": 3.109236119658523e-07, "logits/chosen": -0.39013671875, "logits/rejected": -0.46435546875, "logps/chosen": -694.0, "logps/rejected": -787.0, "loss": 0.4334, "rewards/accuracies": 0.828125, "rewards/chosen": -3.5234375, "rewards/margins": 1.29052734375, "rewards/rejected": -4.81640625, "step": 455 }, { "epoch": 0.58368, "grad_norm": 9.654473976654984, "learning_rate": 3.0936748005616934e-07, "logits/chosen": -0.4014892578125, "logits/rejected": -0.4482421875, "logps/chosen": -652.5, "logps/rejected": -773.0, "loss": 0.4763, "rewards/accuracies": 0.734375, "rewards/chosen": -3.390625, "rewards/margins": 1.03076171875, "rewards/rejected": -4.41796875, "step": 456 }, { "epoch": 0.58496, "grad_norm": 9.304597500860096, "learning_rate": 3.07812161910637e-07, "logits/chosen": -0.38525390625, "logits/rejected": -0.466064453125, "logps/chosen": -677.0, "logps/rejected": -786.5, "loss": 0.4882, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.505859375, "rewards/margins": 1.04443359375, "rewards/rejected": -4.55078125, "step": 457 }, { "epoch": 0.58624, "grad_norm": 10.03299414048815, "learning_rate": 3.062576886782496e-07, "logits/chosen": -0.33111572265625, "logits/rejected": -0.343994140625, "logps/chosen": -669.0, "logps/rejected": -749.0, "loss": 0.4664, "rewards/accuracies": 0.84375, "rewards/chosen": -3.41796875, "rewards/margins": 1.0966796875, "rewards/rejected": -4.51953125, "step": 458 }, { "epoch": 0.58752, "grad_norm": 8.481068595485672, "learning_rate": 3.0470409149108057e-07, "logits/chosen": -0.376220703125, "logits/rejected": -0.4453125, "logps/chosen": -693.5, "logps/rejected": -784.5, "loss": 0.4547, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.419921875, "rewards/margins": 1.06201171875, "rewards/rejected": -4.478515625, "step": 459 }, { "epoch": 0.5888, "grad_norm": 12.321469175204998, "learning_rate": 3.0315140146365854e-07, "logits/chosen": -0.338134765625, "logits/rejected": -0.415283203125, "logps/chosen": -632.5, "logps/rejected": -762.0, "loss": 0.4179, "rewards/accuracies": 0.828125, "rewards/chosen": -3.32421875, "rewards/margins": 1.26171875, "rewards/rejected": -4.5859375, "step": 460 }, { "epoch": 0.59008, "grad_norm": 10.17822938822944, "learning_rate": 3.0159964969234345e-07, "logits/chosen": -0.31658935546875, "logits/rejected": -0.399658203125, "logps/chosen": -660.0, "logps/rejected": -803.5, "loss": 0.4285, "rewards/accuracies": 0.828125, "rewards/chosen": -3.5390625, "rewards/margins": 1.218505859375, "rewards/rejected": -4.75390625, "step": 461 }, { "epoch": 0.59136, "grad_norm": 7.7993467189482075, "learning_rate": 3.00048867254705e-07, "logits/chosen": -0.43994140625, "logits/rejected": -0.50244140625, "logps/chosen": -688.0, "logps/rejected": -806.0, "loss": 0.4446, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.525390625, "rewards/margins": 1.19580078125, "rewards/rejected": -4.71875, "step": 462 }, { "epoch": 0.59264, "grad_norm": 12.021053759681038, "learning_rate": 2.9849908520889934e-07, "logits/chosen": -0.35205078125, "logits/rejected": -0.4013671875, "logps/chosen": -703.5, "logps/rejected": -813.0, "loss": 0.4805, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.78515625, "rewards/margins": 1.2294921875, "rewards/rejected": -5.015625, "step": 463 }, { "epoch": 0.59392, "grad_norm": 9.606440908673006, "learning_rate": 2.9695033459304765e-07, "logits/chosen": -0.3104248046875, "logits/rejected": -0.3370361328125, "logps/chosen": -719.5, "logps/rejected": -809.5, "loss": 0.4665, "rewards/accuracies": 0.75, "rewards/chosen": -3.779296875, "rewards/margins": 1.021728515625, "rewards/rejected": -4.80078125, "step": 464 }, { "epoch": 0.5952, "grad_norm": 9.429466706631539, "learning_rate": 2.954026464246138e-07, "logits/chosen": -0.308349609375, "logits/rejected": -0.3970947265625, "logps/chosen": -685.0, "logps/rejected": -803.5, "loss": 0.4333, "rewards/accuracies": 0.765625, "rewards/chosen": -3.5859375, "rewards/margins": 1.34521484375, "rewards/rejected": -4.93359375, "step": 465 }, { "epoch": 0.59648, "grad_norm": 8.733380658733111, "learning_rate": 2.938560516997839e-07, "logits/chosen": -0.3392333984375, "logits/rejected": -0.385986328125, "logps/chosen": -679.0, "logps/rejected": -788.0, "loss": 0.4456, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.580078125, "rewards/margins": 1.158203125, "rewards/rejected": -4.732421875, "step": 466 }, { "epoch": 0.59776, "grad_norm": 11.051837804187693, "learning_rate": 2.923105813928453e-07, "logits/chosen": -0.35302734375, "logits/rejected": -0.400390625, "logps/chosen": -681.5, "logps/rejected": -801.0, "loss": 0.4298, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.771484375, "rewards/margins": 1.23046875, "rewards/rejected": -5.00390625, "step": 467 }, { "epoch": 0.59904, "grad_norm": 10.90589653843686, "learning_rate": 2.907662664555658e-07, "logits/chosen": -0.24847412109375, "logits/rejected": -0.321044921875, "logps/chosen": -716.5, "logps/rejected": -895.5, "loss": 0.4646, "rewards/accuracies": 0.796875, "rewards/chosen": -4.005859375, "rewards/margins": 1.11767578125, "rewards/rejected": -5.125, "step": 468 }, { "epoch": 0.60032, "grad_norm": 11.37286435597706, "learning_rate": 2.8922313781657437e-07, "logits/chosen": -0.258544921875, "logits/rejected": -0.29833984375, "logps/chosen": -707.0, "logps/rejected": -853.0, "loss": 0.4346, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.875, "rewards/margins": 1.4248046875, "rewards/rejected": -5.302734375, "step": 469 }, { "epoch": 0.6016, "grad_norm": 12.238759131932019, "learning_rate": 2.876812263807417e-07, "logits/chosen": -0.31640625, "logits/rejected": -0.36279296875, "logps/chosen": -757.0, "logps/rejected": -893.5, "loss": 0.3726, "rewards/accuracies": 0.828125, "rewards/chosen": -4.361328125, "rewards/margins": 1.45849609375, "rewards/rejected": -5.8125, "step": 470 }, { "epoch": 0.60288, "grad_norm": 14.41231269695093, "learning_rate": 2.861405630285606e-07, "logits/chosen": -0.203125, "logits/rejected": -0.233642578125, "logps/chosen": -747.0, "logps/rejected": -901.5, "loss": 0.3898, "rewards/accuracies": 0.8515625, "rewards/chosen": -3.962890625, "rewards/margins": 1.37841796875, "rewards/rejected": -5.33984375, "step": 471 }, { "epoch": 0.60416, "grad_norm": 10.17467437176661, "learning_rate": 2.8460117861552834e-07, "logits/chosen": -0.1627197265625, "logits/rejected": -0.205810546875, "logps/chosen": -750.5, "logps/rejected": -889.5, "loss": 0.4308, "rewards/accuracies": 0.84375, "rewards/chosen": -4.59375, "rewards/margins": 1.18017578125, "rewards/rejected": -5.77734375, "step": 472 }, { "epoch": 0.60544, "grad_norm": 18.928934732354712, "learning_rate": 2.8306310397152813e-07, "logits/chosen": -0.1800537109375, "logits/rejected": -0.21759796142578125, "logps/chosen": -792.5, "logps/rejected": -936.0, "loss": 0.4427, "rewards/accuracies": 0.8125, "rewards/chosen": -4.73828125, "rewards/margins": 1.591796875, "rewards/rejected": -6.33203125, "step": 473 }, { "epoch": 0.60672, "grad_norm": 22.570954000361684, "learning_rate": 2.815263699002124e-07, "logits/chosen": -0.104888916015625, "logits/rejected": -0.14886474609375, "logps/chosen": -865.0, "logps/rejected": -989.0, "loss": 0.4815, "rewards/accuracies": 0.75, "rewards/chosen": -5.34765625, "rewards/margins": 1.2919921875, "rewards/rejected": -6.640625, "step": 474 }, { "epoch": 0.608, "grad_norm": 11.072003778958646, "learning_rate": 2.799910071783845e-07, "logits/chosen": -0.0941619873046875, "logits/rejected": -0.12345123291015625, "logps/chosen": -875.0, "logps/rejected": -1017.5, "loss": 0.4723, "rewards/accuracies": 0.8046875, "rewards/chosen": -5.3984375, "rewards/margins": 1.45703125, "rewards/rejected": -6.8515625, "step": 475 }, { "epoch": 0.60928, "grad_norm": 22.291924161435414, "learning_rate": 2.7845704655538383e-07, "logits/chosen": -0.145660400390625, "logits/rejected": -0.185394287109375, "logps/chosen": -858.5, "logps/rejected": -1001.5, "loss": 0.5071, "rewards/accuracies": 0.75, "rewards/chosen": -5.328125, "rewards/margins": 1.337890625, "rewards/rejected": -6.66796875, "step": 476 }, { "epoch": 0.61056, "grad_norm": 49.81448523508282, "learning_rate": 2.7692451875246955e-07, "logits/chosen": -0.1240692138671875, "logits/rejected": -0.1556243896484375, "logps/chosen": -825.5, "logps/rejected": -936.0, "loss": 0.637, "rewards/accuracies": 0.703125, "rewards/chosen": -5.234375, "rewards/margins": 1.135498046875, "rewards/rejected": -6.375, "step": 477 }, { "epoch": 0.61184, "grad_norm": 13.559544047897164, "learning_rate": 2.753934544622044e-07, "logits/chosen": -0.0382232666015625, "logits/rejected": -0.0726776123046875, "logps/chosen": -860.0, "logps/rejected": -959.5, "loss": 0.5071, "rewards/accuracies": 0.7265625, "rewards/chosen": -5.5390625, "rewards/margins": 1.2286376953125, "rewards/rejected": -6.765625, "step": 478 }, { "epoch": 0.61312, "grad_norm": 44.498812483306665, "learning_rate": 2.7386388434784144e-07, "logits/chosen": -0.1600341796875, "logits/rejected": -0.229248046875, "logps/chosen": -800.0, "logps/rejected": -891.0, "loss": 0.541, "rewards/accuracies": 0.75, "rewards/chosen": -4.859375, "rewards/margins": 1.0419921875, "rewards/rejected": -5.90234375, "step": 479 }, { "epoch": 0.6144, "grad_norm": 10.090939267052912, "learning_rate": 2.723358390427089e-07, "logits/chosen": -0.07647705078125, "logits/rejected": -0.1278076171875, "logps/chosen": -830.0, "logps/rejected": -963.0, "loss": 0.3957, "rewards/accuracies": 0.8046875, "rewards/chosen": -5.05078125, "rewards/margins": 1.451171875, "rewards/rejected": -6.49609375, "step": 480 }, { "epoch": 0.61568, "grad_norm": 36.24985599924041, "learning_rate": 2.708093491495973e-07, "logits/chosen": -0.191162109375, "logits/rejected": -0.2340087890625, "logps/chosen": -795.0, "logps/rejected": -943.5, "loss": 0.5742, "rewards/accuracies": 0.71875, "rewards/chosen": -4.89453125, "rewards/margins": 1.2020416259765625, "rewards/rejected": -6.09375, "step": 481 }, { "epoch": 0.61696, "grad_norm": 9.559993471114256, "learning_rate": 2.6928444524014593e-07, "logits/chosen": -0.0987396240234375, "logits/rejected": -0.13189697265625, "logps/chosen": -821.5, "logps/rejected": -962.0, "loss": 0.4207, "rewards/accuracies": 0.8046875, "rewards/chosen": -4.91015625, "rewards/margins": 1.5185546875, "rewards/rejected": -6.4296875, "step": 482 }, { "epoch": 0.61824, "grad_norm": 21.064286757896273, "learning_rate": 2.677611578542312e-07, "logits/chosen": -0.180572509765625, "logits/rejected": -0.177886962890625, "logps/chosen": -761.5, "logps/rejected": -863.5, "loss": 0.518, "rewards/accuracies": 0.75, "rewards/chosen": -4.3125, "rewards/margins": 1.28662109375, "rewards/rejected": -5.59765625, "step": 483 }, { "epoch": 0.61952, "grad_norm": 16.47721480533661, "learning_rate": 2.6623951749935486e-07, "logits/chosen": -0.19525146484375, "logits/rejected": -0.232696533203125, "logps/chosen": -750.5, "logps/rejected": -880.0, "loss": 0.5357, "rewards/accuracies": 0.7109375, "rewards/chosen": -4.375, "rewards/margins": 1.01416015625, "rewards/rejected": -5.39453125, "step": 484 }, { "epoch": 0.6208, "grad_norm": 14.729444247013049, "learning_rate": 2.6471955465003233e-07, "logits/chosen": -0.146209716796875, "logits/rejected": -0.181396484375, "logps/chosen": -727.0, "logps/rejected": -855.5, "loss": 0.4494, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.181640625, "rewards/margins": 1.361328125, "rewards/rejected": -5.54296875, "step": 485 }, { "epoch": 0.62208, "grad_norm": 17.822317812146366, "learning_rate": 2.6320129974718357e-07, "logits/chosen": -0.2357177734375, "logits/rejected": -0.266845703125, "logps/chosen": -755.5, "logps/rejected": -898.5, "loss": 0.3869, "rewards/accuracies": 0.8359375, "rewards/chosen": -4.2265625, "rewards/margins": 1.5546875, "rewards/rejected": -5.78125, "step": 486 }, { "epoch": 0.62336, "grad_norm": 18.035398000203894, "learning_rate": 2.6168478319752235e-07, "logits/chosen": -0.25274658203125, "logits/rejected": -0.2952880859375, "logps/chosen": -715.5, "logps/rejected": -808.5, "loss": 0.53, "rewards/accuracies": 0.765625, "rewards/chosen": -3.82421875, "rewards/margins": 0.89306640625, "rewards/rejected": -4.7109375, "step": 487 }, { "epoch": 0.62464, "grad_norm": 11.523996728652335, "learning_rate": 2.6017003537294813e-07, "logits/chosen": -0.28271484375, "logits/rejected": -0.321533203125, "logps/chosen": -650.0, "logps/rejected": -767.5, "loss": 0.4265, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.513671875, "rewards/margins": 1.33837890625, "rewards/rejected": -4.85546875, "step": 488 }, { "epoch": 0.62592, "grad_norm": 9.3216241550967, "learning_rate": 2.58657086609937e-07, "logits/chosen": -0.29815673828125, "logits/rejected": -0.323974609375, "logps/chosen": -692.25, "logps/rejected": -793.5, "loss": 0.4857, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.697265625, "rewards/margins": 0.97802734375, "rewards/rejected": -4.673828125, "step": 489 }, { "epoch": 0.6272, "grad_norm": 9.12204855358185, "learning_rate": 2.5714596720893474e-07, "logits/chosen": -0.1910400390625, "logits/rejected": -0.24755859375, "logps/chosen": -638.0, "logps/rejected": -753.0, "loss": 0.49, "rewards/accuracies": 0.765625, "rewards/chosen": -3.484375, "rewards/margins": 1.13525390625, "rewards/rejected": -4.62109375, "step": 490 }, { "epoch": 0.62848, "grad_norm": 9.031432208381775, "learning_rate": 2.5563670743374974e-07, "logits/chosen": -0.311279296875, "logits/rejected": -0.331787109375, "logps/chosen": -661.0, "logps/rejected": -733.5, "loss": 0.4686, "rewards/accuracies": 0.828125, "rewards/chosen": -3.4296875, "rewards/margins": 1.078125, "rewards/rejected": -4.509765625, "step": 491 }, { "epoch": 0.62976, "grad_norm": 11.835986491275541, "learning_rate": 2.541293375109466e-07, "logits/chosen": -0.323974609375, "logits/rejected": -0.363525390625, "logps/chosen": -632.5, "logps/rejected": -759.0, "loss": 0.4363, "rewards/accuracies": 0.8125, "rewards/chosen": -3.412109375, "rewards/margins": 1.064453125, "rewards/rejected": -4.4765625, "step": 492 }, { "epoch": 0.63104, "grad_norm": 8.263522495741764, "learning_rate": 2.5262388762924157e-07, "logits/chosen": -0.341064453125, "logits/rejected": -0.380859375, "logps/chosen": -661.5, "logps/rejected": -789.5, "loss": 0.401, "rewards/accuracies": 0.8125, "rewards/chosen": -3.357421875, "rewards/margins": 1.31689453125, "rewards/rejected": -4.671875, "step": 493 }, { "epoch": 0.63232, "grad_norm": 16.717553631249775, "learning_rate": 2.511203879388971e-07, "logits/chosen": -0.28839111328125, "logits/rejected": -0.35125732421875, "logps/chosen": -655.5, "logps/rejected": -778.0, "loss": 0.399, "rewards/accuracies": 0.828125, "rewards/chosen": -3.291015625, "rewards/margins": 1.16796875, "rewards/rejected": -4.45703125, "step": 494 }, { "epoch": 0.6336, "grad_norm": 9.39340513518324, "learning_rate": 2.496188685511185e-07, "logits/chosen": -0.27813720703125, "logits/rejected": -0.288360595703125, "logps/chosen": -666.5, "logps/rejected": -738.0, "loss": 0.4937, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.365234375, "rewards/margins": 0.90673828125, "rewards/rejected": -4.26953125, "step": 495 }, { "epoch": 0.63488, "grad_norm": 8.21962622555432, "learning_rate": 2.481193595374505e-07, "logits/chosen": -0.22674560546875, "logits/rejected": -0.27642822265625, "logps/chosen": -603.75, "logps/rejected": -724.0, "loss": 0.478, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.115234375, "rewards/margins": 1.0169677734375, "rewards/rejected": -4.12890625, "step": 496 }, { "epoch": 0.63616, "grad_norm": 10.565279172163, "learning_rate": 2.466218909291756e-07, "logits/chosen": -0.32861328125, "logits/rejected": -0.385009765625, "logps/chosen": -659.5, "logps/rejected": -752.5, "loss": 0.494, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.30078125, "rewards/margins": 1.0400390625, "rewards/rejected": -4.337890625, "step": 497 }, { "epoch": 0.63744, "grad_norm": 14.800631609015028, "learning_rate": 2.451264927167121e-07, "logits/chosen": -0.378173828125, "logits/rejected": -0.388671875, "logps/chosen": -676.5, "logps/rejected": -738.5, "loss": 0.5109, "rewards/accuracies": 0.78125, "rewards/chosen": -3.45703125, "rewards/margins": 0.9403076171875, "rewards/rejected": -4.396484375, "step": 498 }, { "epoch": 0.63872, "grad_norm": 14.511150427574877, "learning_rate": 2.436331948490136e-07, "logits/chosen": -0.3680419921875, "logits/rejected": -0.44677734375, "logps/chosen": -638.5, "logps/rejected": -751.0, "loss": 0.5025, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.4140625, "rewards/margins": 1.0439453125, "rewards/rejected": -4.453125, "step": 499 }, { "epoch": 0.64, "grad_norm": 9.963559206561316, "learning_rate": 2.4214202723296923e-07, "logits/chosen": -0.3682861328125, "logits/rejected": -0.392578125, "logps/chosen": -666.5, "logps/rejected": -715.0, "loss": 0.4913, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.439453125, "rewards/margins": 0.968505859375, "rewards/rejected": -4.41015625, "step": 500 }, { "epoch": 0.64, "eval_logits/chosen": -0.30975341796875, "eval_logits/rejected": -0.39111328125, "eval_logps/chosen": -649.75, "eval_logps/rejected": -732.0, "eval_loss": 0.4821406304836273, "eval_rewards/accuracies": 0.737500011920929, "eval_rewards/chosen": -3.2939453125, "eval_rewards/margins": 1.017822265625, "eval_rewards/rejected": -4.3095703125, "eval_runtime": 27.6376, "eval_samples_per_second": 18.091, "eval_steps_per_second": 0.579, "step": 500 }, { "epoch": 0.64128, "grad_norm": 9.125881510784152, "learning_rate": 2.4065301973280486e-07, "logits/chosen": -0.3564453125, "logits/rejected": -0.4180908203125, "logps/chosen": -678.5, "logps/rejected": -829.5, "loss": 0.4451, "rewards/accuracies": 0.8125, "rewards/chosen": -3.44921875, "rewards/margins": 1.13232421875, "rewards/rejected": -4.58203125, "step": 501 }, { "epoch": 0.64256, "grad_norm": 17.366268825982257, "learning_rate": 2.391662021694847e-07, "logits/chosen": -0.3870849609375, "logits/rejected": -0.4088134765625, "logps/chosen": -630.5, "logps/rejected": -714.0, "loss": 0.5572, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.23828125, "rewards/margins": 0.932861328125, "rewards/rejected": -4.16796875, "step": 502 }, { "epoch": 0.64384, "grad_norm": 15.66439643538157, "learning_rate": 2.3768160432011394e-07, "logits/chosen": -0.332275390625, "logits/rejected": -0.3670654296875, "logps/chosen": -644.5, "logps/rejected": -734.5, "loss": 0.4228, "rewards/accuracies": 0.828125, "rewards/chosen": -2.994140625, "rewards/margins": 1.0615234375, "rewards/rejected": -4.0546875, "step": 503 }, { "epoch": 0.64512, "grad_norm": 13.777723328035924, "learning_rate": 2.361992559173432e-07, "logits/chosen": -0.399169921875, "logits/rejected": -0.425537109375, "logps/chosen": -669.5, "logps/rejected": -773.5, "loss": 0.4306, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2421875, "rewards/margins": 1.294921875, "rewards/rejected": -4.53515625, "step": 504 }, { "epoch": 0.6464, "grad_norm": 9.719276284192894, "learning_rate": 2.3471918664877214e-07, "logits/chosen": -0.35498046875, "logits/rejected": -0.3848876953125, "logps/chosen": -668.5, "logps/rejected": -758.5, "loss": 0.4765, "rewards/accuracies": 0.78125, "rewards/chosen": -3.39453125, "rewards/margins": 1.04248046875, "rewards/rejected": -4.439453125, "step": 505 }, { "epoch": 0.64768, "grad_norm": 16.89908454302761, "learning_rate": 2.3324142615635527e-07, "logits/chosen": -0.355712890625, "logits/rejected": -0.3896484375, "logps/chosen": -676.0, "logps/rejected": -796.0, "loss": 0.3675, "rewards/accuracies": 0.8515625, "rewards/chosen": -3.345703125, "rewards/margins": 1.384765625, "rewards/rejected": -4.73046875, "step": 506 }, { "epoch": 0.64896, "grad_norm": 9.037083694795681, "learning_rate": 2.317660040358085e-07, "logits/chosen": -0.2799072265625, "logits/rejected": -0.35302734375, "logps/chosen": -653.5, "logps/rejected": -813.5, "loss": 0.4096, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.427734375, "rewards/margins": 1.4033203125, "rewards/rejected": -4.83203125, "step": 507 }, { "epoch": 0.65024, "grad_norm": 13.711223830769715, "learning_rate": 2.3029294983601597e-07, "logits/chosen": -0.3631591796875, "logits/rejected": -0.3857421875, "logps/chosen": -639.0, "logps/rejected": -715.0, "loss": 0.5367, "rewards/accuracies": 0.7109375, "rewards/chosen": -3.181640625, "rewards/margins": 0.947265625, "rewards/rejected": -4.130859375, "step": 508 }, { "epoch": 0.65152, "grad_norm": 15.850008639937514, "learning_rate": 2.2882229305843867e-07, "logits/chosen": -0.31689453125, "logits/rejected": -0.3739013671875, "logps/chosen": -603.0, "logps/rejected": -715.0, "loss": 0.4067, "rewards/accuracies": 0.8671875, "rewards/chosen": -3.0390625, "rewards/margins": 1.18798828125, "rewards/rejected": -4.22265625, "step": 509 }, { "epoch": 0.6528, "grad_norm": 13.27611006512659, "learning_rate": 2.2735406315652325e-07, "logits/chosen": -0.3470458984375, "logits/rejected": -0.40087890625, "logps/chosen": -654.5, "logps/rejected": -778.5, "loss": 0.4238, "rewards/accuracies": 0.8515625, "rewards/chosen": -3.30078125, "rewards/margins": 1.177734375, "rewards/rejected": -4.47265625, "step": 510 }, { "epoch": 0.65408, "grad_norm": 9.557828693226199, "learning_rate": 2.2588828953511252e-07, "logits/chosen": -0.250823974609375, "logits/rejected": -0.336669921875, "logps/chosen": -663.0, "logps/rejected": -774.0, "loss": 0.513, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.5625, "rewards/margins": 1.05712890625, "rewards/rejected": -4.625, "step": 511 }, { "epoch": 0.65536, "grad_norm": 15.537086503024735, "learning_rate": 2.2442500154985642e-07, "logits/chosen": -0.236083984375, "logits/rejected": -0.26043701171875, "logps/chosen": -640.5, "logps/rejected": -765.5, "loss": 0.5179, "rewards/accuracies": 0.765625, "rewards/chosen": -3.447265625, "rewards/margins": 1.009765625, "rewards/rejected": -4.458984375, "step": 512 }, { "epoch": 0.65664, "grad_norm": 8.84797363334171, "learning_rate": 2.229642285066236e-07, "logits/chosen": -0.33831787109375, "logits/rejected": -0.3756103515625, "logps/chosen": -658.0, "logps/rejected": -813.0, "loss": 0.3957, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5234375, "rewards/margins": 1.4072265625, "rewards/rejected": -4.931640625, "step": 513 }, { "epoch": 0.65792, "grad_norm": 8.898032465683093, "learning_rate": 2.2150599966091535e-07, "logits/chosen": -0.23968505859375, "logits/rejected": -0.240509033203125, "logps/chosen": -670.0, "logps/rejected": -768.0, "loss": 0.4581, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.576171875, "rewards/margins": 1.08349609375, "rewards/rejected": -4.66015625, "step": 514 }, { "epoch": 0.6592, "grad_norm": 9.301895530550201, "learning_rate": 2.200503442172792e-07, "logits/chosen": -0.2496337890625, "logits/rejected": -0.316650390625, "logps/chosen": -694.5, "logps/rejected": -817.0, "loss": 0.4339, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.78515625, "rewards/margins": 1.220703125, "rewards/rejected": -5.0, "step": 515 }, { "epoch": 0.66048, "grad_norm": 9.315272518266339, "learning_rate": 2.1859729132872407e-07, "logits/chosen": -0.2623291015625, "logits/rejected": -0.2822265625, "logps/chosen": -610.25, "logps/rejected": -698.0, "loss": 0.5085, "rewards/accuracies": 0.8125, "rewards/chosen": -3.298828125, "rewards/margins": 1.0263671875, "rewards/rejected": -4.326171875, "step": 516 }, { "epoch": 0.66176, "grad_norm": 8.846499820634536, "learning_rate": 2.171468700961363e-07, "logits/chosen": -0.243621826171875, "logits/rejected": -0.24200439453125, "logps/chosen": -702.5, "logps/rejected": -804.0, "loss": 0.4638, "rewards/accuracies": 0.75, "rewards/chosen": -3.7265625, "rewards/margins": 1.20751953125, "rewards/rejected": -4.931640625, "step": 517 }, { "epoch": 0.66304, "grad_norm": 19.450404147912273, "learning_rate": 2.1569910956769707e-07, "logits/chosen": -0.264404296875, "logits/rejected": -0.3031005859375, "logps/chosen": -731.0, "logps/rejected": -857.0, "loss": 0.3519, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.779296875, "rewards/margins": 1.4775390625, "rewards/rejected": -5.25390625, "step": 518 }, { "epoch": 0.66432, "grad_norm": 11.9383597079162, "learning_rate": 2.1425403873830082e-07, "logits/chosen": -0.267974853515625, "logits/rejected": -0.300872802734375, "logps/chosen": -758.5, "logps/rejected": -859.5, "loss": 0.4013, "rewards/accuracies": 0.8046875, "rewards/chosen": -4.015625, "rewards/margins": 1.35791015625, "rewards/rejected": -5.37109375, "step": 519 }, { "epoch": 0.6656, "grad_norm": 9.054749925855036, "learning_rate": 2.1281168654897377e-07, "logits/chosen": -0.24322509765625, "logits/rejected": -0.27294921875, "logps/chosen": -732.0, "logps/rejected": -846.0, "loss": 0.4282, "rewards/accuracies": 0.796875, "rewards/chosen": -4.001953125, "rewards/margins": 1.291015625, "rewards/rejected": -5.29296875, "step": 520 }, { "epoch": 0.66688, "grad_norm": 17.341035886669818, "learning_rate": 2.113720818862951e-07, "logits/chosen": -0.206298828125, "logits/rejected": -0.20355224609375, "logps/chosen": -724.0, "logps/rejected": -787.0, "loss": 0.5664, "rewards/accuracies": 0.7109375, "rewards/chosen": -3.859375, "rewards/margins": 0.95849609375, "rewards/rejected": -4.81640625, "step": 521 }, { "epoch": 0.66816, "grad_norm": 9.905351905849056, "learning_rate": 2.0993525358181822e-07, "logits/chosen": -0.275146484375, "logits/rejected": -0.3148193359375, "logps/chosen": -760.0, "logps/rejected": -883.0, "loss": 0.4874, "rewards/accuracies": 0.796875, "rewards/chosen": -4.158203125, "rewards/margins": 1.284912109375, "rewards/rejected": -5.453125, "step": 522 }, { "epoch": 0.66944, "grad_norm": 11.092239401478107, "learning_rate": 2.085012304114933e-07, "logits/chosen": -0.150665283203125, "logits/rejected": -0.187652587890625, "logps/chosen": -728.5, "logps/rejected": -839.0, "loss": 0.4484, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.9375, "rewards/margins": 1.2138671875, "rewards/rejected": -5.14453125, "step": 523 }, { "epoch": 0.67072, "grad_norm": 15.689220713255043, "learning_rate": 2.0707004109509057e-07, "logits/chosen": -0.125213623046875, "logits/rejected": -0.13275146484375, "logps/chosen": -734.5, "logps/rejected": -794.0, "loss": 0.5323, "rewards/accuracies": 0.71875, "rewards/chosen": -4.185546875, "rewards/margins": 1.04541015625, "rewards/rejected": -5.23828125, "step": 524 }, { "epoch": 0.672, "grad_norm": 22.019953630911694, "learning_rate": 2.0564171429562586e-07, "logits/chosen": -0.18068695068359375, "logits/rejected": -0.2034912109375, "logps/chosen": -683.0, "logps/rejected": -821.5, "loss": 0.3464, "rewards/accuracies": 0.8828125, "rewards/chosen": -3.689453125, "rewards/margins": 1.6474609375, "rewards/rejected": -5.328125, "step": 525 }, { "epoch": 0.67328, "grad_norm": 9.269392815729315, "learning_rate": 2.042162786187862e-07, "logits/chosen": -0.1710357666015625, "logits/rejected": -0.20377349853515625, "logps/chosen": -755.5, "logps/rejected": -875.5, "loss": 0.4172, "rewards/accuracies": 0.828125, "rewards/chosen": -4.275390625, "rewards/margins": 1.36376953125, "rewards/rejected": -5.63671875, "step": 526 }, { "epoch": 0.67456, "grad_norm": 18.30397649803818, "learning_rate": 2.027937626123565e-07, "logits/chosen": -0.2088623046875, "logits/rejected": -0.22777557373046875, "logps/chosen": -816.0, "logps/rejected": -960.0, "loss": 0.4332, "rewards/accuracies": 0.7734375, "rewards/chosen": -4.57421875, "rewards/margins": 1.33251953125, "rewards/rejected": -5.91015625, "step": 527 }, { "epoch": 0.67584, "grad_norm": 10.323921457948874, "learning_rate": 2.0137419476564897e-07, "logits/chosen": -0.12468719482421875, "logits/rejected": -0.16400146484375, "logps/chosen": -781.0, "logps/rejected": -894.0, "loss": 0.4222, "rewards/accuracies": 0.765625, "rewards/chosen": -4.625, "rewards/margins": 1.38525390625, "rewards/rejected": -6.00390625, "step": 528 }, { "epoch": 0.67712, "grad_norm": 19.985890606151916, "learning_rate": 1.9995760350893097e-07, "logits/chosen": -0.15547943115234375, "logits/rejected": -0.218109130859375, "logps/chosen": -795.5, "logps/rejected": -912.0, "loss": 0.4694, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.646484375, "rewards/margins": 1.234375, "rewards/rejected": -5.88671875, "step": 529 }, { "epoch": 0.6784, "grad_norm": 20.81916495287454, "learning_rate": 1.985440172128573e-07, "logits/chosen": -0.15631103515625, "logits/rejected": -0.201385498046875, "logps/chosen": -786.0, "logps/rejected": -920.5, "loss": 0.3556, "rewards/accuracies": 0.875, "rewards/chosen": -4.556640625, "rewards/margins": 1.4970703125, "rewards/rejected": -6.05859375, "step": 530 }, { "epoch": 0.67968, "grad_norm": 11.921218446157969, "learning_rate": 1.9713346418790056e-07, "logits/chosen": -0.126007080078125, "logits/rejected": -0.188629150390625, "logps/chosen": -756.5, "logps/rejected": -899.0, "loss": 0.4957, "rewards/accuracies": 0.75, "rewards/chosen": -4.63671875, "rewards/margins": 1.23779296875, "rewards/rejected": -5.87109375, "step": 531 }, { "epoch": 0.68096, "grad_norm": 11.545371418939542, "learning_rate": 1.957259726837849e-07, "logits/chosen": -0.177947998046875, "logits/rejected": -0.2209320068359375, "logps/chosen": -775.0, "logps/rejected": -927.0, "loss": 0.4769, "rewards/accuracies": 0.7734375, "rewards/chosen": -4.494140625, "rewards/margins": 1.47119140625, "rewards/rejected": -5.97265625, "step": 532 }, { "epoch": 0.68224, "grad_norm": 19.85526612585022, "learning_rate": 1.9432157088892065e-07, "logits/chosen": -0.1734619140625, "logits/rejected": -0.236083984375, "logps/chosen": -811.5, "logps/rejected": -915.0, "loss": 0.4809, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.828125, "rewards/margins": 1.300048828125, "rewards/rejected": -6.1328125, "step": 533 }, { "epoch": 0.68352, "grad_norm": 35.982005117679044, "learning_rate": 1.9292028692983824e-07, "logits/chosen": -0.162567138671875, "logits/rejected": -0.21380615234375, "logps/chosen": -779.0, "logps/rejected": -879.5, "loss": 0.5361, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.6953125, "rewards/margins": 1.1611328125, "rewards/rejected": -5.8515625, "step": 534 }, { "epoch": 0.6848, "grad_norm": 9.401343460064473, "learning_rate": 1.9152214887062702e-07, "logits/chosen": -0.18548583984375, "logits/rejected": -0.222076416015625, "logps/chosen": -784.0, "logps/rejected": -911.5, "loss": 0.4402, "rewards/accuracies": 0.8046875, "rewards/chosen": -4.724609375, "rewards/margins": 1.4794921875, "rewards/rejected": -6.19921875, "step": 535 }, { "epoch": 0.68608, "grad_norm": 19.636202317515714, "learning_rate": 1.9012718471237144e-07, "logits/chosen": -0.20050048828125, "logits/rejected": -0.25152587890625, "logps/chosen": -883.5, "logps/rejected": -995.0, "loss": 0.4611, "rewards/accuracies": 0.7734375, "rewards/chosen": -5.19140625, "rewards/margins": 1.3603515625, "rewards/rejected": -6.55078125, "step": 536 }, { "epoch": 0.68736, "grad_norm": 9.609636299261897, "learning_rate": 1.8873542239259109e-07, "logits/chosen": -0.13946533203125, "logits/rejected": -0.18389892578125, "logps/chosen": -783.5, "logps/rejected": -957.0, "loss": 0.4048, "rewards/accuracies": 0.7734375, "rewards/chosen": -4.6484375, "rewards/margins": 1.47802734375, "rewards/rejected": -6.125, "step": 537 }, { "epoch": 0.68864, "grad_norm": 28.14830330305055, "learning_rate": 1.8734688978468098e-07, "logits/chosen": -0.23590087890625, "logits/rejected": -0.26971435546875, "logps/chosen": -811.0, "logps/rejected": -912.0, "loss": 0.5124, "rewards/accuracies": 0.7734375, "rewards/chosen": -4.46875, "rewards/margins": 1.19091796875, "rewards/rejected": -5.66015625, "step": 538 }, { "epoch": 0.68992, "grad_norm": 17.73425969488332, "learning_rate": 1.8596161469735374e-07, "logits/chosen": -0.2036590576171875, "logits/rejected": -0.2774658203125, "logps/chosen": -808.0, "logps/rejected": -948.0, "loss": 0.4463, "rewards/accuracies": 0.8125, "rewards/chosen": -4.76953125, "rewards/margins": 1.41064453125, "rewards/rejected": -6.17578125, "step": 539 }, { "epoch": 0.6912, "grad_norm": 25.111899072723798, "learning_rate": 1.8457962487408174e-07, "logits/chosen": -0.12348175048828125, "logits/rejected": -0.16607093811035156, "logps/chosen": -771.0, "logps/rejected": -857.0, "loss": 0.5007, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.49609375, "rewards/margins": 1.124267578125, "rewards/rejected": -5.6171875, "step": 540 }, { "epoch": 0.69248, "grad_norm": 12.189044823657042, "learning_rate": 1.8320094799254222e-07, "logits/chosen": -0.20697021484375, "logits/rejected": -0.2476806640625, "logps/chosen": -793.0, "logps/rejected": -945.5, "loss": 0.4448, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.625, "rewards/margins": 1.31787109375, "rewards/rejected": -5.9453125, "step": 541 }, { "epoch": 0.69376, "grad_norm": 19.576872251438612, "learning_rate": 1.8182561166406308e-07, "logits/chosen": -0.22357177734375, "logits/rejected": -0.2642822265625, "logps/chosen": -749.5, "logps/rejected": -835.0, "loss": 0.5487, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.265625, "rewards/margins": 0.9267578125, "rewards/rejected": -5.1953125, "step": 542 }, { "epoch": 0.69504, "grad_norm": 9.991255716215326, "learning_rate": 1.8045364343306914e-07, "logits/chosen": -0.1717987060546875, "logits/rejected": -0.21246337890625, "logps/chosen": -731.5, "logps/rejected": -863.5, "loss": 0.4698, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.220703125, "rewards/margins": 1.2138671875, "rewards/rejected": -5.4375, "step": 543 }, { "epoch": 0.69632, "grad_norm": 9.256546508375417, "learning_rate": 1.7908507077653123e-07, "logits/chosen": -0.220703125, "logits/rejected": -0.2713623046875, "logps/chosen": -718.5, "logps/rejected": -876.0, "loss": 0.412, "rewards/accuracies": 0.796875, "rewards/chosen": -4.1015625, "rewards/margins": 1.4931640625, "rewards/rejected": -5.59375, "step": 544 }, { "epoch": 0.6976, "grad_norm": 10.085774446505868, "learning_rate": 1.7771992110341532e-07, "logits/chosen": -0.181396484375, "logits/rejected": -0.2060546875, "logps/chosen": -746.0, "logps/rejected": -893.0, "loss": 0.4511, "rewards/accuracies": 0.84375, "rewards/chosen": -3.97265625, "rewards/margins": 1.49462890625, "rewards/rejected": -5.46875, "step": 545 }, { "epoch": 0.69888, "grad_norm": 10.969992056433007, "learning_rate": 1.7635822175413445e-07, "logits/chosen": -0.284912109375, "logits/rejected": -0.3399658203125, "logps/chosen": -665.0, "logps/rejected": -803.0, "loss": 0.4535, "rewards/accuracies": 0.796875, "rewards/chosen": -3.654296875, "rewards/margins": 1.290283203125, "rewards/rejected": -4.94140625, "step": 546 }, { "epoch": 0.70016, "grad_norm": 18.256669397800174, "learning_rate": 1.7500000000000007e-07, "logits/chosen": -0.3035888671875, "logits/rejected": -0.346923828125, "logps/chosen": -667.0, "logps/rejected": -792.0, "loss": 0.3832, "rewards/accuracies": 0.859375, "rewards/chosen": -3.58984375, "rewards/margins": 1.271484375, "rewards/rejected": -4.86328125, "step": 547 }, { "epoch": 0.70144, "grad_norm": 12.455904821192838, "learning_rate": 1.7364528304267644e-07, "logits/chosen": -0.2991943359375, "logits/rejected": -0.333740234375, "logps/chosen": -641.5, "logps/rejected": -753.0, "loss": 0.437, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.29296875, "rewards/margins": 1.140625, "rewards/rejected": -4.435546875, "step": 548 }, { "epoch": 0.70272, "grad_norm": 9.000099856418279, "learning_rate": 1.7229409801363634e-07, "logits/chosen": -0.29541015625, "logits/rejected": -0.36572265625, "logps/chosen": -667.5, "logps/rejected": -761.5, "loss": 0.4534, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.552734375, "rewards/margins": 1.12939453125, "rewards/rejected": -4.68359375, "step": 549 }, { "epoch": 0.704, "grad_norm": 9.631183638602456, "learning_rate": 1.7094647197361656e-07, "logits/chosen": -0.322052001953125, "logits/rejected": -0.3463134765625, "logps/chosen": -696.5, "logps/rejected": -818.0, "loss": 0.4881, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.470703125, "rewards/margins": 1.14697265625, "rewards/rejected": -4.615234375, "step": 550 }, { "epoch": 0.70528, "grad_norm": 10.011321742256536, "learning_rate": 1.6960243191207686e-07, "logits/chosen": -0.296875, "logits/rejected": -0.35693359375, "logps/chosen": -691.5, "logps/rejected": -786.0, "loss": 0.4621, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.609375, "rewards/margins": 1.216796875, "rewards/rejected": -4.8203125, "step": 551 }, { "epoch": 0.70656, "grad_norm": 13.798592755194116, "learning_rate": 1.682620047466589e-07, "logits/chosen": -0.35546875, "logits/rejected": -0.391357421875, "logps/chosen": -674.0, "logps/rejected": -765.5, "loss": 0.4362, "rewards/accuracies": 0.78125, "rewards/chosen": -3.546875, "rewards/margins": 1.2548828125, "rewards/rejected": -4.80078125, "step": 552 }, { "epoch": 0.70784, "grad_norm": 8.22483168916061, "learning_rate": 1.6692521732264789e-07, "logits/chosen": -0.32275390625, "logits/rejected": -0.35107421875, "logps/chosen": -708.0, "logps/rejected": -820.0, "loss": 0.4363, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.662109375, "rewards/margins": 1.157958984375, "rewards/rejected": -4.82421875, "step": 553 }, { "epoch": 0.70912, "grad_norm": 11.12423189134255, "learning_rate": 1.655920964124339e-07, "logits/chosen": -0.30712890625, "logits/rejected": -0.342376708984375, "logps/chosen": -666.5, "logps/rejected": -764.5, "loss": 0.4255, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.408203125, "rewards/margins": 1.1845703125, "rewards/rejected": -4.595703125, "step": 554 }, { "epoch": 0.7104, "grad_norm": 9.470624413639532, "learning_rate": 1.642626687149765e-07, "logits/chosen": -0.298095703125, "logits/rejected": -0.3577880859375, "logps/chosen": -672.0, "logps/rejected": -760.0, "loss": 0.4545, "rewards/accuracies": 0.765625, "rewards/chosen": -3.388671875, "rewards/margins": 1.16259765625, "rewards/rejected": -4.5546875, "step": 555 }, { "epoch": 0.71168, "grad_norm": 10.343750237066667, "learning_rate": 1.629369608552696e-07, "logits/chosen": -0.3017578125, "logits/rejected": -0.359619140625, "logps/chosen": -703.5, "logps/rejected": -814.5, "loss": 0.4353, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.642578125, "rewards/margins": 1.1728515625, "rewards/rejected": -4.81640625, "step": 556 }, { "epoch": 0.71296, "grad_norm": 10.683608221205487, "learning_rate": 1.6161499938380874e-07, "logits/chosen": -0.3658447265625, "logits/rejected": -0.42822265625, "logps/chosen": -701.5, "logps/rejected": -829.0, "loss": 0.4438, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.548828125, "rewards/margins": 1.4580078125, "rewards/rejected": -5.005859375, "step": 557 }, { "epoch": 0.71424, "grad_norm": 9.927387665144652, "learning_rate": 1.6029681077605865e-07, "logits/chosen": -0.347412109375, "logits/rejected": -0.39990234375, "logps/chosen": -686.5, "logps/rejected": -856.5, "loss": 0.4152, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.654296875, "rewards/margins": 1.3212890625, "rewards/rejected": -4.9765625, "step": 558 }, { "epoch": 0.71552, "grad_norm": 8.62349737887557, "learning_rate": 1.5898242143192336e-07, "logits/chosen": -0.307373046875, "logits/rejected": -0.3692626953125, "logps/chosen": -644.5, "logps/rejected": -791.5, "loss": 0.3838, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.294921875, "rewards/margins": 1.3662109375, "rewards/rejected": -4.66796875, "step": 559 }, { "epoch": 0.7168, "grad_norm": 20.08172331534478, "learning_rate": 1.576718576752179e-07, "logits/chosen": -0.25872802734375, "logits/rejected": -0.3118896484375, "logps/chosen": -621.0, "logps/rejected": -742.5, "loss": 0.4035, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.26171875, "rewards/margins": 1.31884765625, "rewards/rejected": -4.58203125, "step": 560 }, { "epoch": 0.71808, "grad_norm": 9.964788088459969, "learning_rate": 1.5636514575314023e-07, "logits/chosen": -0.3486328125, "logits/rejected": -0.4261474609375, "logps/chosen": -727.0, "logps/rejected": -832.0, "loss": 0.5178, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.638671875, "rewards/margins": 1.04345703125, "rewards/rejected": -4.6796875, "step": 561 }, { "epoch": 0.71936, "grad_norm": 8.246036203914949, "learning_rate": 1.550623118357463e-07, "logits/chosen": -0.261474609375, "logits/rejected": -0.327880859375, "logps/chosen": -699.0, "logps/rejected": -844.0, "loss": 0.4117, "rewards/accuracies": 0.8515625, "rewards/chosen": -3.875, "rewards/margins": 1.34619140625, "rewards/rejected": -5.22265625, "step": 562 }, { "epoch": 0.72064, "grad_norm": 9.24571549890142, "learning_rate": 1.5376338201542535e-07, "logits/chosen": -0.3028564453125, "logits/rejected": -0.3468017578125, "logps/chosen": -667.0, "logps/rejected": -782.0, "loss": 0.4665, "rewards/accuracies": 0.7265625, "rewards/chosen": -3.62109375, "rewards/margins": 1.04150390625, "rewards/rejected": -4.66015625, "step": 563 }, { "epoch": 0.72192, "grad_norm": 11.314423673740029, "learning_rate": 1.524683823063783e-07, "logits/chosen": -0.33697509765625, "logits/rejected": -0.35467529296875, "logps/chosen": -703.0, "logps/rejected": -758.5, "loss": 0.5141, "rewards/accuracies": 0.75, "rewards/chosen": -3.708984375, "rewards/margins": 1.0166015625, "rewards/rejected": -4.73046875, "step": 564 }, { "epoch": 0.7232, "grad_norm": 8.185307525881214, "learning_rate": 1.5117733864409549e-07, "logits/chosen": -0.247314453125, "logits/rejected": -0.314208984375, "logps/chosen": -650.5, "logps/rejected": -819.5, "loss": 0.3748, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.607421875, "rewards/margins": 1.447265625, "rewards/rejected": -5.05859375, "step": 565 }, { "epoch": 0.72448, "grad_norm": 18.20769264970902, "learning_rate": 1.4989027688483806e-07, "logits/chosen": -0.311767578125, "logits/rejected": -0.364501953125, "logps/chosen": -695.5, "logps/rejected": -808.0, "loss": 0.4326, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.751953125, "rewards/margins": 1.30908203125, "rewards/rejected": -5.06640625, "step": 566 }, { "epoch": 0.72576, "grad_norm": 8.80192874409478, "learning_rate": 1.4860722280512023e-07, "logits/chosen": -0.2942657470703125, "logits/rejected": -0.33642578125, "logps/chosen": -649.5, "logps/rejected": -767.0, "loss": 0.4278, "rewards/accuracies": 0.796875, "rewards/chosen": -3.607421875, "rewards/margins": 1.18017578125, "rewards/rejected": -4.78515625, "step": 567 }, { "epoch": 0.72704, "grad_norm": 10.943134384510522, "learning_rate": 1.4732820210119238e-07, "logits/chosen": -0.3031005859375, "logits/rejected": -0.3203125, "logps/chosen": -734.0, "logps/rejected": -838.0, "loss": 0.3886, "rewards/accuracies": 0.828125, "rewards/chosen": -3.98828125, "rewards/margins": 1.3251953125, "rewards/rejected": -5.3125, "step": 568 }, { "epoch": 0.72832, "grad_norm": 11.85355005830973, "learning_rate": 1.4605324038852707e-07, "logits/chosen": -0.20941162109375, "logits/rejected": -0.2779541015625, "logps/chosen": -668.0, "logps/rejected": -806.5, "loss": 0.4069, "rewards/accuracies": 0.859375, "rewards/chosen": -3.740234375, "rewards/margins": 1.44677734375, "rewards/rejected": -5.1875, "step": 569 }, { "epoch": 0.7296, "grad_norm": 8.286237978866287, "learning_rate": 1.4478236320130553e-07, "logits/chosen": -0.217041015625, "logits/rejected": -0.267822265625, "logps/chosen": -680.0, "logps/rejected": -779.0, "loss": 0.4393, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.6484375, "rewards/margins": 1.163330078125, "rewards/rejected": -4.8125, "step": 570 }, { "epoch": 0.73088, "grad_norm": 13.68132204214939, "learning_rate": 1.4351559599190707e-07, "logits/chosen": -0.17962646484375, "logits/rejected": -0.239990234375, "logps/chosen": -720.0, "logps/rejected": -853.0, "loss": 0.4864, "rewards/accuracies": 0.75, "rewards/chosen": -4.06640625, "rewards/margins": 1.2568359375, "rewards/rejected": -5.32421875, "step": 571 }, { "epoch": 0.73216, "grad_norm": 9.213667928480618, "learning_rate": 1.4225296413039794e-07, "logits/chosen": -0.20670700073242188, "logits/rejected": -0.26763916015625, "logps/chosen": -705.0, "logps/rejected": -866.0, "loss": 0.4484, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.021484375, "rewards/margins": 1.3505859375, "rewards/rejected": -5.37109375, "step": 572 }, { "epoch": 0.73344, "grad_norm": 9.155271393542527, "learning_rate": 1.409944929040249e-07, "logits/chosen": -0.188690185546875, "logits/rejected": -0.23065185546875, "logps/chosen": -703.5, "logps/rejected": -821.5, "loss": 0.4569, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.984375, "rewards/margins": 1.224609375, "rewards/rejected": -5.21484375, "step": 573 }, { "epoch": 0.73472, "grad_norm": 12.402920193014143, "learning_rate": 1.3974020751670732e-07, "logits/chosen": -0.237548828125, "logits/rejected": -0.302734375, "logps/chosen": -666.0, "logps/rejected": -824.5, "loss": 0.4361, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.7734375, "rewards/margins": 1.27490234375, "rewards/rejected": -5.048828125, "step": 574 }, { "epoch": 0.736, "grad_norm": 13.672795096747066, "learning_rate": 1.3849013308853368e-07, "logits/chosen": -0.1479034423828125, "logits/rejected": -0.21832275390625, "logps/chosen": -779.0, "logps/rejected": -881.5, "loss": 0.4502, "rewards/accuracies": 0.78125, "rewards/chosen": -4.23828125, "rewards/margins": 1.22607421875, "rewards/rejected": -5.4609375, "step": 575 }, { "epoch": 0.73728, "grad_norm": 9.971405880324383, "learning_rate": 1.3724429465525732e-07, "logits/chosen": -0.1641998291015625, "logits/rejected": -0.186248779296875, "logps/chosen": -713.0, "logps/rejected": -790.0, "loss": 0.499, "rewards/accuracies": 0.75, "rewards/chosen": -3.955078125, "rewards/margins": 1.1162109375, "rewards/rejected": -5.06640625, "step": 576 }, { "epoch": 0.73856, "grad_norm": 8.772600149464196, "learning_rate": 1.360027171677957e-07, "logits/chosen": -0.2245025634765625, "logits/rejected": -0.2720947265625, "logps/chosen": -705.0, "logps/rejected": -841.0, "loss": 0.4277, "rewards/accuracies": 0.8125, "rewards/chosen": -3.859375, "rewards/margins": 1.580078125, "rewards/rejected": -5.4375, "step": 577 }, { "epoch": 0.73984, "grad_norm": 18.81863742515878, "learning_rate": 1.3476542549173096e-07, "logits/chosen": -0.2587890625, "logits/rejected": -0.3282470703125, "logps/chosen": -762.5, "logps/rejected": -856.0, "loss": 0.5151, "rewards/accuracies": 0.7734375, "rewards/chosen": -4.23828125, "rewards/margins": 1.15771484375, "rewards/rejected": -5.39453125, "step": 578 }, { "epoch": 0.74112, "grad_norm": 9.936151878622386, "learning_rate": 1.335324444068108e-07, "logits/chosen": -0.2752685546875, "logits/rejected": -0.334228515625, "logps/chosen": -753.0, "logps/rejected": -913.0, "loss": 0.3748, "rewards/accuracies": 0.828125, "rewards/chosen": -4.279296875, "rewards/margins": 1.607421875, "rewards/rejected": -5.890625, "step": 579 }, { "epoch": 0.7424, "grad_norm": 14.86753917739169, "learning_rate": 1.3230379860645365e-07, "logits/chosen": -0.25177001953125, "logits/rejected": -0.300537109375, "logps/chosen": -718.0, "logps/rejected": -872.0, "loss": 0.3886, "rewards/accuracies": 0.8203125, "rewards/chosen": -4.125, "rewards/margins": 1.49072265625, "rewards/rejected": -5.625, "step": 580 }, { "epoch": 0.74368, "grad_norm": 14.0480837535775, "learning_rate": 1.3107951269725286e-07, "logits/chosen": -0.1736297607421875, "logits/rejected": -0.22271728515625, "logps/chosen": -788.5, "logps/rejected": -911.5, "loss": 0.4652, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.408203125, "rewards/margins": 1.3837890625, "rewards/rejected": -5.8046875, "step": 581 }, { "epoch": 0.74496, "grad_norm": 9.252489484832754, "learning_rate": 1.2985961119848506e-07, "logits/chosen": -0.15355682373046875, "logits/rejected": -0.20587158203125, "logps/chosen": -729.5, "logps/rejected": -853.0, "loss": 0.4643, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.099609375, "rewards/margins": 1.3515625, "rewards/rejected": -5.453125, "step": 582 }, { "epoch": 0.74624, "grad_norm": 18.79861359200076, "learning_rate": 1.28644118541618e-07, "logits/chosen": -0.179351806640625, "logits/rejected": -0.22210693359375, "logps/chosen": -792.0, "logps/rejected": -912.0, "loss": 0.5022, "rewards/accuracies": 0.8203125, "rewards/chosen": -4.345703125, "rewards/margins": 1.33837890625, "rewards/rejected": -5.68359375, "step": 583 }, { "epoch": 0.74752, "grad_norm": 9.556413731103051, "learning_rate": 1.2743305906982183e-07, "logits/chosen": -0.142608642578125, "logits/rejected": -0.1815185546875, "logps/chosen": -772.0, "logps/rejected": -916.0, "loss": 0.4378, "rewards/accuracies": 0.75, "rewards/chosen": -4.3984375, "rewards/margins": 1.37841796875, "rewards/rejected": -5.7734375, "step": 584 }, { "epoch": 0.7488, "grad_norm": 10.524837392255892, "learning_rate": 1.2622645703748163e-07, "logits/chosen": -0.0958251953125, "logits/rejected": -0.1723480224609375, "logps/chosen": -704.5, "logps/rejected": -864.5, "loss": 0.4003, "rewards/accuracies": 0.8203125, "rewards/chosen": -4.115234375, "rewards/margins": 1.41064453125, "rewards/rejected": -5.5234375, "step": 585 }, { "epoch": 0.75008, "grad_norm": 13.312445253403741, "learning_rate": 1.2502433660971123e-07, "logits/chosen": -0.19342041015625, "logits/rejected": -0.23333740234375, "logps/chosen": -759.0, "logps/rejected": -883.0, "loss": 0.3936, "rewards/accuracies": 0.8125, "rewards/chosen": -4.291015625, "rewards/margins": 1.40185546875, "rewards/rejected": -5.69921875, "step": 586 }, { "epoch": 0.75136, "grad_norm": 8.960818660553267, "learning_rate": 1.2382672186187003e-07, "logits/chosen": -0.2032470703125, "logits/rejected": -0.2041015625, "logps/chosen": -725.5, "logps/rejected": -849.0, "loss": 0.3993, "rewards/accuracies": 0.8125, "rewards/chosen": -3.953125, "rewards/margins": 1.3955078125, "rewards/rejected": -5.3515625, "step": 587 }, { "epoch": 0.75264, "grad_norm": 10.646689306325051, "learning_rate": 1.2263363677907974e-07, "logits/chosen": -0.123687744140625, "logits/rejected": -0.1717071533203125, "logps/chosen": -723.0, "logps/rejected": -856.5, "loss": 0.4223, "rewards/accuracies": 0.78125, "rewards/chosen": -4.0390625, "rewards/margins": 1.4228515625, "rewards/rejected": -5.4609375, "step": 588 }, { "epoch": 0.75392, "grad_norm": 12.393571858690176, "learning_rate": 1.214451052557453e-07, "logits/chosen": -0.177520751953125, "logits/rejected": -0.19673919677734375, "logps/chosen": -767.0, "logps/rejected": -846.5, "loss": 0.5177, "rewards/accuracies": 0.765625, "rewards/chosen": -4.318359375, "rewards/margins": 0.98583984375, "rewards/rejected": -5.30078125, "step": 589 }, { "epoch": 0.7552, "grad_norm": 22.161788073197705, "learning_rate": 1.202611510950747e-07, "logits/chosen": -0.08966064453125, "logits/rejected": -0.11224365234375, "logps/chosen": -767.5, "logps/rejected": -872.0, "loss": 0.5003, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.390625, "rewards/margins": 1.14306640625, "rewards/rejected": -5.53515625, "step": 590 }, { "epoch": 0.75648, "grad_norm": 12.260254703657399, "learning_rate": 1.1908179800860415e-07, "logits/chosen": -0.0744476318359375, "logits/rejected": -0.13702392578125, "logps/chosen": -723.0, "logps/rejected": -851.5, "loss": 0.4506, "rewards/accuracies": 0.7734375, "rewards/chosen": -4.1640625, "rewards/margins": 1.28857421875, "rewards/rejected": -5.4453125, "step": 591 }, { "epoch": 0.75776, "grad_norm": 9.937355029365806, "learning_rate": 1.1790706961572176e-07, "logits/chosen": -0.20186614990234375, "logits/rejected": -0.202545166015625, "logps/chosen": -788.0, "logps/rejected": -897.5, "loss": 0.4231, "rewards/accuracies": 0.8046875, "rewards/chosen": -4.359375, "rewards/margins": 1.36572265625, "rewards/rejected": -5.71875, "step": 592 }, { "epoch": 0.75904, "grad_norm": 10.470389968279672, "learning_rate": 1.1673698944319491e-07, "logits/chosen": -0.11907958984375, "logits/rejected": -0.17713165283203125, "logps/chosen": -760.5, "logps/rejected": -864.5, "loss": 0.4464, "rewards/accuracies": 0.78125, "rewards/chosen": -4.271484375, "rewards/margins": 1.3154296875, "rewards/rejected": -5.58984375, "step": 593 }, { "epoch": 0.76032, "grad_norm": 25.904113311082135, "learning_rate": 1.1557158092469967e-07, "logits/chosen": -0.13934326171875, "logits/rejected": -0.1776123046875, "logps/chosen": -762.0, "logps/rejected": -914.5, "loss": 0.5208, "rewards/accuracies": 0.78125, "rewards/chosen": -4.568359375, "rewards/margins": 1.5400390625, "rewards/rejected": -6.109375, "step": 594 }, { "epoch": 0.7616, "grad_norm": 14.910481470992487, "learning_rate": 1.1441086740035036e-07, "logits/chosen": -0.197296142578125, "logits/rejected": -0.231689453125, "logps/chosen": -748.5, "logps/rejected": -863.0, "loss": 0.3905, "rewards/accuracies": 0.828125, "rewards/chosen": -4.0625, "rewards/margins": 1.39453125, "rewards/rejected": -5.4609375, "step": 595 }, { "epoch": 0.76288, "grad_norm": 10.59387700561826, "learning_rate": 1.1325487211623342e-07, "logits/chosen": -0.15478515625, "logits/rejected": -0.188751220703125, "logps/chosen": -709.0, "logps/rejected": -851.0, "loss": 0.4572, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.25390625, "rewards/margins": 1.296875, "rewards/rejected": -5.546875, "step": 596 }, { "epoch": 0.76416, "grad_norm": 11.067694544255176, "learning_rate": 1.1210361822394029e-07, "logits/chosen": -0.2096710205078125, "logits/rejected": -0.2550048828125, "logps/chosen": -758.0, "logps/rejected": -878.5, "loss": 0.4847, "rewards/accuracies": 0.765625, "rewards/chosen": -4.25, "rewards/margins": 1.408203125, "rewards/rejected": -5.65234375, "step": 597 }, { "epoch": 0.76544, "grad_norm": 45.69276578416623, "learning_rate": 1.1095712878010541e-07, "logits/chosen": -0.0606689453125, "logits/rejected": -0.1527099609375, "logps/chosen": -777.0, "logps/rejected": -876.0, "loss": 0.5514, "rewards/accuracies": 0.7734375, "rewards/chosen": -4.46484375, "rewards/margins": 1.1143798828125, "rewards/rejected": -5.58203125, "step": 598 }, { "epoch": 0.76672, "grad_norm": 12.679098112978194, "learning_rate": 1.0981542674594328e-07, "logits/chosen": -0.1817779541015625, "logits/rejected": -0.23337554931640625, "logps/chosen": -700.0, "logps/rejected": -842.5, "loss": 0.4295, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.791015625, "rewards/margins": 1.537109375, "rewards/rejected": -5.33203125, "step": 599 }, { "epoch": 0.768, "grad_norm": 10.800303505547818, "learning_rate": 1.0867853498678901e-07, "logits/chosen": -0.273681640625, "logits/rejected": -0.293701171875, "logps/chosen": -705.0, "logps/rejected": -885.5, "loss": 0.4958, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.986328125, "rewards/margins": 1.387939453125, "rewards/rejected": -5.375, "step": 600 }, { "epoch": 0.768, "eval_logits/chosen": -0.19268035888671875, "eval_logits/rejected": -0.27252197265625, "eval_logps/chosen": -706.25, "eval_logps/rejected": -804.5, "eval_loss": 0.47617968916893005, "eval_rewards/accuracies": 0.740234375, "eval_rewards/chosen": -3.86328125, "eval_rewards/margins": 1.166748046875, "eval_rewards/rejected": -5.025390625, "eval_runtime": 27.9176, "eval_samples_per_second": 17.91, "eval_steps_per_second": 0.573, "step": 600 }, { "epoch": 0.76928, "grad_norm": 9.048271943790935, "learning_rate": 1.0754647627164022e-07, "logits/chosen": -0.184814453125, "logits/rejected": -0.2506103515625, "logps/chosen": -703.5, "logps/rejected": -881.5, "loss": 0.3442, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.921875, "rewards/margins": 1.6376953125, "rewards/rejected": -5.5625, "step": 601 }, { "epoch": 0.77056, "grad_norm": 26.0709495707422, "learning_rate": 1.064192732727016e-07, "logits/chosen": -0.19481658935546875, "logits/rejected": -0.232147216796875, "logps/chosen": -694.5, "logps/rejected": -756.0, "loss": 0.6118, "rewards/accuracies": 0.6953125, "rewards/chosen": -3.634765625, "rewards/margins": 0.9560546875, "rewards/rejected": -4.59375, "step": 602 }, { "epoch": 0.77184, "grad_norm": 9.88642766961704, "learning_rate": 1.0529694856493002e-07, "logits/chosen": -0.2410888671875, "logits/rejected": -0.287353515625, "logps/chosen": -764.0, "logps/rejected": -886.5, "loss": 0.4617, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.41015625, "rewards/margins": 1.09234619140625, "rewards/rejected": -5.50390625, "step": 603 }, { "epoch": 0.77312, "grad_norm": 9.433226751910178, "learning_rate": 1.0417952462558286e-07, "logits/chosen": -0.21282958984375, "logits/rejected": -0.2587127685546875, "logps/chosen": -719.0, "logps/rejected": -844.5, "loss": 0.4325, "rewards/accuracies": 0.828125, "rewards/chosen": -3.951171875, "rewards/margins": 1.25732421875, "rewards/rejected": -5.2109375, "step": 604 }, { "epoch": 0.7744, "grad_norm": 11.539221682126705, "learning_rate": 1.0306702383376813e-07, "logits/chosen": -0.20572662353515625, "logits/rejected": -0.2346649169921875, "logps/chosen": -729.0, "logps/rejected": -841.5, "loss": 0.4227, "rewards/accuracies": 0.84375, "rewards/chosen": -3.88671875, "rewards/margins": 1.31982421875, "rewards/rejected": -5.20703125, "step": 605 }, { "epoch": 0.77568, "grad_norm": 9.610274231941142, "learning_rate": 1.0195946846999551e-07, "logits/chosen": -0.274658203125, "logits/rejected": -0.3345947265625, "logps/chosen": -688.5, "logps/rejected": -803.5, "loss": 0.4396, "rewards/accuracies": 0.765625, "rewards/chosen": -3.70703125, "rewards/margins": 1.340576171875, "rewards/rejected": -5.04296875, "step": 606 }, { "epoch": 0.77696, "grad_norm": 9.567408589138397, "learning_rate": 1.0085688071573085e-07, "logits/chosen": -0.221038818359375, "logits/rejected": -0.2277069091796875, "logps/chosen": -713.0, "logps/rejected": -840.0, "loss": 0.4913, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.861328125, "rewards/margins": 1.158935546875, "rewards/rejected": -5.015625, "step": 607 }, { "epoch": 0.77824, "grad_norm": 12.50027732304155, "learning_rate": 9.975928265295139e-08, "logits/chosen": -0.26611328125, "logits/rejected": -0.309814453125, "logps/chosen": -691.5, "logps/rejected": -806.5, "loss": 0.3885, "rewards/accuracies": 0.796875, "rewards/chosen": -3.720703125, "rewards/margins": 1.33642578125, "rewards/rejected": -5.05859375, "step": 608 }, { "epoch": 0.77952, "grad_norm": 10.627712453917534, "learning_rate": 9.866669626370412e-08, "logits/chosen": -0.2711181640625, "logits/rejected": -0.32177734375, "logps/chosen": -739.5, "logps/rejected": -880.0, "loss": 0.4459, "rewards/accuracies": 0.8125, "rewards/chosen": -3.81640625, "rewards/margins": 1.3115234375, "rewards/rejected": -5.12109375, "step": 609 }, { "epoch": 0.7808, "grad_norm": 9.075340025404623, "learning_rate": 9.757914342966495e-08, "logits/chosen": -0.17635345458984375, "logits/rejected": -0.23785400390625, "logps/chosen": -698.5, "logps/rejected": -799.5, "loss": 0.4333, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.68359375, "rewards/margins": 1.27197265625, "rewards/rejected": -4.958984375, "step": 610 }, { "epoch": 0.78208, "grad_norm": 17.6370736058902, "learning_rate": 9.64966459317006e-08, "logits/chosen": -0.2579345703125, "logits/rejected": -0.31915283203125, "logps/chosen": -692.5, "logps/rejected": -871.0, "loss": 0.3659, "rewards/accuracies": 0.8515625, "rewards/chosen": -3.849609375, "rewards/margins": 1.578125, "rewards/rejected": -5.421875, "step": 611 }, { "epoch": 0.78336, "grad_norm": 10.37299929145861, "learning_rate": 9.541922544943294e-08, "logits/chosen": -0.2517547607421875, "logits/rejected": -0.315185546875, "logps/chosen": -712.5, "logps/rejected": -842.0, "loss": 0.4197, "rewards/accuracies": 0.8125, "rewards/chosen": -3.94140625, "rewards/margins": 1.3232421875, "rewards/rejected": -5.265625, "step": 612 }, { "epoch": 0.78464, "grad_norm": 8.76113086501841, "learning_rate": 9.434690356080393e-08, "logits/chosen": -0.3062744140625, "logits/rejected": -0.37060546875, "logps/chosen": -683.0, "logps/rejected": -821.0, "loss": 0.4658, "rewards/accuracies": 0.75, "rewards/chosen": -3.763671875, "rewards/margins": 1.23486328125, "rewards/rejected": -5.00390625, "step": 613 }, { "epoch": 0.78592, "grad_norm": 12.189626085161583, "learning_rate": 9.327970174164408e-08, "logits/chosen": -0.16046142578125, "logits/rejected": -0.1937255859375, "logps/chosen": -695.5, "logps/rejected": -782.0, "loss": 0.534, "rewards/accuracies": 0.75, "rewards/chosen": -3.953125, "rewards/margins": 0.9755859375, "rewards/rejected": -4.92578125, "step": 614 }, { "epoch": 0.7872, "grad_norm": 15.30898275082047, "learning_rate": 9.221764136524202e-08, "logits/chosen": -0.2682647705078125, "logits/rejected": -0.337371826171875, "logps/chosen": -689.5, "logps/rejected": -798.0, "loss": 0.417, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.775390625, "rewards/margins": 1.229736328125, "rewards/rejected": -5.0078125, "step": 615 }, { "epoch": 0.78848, "grad_norm": 9.836705918506746, "learning_rate": 9.116074370191705e-08, "logits/chosen": -0.2236480712890625, "logits/rejected": -0.28759765625, "logps/chosen": -670.0, "logps/rejected": -761.0, "loss": 0.4567, "rewards/accuracies": 0.796875, "rewards/chosen": -3.65234375, "rewards/margins": 1.11376953125, "rewards/rejected": -4.765625, "step": 616 }, { "epoch": 0.78976, "grad_norm": 9.891228631270444, "learning_rate": 9.010902991859196e-08, "logits/chosen": -0.19852447509765625, "logits/rejected": -0.21734619140625, "logps/chosen": -712.0, "logps/rejected": -813.5, "loss": 0.4786, "rewards/accuracies": 0.765625, "rewards/chosen": -3.890625, "rewards/margins": 1.10205078125, "rewards/rejected": -4.98828125, "step": 617 }, { "epoch": 0.79104, "grad_norm": 10.619085582647367, "learning_rate": 8.906252107837054e-08, "logits/chosen": -0.24609375, "logits/rejected": -0.298095703125, "logps/chosen": -682.5, "logps/rejected": -806.0, "loss": 0.4685, "rewards/accuracies": 0.8125, "rewards/chosen": -3.83203125, "rewards/margins": 1.31005859375, "rewards/rejected": -5.1328125, "step": 618 }, { "epoch": 0.79232, "grad_norm": 8.55320493819975, "learning_rate": 8.802123814011458e-08, "logits/chosen": -0.2899169921875, "logits/rejected": -0.3363037109375, "logps/chosen": -700.0, "logps/rejected": -820.5, "loss": 0.4403, "rewards/accuracies": 0.765625, "rewards/chosen": -3.98046875, "rewards/margins": 1.17822265625, "rewards/rejected": -5.15625, "step": 619 }, { "epoch": 0.7936, "grad_norm": 9.70388545104585, "learning_rate": 8.698520195802499e-08, "logits/chosen": -0.26741790771484375, "logits/rejected": -0.2891845703125, "logps/chosen": -744.5, "logps/rejected": -860.0, "loss": 0.4775, "rewards/accuracies": 0.8125, "rewards/chosen": -3.951171875, "rewards/margins": 1.2177734375, "rewards/rejected": -5.171875, "step": 620 }, { "epoch": 0.79488, "grad_norm": 12.896195486210319, "learning_rate": 8.595443328122345e-08, "logits/chosen": -0.253082275390625, "logits/rejected": -0.288360595703125, "logps/chosen": -699.0, "logps/rejected": -840.0, "loss": 0.4228, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.779296875, "rewards/margins": 1.33056640625, "rewards/rejected": -5.1171875, "step": 621 }, { "epoch": 0.79616, "grad_norm": 10.316721984106971, "learning_rate": 8.492895275333704e-08, "logits/chosen": -0.242431640625, "logits/rejected": -0.29632568359375, "logps/chosen": -647.0, "logps/rejected": -806.5, "loss": 0.4143, "rewards/accuracies": 0.796875, "rewards/chosen": -3.740234375, "rewards/margins": 1.31005859375, "rewards/rejected": -5.05078125, "step": 622 }, { "epoch": 0.79744, "grad_norm": 10.794420061393975, "learning_rate": 8.390878091208543e-08, "logits/chosen": -0.1590423583984375, "logits/rejected": -0.21337890625, "logps/chosen": -706.0, "logps/rejected": -777.0, "loss": 0.46, "rewards/accuracies": 0.796875, "rewards/chosen": -3.787109375, "rewards/margins": 1.115234375, "rewards/rejected": -4.90625, "step": 623 }, { "epoch": 0.79872, "grad_norm": 8.48224104747155, "learning_rate": 8.289393818886838e-08, "logits/chosen": -0.1787109375, "logits/rejected": -0.205230712890625, "logps/chosen": -709.5, "logps/rejected": -812.0, "loss": 0.4332, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.748046875, "rewards/margins": 1.294921875, "rewards/rejected": -5.0546875, "step": 624 }, { "epoch": 0.8, "grad_norm": 23.137925812568916, "learning_rate": 8.188444490835773e-08, "logits/chosen": -0.23029327392578125, "logits/rejected": -0.27685546875, "logps/chosen": -721.5, "logps/rejected": -848.0, "loss": 0.508, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.85546875, "rewards/margins": 1.2080078125, "rewards/rejected": -5.0546875, "step": 625 }, { "epoch": 0.80128, "grad_norm": 9.512426613145827, "learning_rate": 8.088032128808952e-08, "logits/chosen": -0.16583251953125, "logits/rejected": -0.2113037109375, "logps/chosen": -700.0, "logps/rejected": -822.5, "loss": 0.4431, "rewards/accuracies": 0.765625, "rewards/chosen": -3.73046875, "rewards/margins": 1.28515625, "rewards/rejected": -5.021484375, "step": 626 }, { "epoch": 0.80256, "grad_norm": 9.523485315600638, "learning_rate": 7.988158743805972e-08, "logits/chosen": -0.28082275390625, "logits/rejected": -0.3321533203125, "logps/chosen": -685.5, "logps/rejected": -781.0, "loss": 0.4848, "rewards/accuracies": 0.75, "rewards/chosen": -3.615234375, "rewards/margins": 1.0703125, "rewards/rejected": -4.6875, "step": 627 }, { "epoch": 0.80384, "grad_norm": 9.450418163489136, "learning_rate": 7.888826336032093e-08, "logits/chosen": -0.2333984375, "logits/rejected": -0.3123779296875, "logps/chosen": -714.5, "logps/rejected": -828.0, "loss": 0.4416, "rewards/accuracies": 0.828125, "rewards/chosen": -3.89453125, "rewards/margins": 1.34619140625, "rewards/rejected": -5.2421875, "step": 628 }, { "epoch": 0.80512, "grad_norm": 13.14269906470784, "learning_rate": 7.790036894858197e-08, "logits/chosen": -0.24493408203125, "logits/rejected": -0.3167724609375, "logps/chosen": -700.0, "logps/rejected": -795.0, "loss": 0.4833, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.8125, "rewards/margins": 1.1923828125, "rewards/rejected": -5.0, "step": 629 }, { "epoch": 0.8064, "grad_norm": 17.566771895062967, "learning_rate": 7.691792398780962e-08, "logits/chosen": -0.18544769287109375, "logits/rejected": -0.23531341552734375, "logps/chosen": -714.0, "logps/rejected": -850.0, "loss": 0.4277, "rewards/accuracies": 0.796875, "rewards/chosen": -3.693359375, "rewards/margins": 1.42333984375, "rewards/rejected": -5.1171875, "step": 630 }, { "epoch": 0.80768, "grad_norm": 10.9996007916988, "learning_rate": 7.594094815383224e-08, "logits/chosen": -0.1927490234375, "logits/rejected": -0.25665283203125, "logps/chosen": -712.0, "logps/rejected": -824.5, "loss": 0.4316, "rewards/accuracies": 0.828125, "rewards/chosen": -3.744140625, "rewards/margins": 1.3701171875, "rewards/rejected": -5.10546875, "step": 631 }, { "epoch": 0.80896, "grad_norm": 16.150017435469856, "learning_rate": 7.496946101294586e-08, "logits/chosen": -0.2930908203125, "logits/rejected": -0.3349609375, "logps/chosen": -733.5, "logps/rejected": -856.0, "loss": 0.423, "rewards/accuracies": 0.78125, "rewards/chosen": -3.951171875, "rewards/margins": 1.19189453125, "rewards/rejected": -5.140625, "step": 632 }, { "epoch": 0.81024, "grad_norm": 22.47987317628445, "learning_rate": 7.400348202152192e-08, "logits/chosen": -0.12548828125, "logits/rejected": -0.17365264892578125, "logps/chosen": -735.0, "logps/rejected": -837.5, "loss": 0.5789, "rewards/accuracies": 0.7421875, "rewards/chosen": -4.09765625, "rewards/margins": 0.86962890625, "rewards/rejected": -4.958984375, "step": 633 }, { "epoch": 0.81152, "grad_norm": 15.769251930029661, "learning_rate": 7.304303052561841e-08, "logits/chosen": -0.16961669921875, "logits/rejected": -0.24853515625, "logps/chosen": -687.0, "logps/rejected": -828.5, "loss": 0.3706, "rewards/accuracies": 0.8515625, "rewards/chosen": -3.740234375, "rewards/margins": 1.4990234375, "rewards/rejected": -5.23046875, "step": 634 }, { "epoch": 0.8128, "grad_norm": 14.6903117429968, "learning_rate": 7.208812576059112e-08, "logits/chosen": -0.35009765625, "logits/rejected": -0.3636474609375, "logps/chosen": -746.5, "logps/rejected": -820.5, "loss": 0.5197, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.94140625, "rewards/margins": 1.1171875, "rewards/rejected": -5.06640625, "step": 635 }, { "epoch": 0.81408, "grad_norm": 13.239985107487296, "learning_rate": 7.113878685070993e-08, "logits/chosen": -0.2686767578125, "logits/rejected": -0.2921142578125, "logps/chosen": -738.5, "logps/rejected": -828.5, "loss": 0.5338, "rewards/accuracies": 0.765625, "rewards/chosen": -4.21875, "rewards/margins": 0.9364013671875, "rewards/rejected": -5.1484375, "step": 636 }, { "epoch": 0.81536, "grad_norm": 18.26280480241238, "learning_rate": 7.019503280877466e-08, "logits/chosen": -0.207672119140625, "logits/rejected": -0.2611083984375, "logps/chosen": -721.0, "logps/rejected": -863.5, "loss": 0.3718, "rewards/accuracies": 0.890625, "rewards/chosen": -3.927734375, "rewards/margins": 1.470703125, "rewards/rejected": -5.40234375, "step": 637 }, { "epoch": 0.81664, "grad_norm": 11.402715846459442, "learning_rate": 6.925688253573465e-08, "logits/chosen": -0.165740966796875, "logits/rejected": -0.204833984375, "logps/chosen": -711.0, "logps/rejected": -823.0, "loss": 0.4429, "rewards/accuracies": 0.8125, "rewards/chosen": -3.888671875, "rewards/margins": 1.33349609375, "rewards/rejected": -5.22265625, "step": 638 }, { "epoch": 0.81792, "grad_norm": 9.972091848028276, "learning_rate": 6.832435482031064e-08, "logits/chosen": -0.244384765625, "logits/rejected": -0.24786376953125, "logps/chosen": -713.5, "logps/rejected": -818.0, "loss": 0.445, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.828125, "rewards/margins": 1.15966796875, "rewards/rejected": -4.986328125, "step": 639 }, { "epoch": 0.8192, "grad_norm": 15.756161036698968, "learning_rate": 6.739746833861759e-08, "logits/chosen": -0.227813720703125, "logits/rejected": -0.30035400390625, "logps/chosen": -700.5, "logps/rejected": -846.5, "loss": 0.4093, "rewards/accuracies": 0.828125, "rewards/chosen": -3.9375, "rewards/margins": 1.30615234375, "rewards/rejected": -5.25390625, "step": 640 }, { "epoch": 0.82048, "grad_norm": 12.006378355004856, "learning_rate": 6.647624165379173e-08, "logits/chosen": -0.203277587890625, "logits/rejected": -0.27020263671875, "logps/chosen": -650.0, "logps/rejected": -774.0, "loss": 0.4311, "rewards/accuracies": 0.8125, "rewards/chosen": -3.59765625, "rewards/margins": 1.26904296875, "rewards/rejected": -4.8671875, "step": 641 }, { "epoch": 0.82176, "grad_norm": 8.47930046439374, "learning_rate": 6.55606932156175e-08, "logits/chosen": -0.185546875, "logits/rejected": -0.24249267578125, "logps/chosen": -706.5, "logps/rejected": -811.5, "loss": 0.404, "rewards/accuracies": 0.8125, "rewards/chosen": -3.810546875, "rewards/margins": 1.3720703125, "rewards/rejected": -5.1796875, "step": 642 }, { "epoch": 0.82304, "grad_norm": 8.893390613363472, "learning_rate": 6.46508413601595e-08, "logits/chosen": -0.22678756713867188, "logits/rejected": -0.26580810546875, "logps/chosen": -698.0, "logps/rejected": -811.5, "loss": 0.4419, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.79296875, "rewards/margins": 1.20361328125, "rewards/rejected": -4.99609375, "step": 643 }, { "epoch": 0.82432, "grad_norm": 9.63687460143777, "learning_rate": 6.374670430939404e-08, "logits/chosen": -0.19427490234375, "logits/rejected": -0.2384033203125, "logps/chosen": -729.5, "logps/rejected": -863.0, "loss": 0.4036, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.90234375, "rewards/margins": 1.35791015625, "rewards/rejected": -5.2578125, "step": 644 }, { "epoch": 0.8256, "grad_norm": 16.684775851774532, "learning_rate": 6.284830017084488e-08, "logits/chosen": -0.223876953125, "logits/rejected": -0.291748046875, "logps/chosen": -698.5, "logps/rejected": -857.0, "loss": 0.4179, "rewards/accuracies": 0.8359375, "rewards/chosen": -4.015625, "rewards/margins": 1.4716796875, "rewards/rejected": -5.4921875, "step": 645 }, { "epoch": 0.82688, "grad_norm": 20.24004777671895, "learning_rate": 6.195564693722028e-08, "logits/chosen": -0.24462890625, "logits/rejected": -0.2711181640625, "logps/chosen": -721.0, "logps/rejected": -830.5, "loss": 0.5167, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.87109375, "rewards/margins": 1.27490234375, "rewards/rejected": -5.140625, "step": 646 }, { "epoch": 0.82816, "grad_norm": 9.754642974447716, "learning_rate": 6.1068762486053e-08, "logits/chosen": -0.10089111328125, "logits/rejected": -0.14161300659179688, "logps/chosen": -691.5, "logps/rejected": -848.5, "loss": 0.449, "rewards/accuracies": 0.796875, "rewards/chosen": -4.041015625, "rewards/margins": 1.12451171875, "rewards/rejected": -5.17578125, "step": 647 }, { "epoch": 0.82944, "grad_norm": 8.380496897716654, "learning_rate": 6.018766457934177e-08, "logits/chosen": -0.157867431640625, "logits/rejected": -0.177886962890625, "logps/chosen": -722.0, "logps/rejected": -858.5, "loss": 0.4101, "rewards/accuracies": 0.8671875, "rewards/chosen": -3.986328125, "rewards/margins": 1.50341796875, "rewards/rejected": -5.49609375, "step": 648 }, { "epoch": 0.83072, "grad_norm": 30.217667929527913, "learning_rate": 5.931237086319592e-08, "logits/chosen": -0.240966796875, "logits/rejected": -0.3031005859375, "logps/chosen": -713.0, "logps/rejected": -784.0, "loss": 0.6239, "rewards/accuracies": 0.7265625, "rewards/chosen": -4.0, "rewards/margins": 0.900146484375, "rewards/rejected": -4.8984375, "step": 649 }, { "epoch": 0.832, "grad_norm": 13.251227443569483, "learning_rate": 5.844289886748196e-08, "logits/chosen": -0.1904144287109375, "logits/rejected": -0.260955810546875, "logps/chosen": -729.0, "logps/rejected": -828.0, "loss": 0.4859, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.0078125, "rewards/margins": 1.28173828125, "rewards/rejected": -5.29296875, "step": 650 }, { "epoch": 0.83328, "grad_norm": 10.941982402370376, "learning_rate": 5.7579266005472304e-08, "logits/chosen": -0.22198486328125, "logits/rejected": -0.2850189208984375, "logps/chosen": -731.0, "logps/rejected": -805.5, "loss": 0.4837, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.943359375, "rewards/margins": 1.12548828125, "rewards/rejected": -5.072265625, "step": 651 }, { "epoch": 0.83456, "grad_norm": 11.451372721475789, "learning_rate": 5.672148957349661e-08, "logits/chosen": -0.2811279296875, "logits/rejected": -0.3087158203125, "logps/chosen": -737.5, "logps/rejected": -837.5, "loss": 0.5053, "rewards/accuracies": 0.765625, "rewards/chosen": -4.259765625, "rewards/margins": 1.033203125, "rewards/rejected": -5.29296875, "step": 652 }, { "epoch": 0.83584, "grad_norm": 13.98757298272134, "learning_rate": 5.586958675059548e-08, "logits/chosen": -0.2623291015625, "logits/rejected": -0.28857421875, "logps/chosen": -730.0, "logps/rejected": -806.0, "loss": 0.5169, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.154296875, "rewards/margins": 0.978271484375, "rewards/rejected": -5.13671875, "step": 653 }, { "epoch": 0.83712, "grad_norm": 12.725608737110653, "learning_rate": 5.502357459817639e-08, "logits/chosen": -0.3026123046875, "logits/rejected": -0.3392333984375, "logps/chosen": -785.5, "logps/rejected": -902.5, "loss": 0.4683, "rewards/accuracies": 0.78125, "rewards/chosen": -4.27734375, "rewards/margins": 1.37109375, "rewards/rejected": -5.6484375, "step": 654 }, { "epoch": 0.8384, "grad_norm": 18.59261713230035, "learning_rate": 5.418347005967189e-08, "logits/chosen": -0.22393798828125, "logits/rejected": -0.2733154296875, "logps/chosen": -670.5, "logps/rejected": -854.0, "loss": 0.3584, "rewards/accuracies": 0.875, "rewards/chosen": -3.83203125, "rewards/margins": 1.6171875, "rewards/rejected": -5.453125, "step": 655 }, { "epoch": 0.83968, "grad_norm": 15.681430188596131, "learning_rate": 5.334928996020012e-08, "logits/chosen": -0.2601318359375, "logits/rejected": -0.2978515625, "logps/chosen": -706.0, "logps/rejected": -789.0, "loss": 0.4116, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.630859375, "rewards/margins": 1.21728515625, "rewards/rejected": -4.84375, "step": 656 }, { "epoch": 0.84096, "grad_norm": 14.42329884280154, "learning_rate": 5.2521051006228475e-08, "logits/chosen": -0.16900634765625, "logits/rejected": -0.22943115234375, "logps/chosen": -719.0, "logps/rejected": -837.5, "loss": 0.4993, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.09765625, "rewards/margins": 1.08544921875, "rewards/rejected": -5.18359375, "step": 657 }, { "epoch": 0.84224, "grad_norm": 8.370169088229156, "learning_rate": 5.169876978523828e-08, "logits/chosen": -0.2957763671875, "logits/rejected": -0.3585205078125, "logps/chosen": -735.0, "logps/rejected": -874.0, "loss": 0.4017, "rewards/accuracies": 0.828125, "rewards/chosen": -4.001953125, "rewards/margins": 1.5009765625, "rewards/rejected": -5.51171875, "step": 658 }, { "epoch": 0.84352, "grad_norm": 9.292854951761994, "learning_rate": 5.088246276539292e-08, "logits/chosen": -0.2945556640625, "logits/rejected": -0.3333740234375, "logps/chosen": -772.5, "logps/rejected": -869.5, "loss": 0.4474, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.111328125, "rewards/margins": 1.32177734375, "rewards/rejected": -5.4296875, "step": 659 }, { "epoch": 0.8448, "grad_norm": 12.841846778247005, "learning_rate": 5.0072146295208e-08, "logits/chosen": -0.3035888671875, "logits/rejected": -0.3436279296875, "logps/chosen": -690.0, "logps/rejected": -835.0, "loss": 0.3979, "rewards/accuracies": 0.828125, "rewards/chosen": -3.791015625, "rewards/margins": 1.52734375, "rewards/rejected": -5.31640625, "step": 660 }, { "epoch": 0.84608, "grad_norm": 12.621963464933918, "learning_rate": 4.926783660322411e-08, "logits/chosen": -0.275390625, "logits/rejected": -0.326904296875, "logps/chosen": -724.0, "logps/rejected": -809.0, "loss": 0.4929, "rewards/accuracies": 0.78125, "rewards/chosen": -3.876953125, "rewards/margins": 1.166015625, "rewards/rejected": -5.046875, "step": 661 }, { "epoch": 0.84736, "grad_norm": 9.6461916615264, "learning_rate": 4.846954979768149e-08, "logits/chosen": -0.308380126953125, "logits/rejected": -0.3372802734375, "logps/chosen": -733.0, "logps/rejected": -825.0, "loss": 0.4582, "rewards/accuracies": 0.78125, "rewards/chosen": -3.904296875, "rewards/margins": 0.998046875, "rewards/rejected": -4.90625, "step": 662 }, { "epoch": 0.84864, "grad_norm": 12.039674839827219, "learning_rate": 4.7677301866197455e-08, "logits/chosen": -0.2691650390625, "logits/rejected": -0.311279296875, "logps/chosen": -731.5, "logps/rejected": -866.0, "loss": 0.4554, "rewards/accuracies": 0.7734375, "rewards/chosen": -4.056640625, "rewards/margins": 1.3349609375, "rewards/rejected": -5.38671875, "step": 663 }, { "epoch": 0.84992, "grad_norm": 8.987446887684213, "learning_rate": 4.689110867544645e-08, "logits/chosen": -0.169097900390625, "logits/rejected": -0.20819091796875, "logps/chosen": -667.5, "logps/rejected": -767.0, "loss": 0.4374, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.666015625, "rewards/margins": 1.1484375, "rewards/rejected": -4.8046875, "step": 664 }, { "epoch": 0.8512, "grad_norm": 14.644380809572876, "learning_rate": 4.611098597084226e-08, "logits/chosen": -0.1959228515625, "logits/rejected": -0.249267578125, "logps/chosen": -718.0, "logps/rejected": -857.5, "loss": 0.5053, "rewards/accuracies": 0.7265625, "rewards/chosen": -4.177734375, "rewards/margins": 1.05908203125, "rewards/rejected": -5.23046875, "step": 665 }, { "epoch": 0.85248, "grad_norm": 8.230847446292731, "learning_rate": 4.5336949376222274e-08, "logits/chosen": -0.2403564453125, "logits/rejected": -0.26849365234375, "logps/chosen": -734.5, "logps/rejected": -859.0, "loss": 0.3859, "rewards/accuracies": 0.8515625, "rewards/chosen": -3.775390625, "rewards/margins": 1.490234375, "rewards/rejected": -5.2734375, "step": 666 }, { "epoch": 0.85376, "grad_norm": 9.397546683815692, "learning_rate": 4.4569014393534986e-08, "logits/chosen": -0.22314453125, "logits/rejected": -0.26226806640625, "logps/chosen": -684.5, "logps/rejected": -783.0, "loss": 0.4507, "rewards/accuracies": 0.765625, "rewards/chosen": -3.8203125, "rewards/margins": 1.06103515625, "rewards/rejected": -4.87890625, "step": 667 }, { "epoch": 0.85504, "grad_norm": 9.127958781532016, "learning_rate": 4.380719640252953e-08, "logits/chosen": -0.2354736328125, "logits/rejected": -0.2667236328125, "logps/chosen": -749.5, "logps/rejected": -839.5, "loss": 0.456, "rewards/accuracies": 0.78125, "rewards/chosen": -4.07421875, "rewards/margins": 1.228515625, "rewards/rejected": -5.30078125, "step": 668 }, { "epoch": 0.85632, "grad_norm": 9.421323267573934, "learning_rate": 4.3051510660447335e-08, "logits/chosen": -0.295379638671875, "logits/rejected": -0.35400390625, "logps/chosen": -676.0, "logps/rejected": -819.0, "loss": 0.4437, "rewards/accuracies": 0.8125, "rewards/chosen": -3.71875, "rewards/margins": 1.28564453125, "rewards/rejected": -5.0078125, "step": 669 }, { "epoch": 0.8576, "grad_norm": 10.534790363492052, "learning_rate": 4.230197230171693e-08, "logits/chosen": -0.3251953125, "logits/rejected": -0.375244140625, "logps/chosen": -760.5, "logps/rejected": -831.0, "loss": 0.4472, "rewards/accuracies": 0.8125, "rewards/chosen": -3.83984375, "rewards/margins": 1.1640625, "rewards/rejected": -4.998046875, "step": 670 }, { "epoch": 0.85888, "grad_norm": 8.740923113820415, "learning_rate": 4.155859633765044e-08, "logits/chosen": -0.27716064453125, "logits/rejected": -0.295166015625, "logps/chosen": -698.0, "logps/rejected": -825.0, "loss": 0.459, "rewards/accuracies": 0.78125, "rewards/chosen": -3.890625, "rewards/margins": 1.258544921875, "rewards/rejected": -5.15234375, "step": 671 }, { "epoch": 0.86016, "grad_norm": 11.346727433404798, "learning_rate": 4.08213976561435e-08, "logits/chosen": -0.2637290954589844, "logits/rejected": -0.3646240234375, "logps/chosen": -735.5, "logps/rejected": -866.5, "loss": 0.4928, "rewards/accuracies": 0.796875, "rewards/chosen": -4.146484375, "rewards/margins": 1.279296875, "rewards/rejected": -5.42578125, "step": 672 }, { "epoch": 0.86144, "grad_norm": 10.398636636838543, "learning_rate": 4.009039102137657e-08, "logits/chosen": -0.2867431640625, "logits/rejected": -0.31884765625, "logps/chosen": -739.5, "logps/rejected": -823.0, "loss": 0.52, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.947265625, "rewards/margins": 1.0927734375, "rewards/rejected": -5.03515625, "step": 673 }, { "epoch": 0.86272, "grad_norm": 13.027404663461114, "learning_rate": 3.936559107351939e-08, "logits/chosen": -0.25640869140625, "logits/rejected": -0.2908935546875, "logps/chosen": -682.0, "logps/rejected": -813.5, "loss": 0.3722, "rewards/accuracies": 0.828125, "rewards/chosen": -3.75, "rewards/margins": 1.4267578125, "rewards/rejected": -5.17578125, "step": 674 }, { "epoch": 0.864, "grad_norm": 9.010470752300636, "learning_rate": 3.864701232843808e-08, "logits/chosen": -0.335205078125, "logits/rejected": -0.3759765625, "logps/chosen": -703.5, "logps/rejected": -835.0, "loss": 0.4373, "rewards/accuracies": 0.78125, "rewards/chosen": -3.712890625, "rewards/margins": 1.20849609375, "rewards/rejected": -4.921875, "step": 675 }, { "epoch": 0.86528, "grad_norm": 9.78070477469183, "learning_rate": 3.7934669177404015e-08, "logits/chosen": -0.2587890625, "logits/rejected": -0.32421875, "logps/chosen": -728.5, "logps/rejected": -826.0, "loss": 0.4832, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.0625, "rewards/margins": 1.38916015625, "rewards/rejected": -5.453125, "step": 676 }, { "epoch": 0.86656, "grad_norm": 12.145508203259396, "learning_rate": 3.722857588680574e-08, "logits/chosen": -0.2547607421875, "logits/rejected": -0.296630859375, "logps/chosen": -678.0, "logps/rejected": -843.5, "loss": 0.3965, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.734375, "rewards/margins": 1.4375, "rewards/rejected": -5.171875, "step": 677 }, { "epoch": 0.86784, "grad_norm": 14.052967853348232, "learning_rate": 3.652874659786328e-08, "logits/chosen": -0.268218994140625, "logits/rejected": -0.321044921875, "logps/chosen": -726.0, "logps/rejected": -888.5, "loss": 0.366, "rewards/accuracies": 0.875, "rewards/chosen": -3.98046875, "rewards/margins": 1.5517578125, "rewards/rejected": -5.53125, "step": 678 }, { "epoch": 0.86912, "grad_norm": 8.818365407249955, "learning_rate": 3.583519532634516e-08, "logits/chosen": -0.2730865478515625, "logits/rejected": -0.3010711669921875, "logps/chosen": -701.0, "logps/rejected": -813.5, "loss": 0.4284, "rewards/accuracies": 0.828125, "rewards/chosen": -3.82421875, "rewards/margins": 1.16748046875, "rewards/rejected": -4.98828125, "step": 679 }, { "epoch": 0.8704, "grad_norm": 12.067218675895933, "learning_rate": 3.514793596228702e-08, "logits/chosen": -0.2110137939453125, "logits/rejected": -0.283447265625, "logps/chosen": -705.0, "logps/rejected": -851.0, "loss": 0.3709, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.740234375, "rewards/margins": 1.58203125, "rewards/rejected": -5.32421875, "step": 680 }, { "epoch": 0.87168, "grad_norm": 15.163206625422314, "learning_rate": 3.4466982269714396e-08, "logits/chosen": -0.2342529296875, "logits/rejected": -0.271240234375, "logps/chosen": -675.0, "logps/rejected": -841.5, "loss": 0.4759, "rewards/accuracies": 0.75, "rewards/chosen": -3.54296875, "rewards/margins": 1.16650390625, "rewards/rejected": -4.70703125, "step": 681 }, { "epoch": 0.87296, "grad_norm": 13.083926981225705, "learning_rate": 3.379234788636626e-08, "logits/chosen": -0.24346923828125, "logits/rejected": -0.260040283203125, "logps/chosen": -682.0, "logps/rejected": -784.0, "loss": 0.499, "rewards/accuracies": 0.796875, "rewards/chosen": -3.755859375, "rewards/margins": 1.13623046875, "rewards/rejected": -4.892578125, "step": 682 }, { "epoch": 0.87424, "grad_norm": 9.209267131224845, "learning_rate": 3.31240463234221e-08, "logits/chosen": -0.2317047119140625, "logits/rejected": -0.2623291015625, "logps/chosen": -707.5, "logps/rejected": -845.5, "loss": 0.3632, "rewards/accuracies": 0.8125, "rewards/chosen": -3.77734375, "rewards/margins": 1.587890625, "rewards/rejected": -5.3671875, "step": 683 }, { "epoch": 0.87552, "grad_norm": 10.57378130101519, "learning_rate": 3.246209096523176e-08, "logits/chosen": -0.233551025390625, "logits/rejected": -0.2782135009765625, "logps/chosen": -741.0, "logps/rejected": -830.0, "loss": 0.4497, "rewards/accuracies": 0.796875, "rewards/chosen": -4.083984375, "rewards/margins": 1.17626953125, "rewards/rejected": -5.26171875, "step": 684 }, { "epoch": 0.8768, "grad_norm": 12.296255284365948, "learning_rate": 3.180649506904667e-08, "logits/chosen": -0.2994384765625, "logits/rejected": -0.3275146484375, "logps/chosen": -699.5, "logps/rejected": -800.0, "loss": 0.4883, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.74609375, "rewards/margins": 1.09814453125, "rewards/rejected": -4.84765625, "step": 685 }, { "epoch": 0.87808, "grad_norm": 12.690499499739776, "learning_rate": 3.115727176475508e-08, "logits/chosen": -0.2972412109375, "logits/rejected": -0.36767578125, "logps/chosen": -686.0, "logps/rejected": -812.0, "loss": 0.427, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.5546875, "rewards/margins": 1.294921875, "rewards/rejected": -4.8515625, "step": 686 }, { "epoch": 0.87936, "grad_norm": 9.788929929625333, "learning_rate": 3.051443405461822e-08, "logits/chosen": -0.2645263671875, "logits/rejected": -0.296142578125, "logps/chosen": -722.0, "logps/rejected": -802.0, "loss": 0.4763, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.84765625, "rewards/margins": 1.03076171875, "rewards/rejected": -4.875, "step": 687 }, { "epoch": 0.88064, "grad_norm": 11.417012062202968, "learning_rate": 2.987799481301091e-08, "logits/chosen": -0.282470703125, "logits/rejected": -0.306640625, "logps/chosen": -679.5, "logps/rejected": -821.0, "loss": 0.4559, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.6875, "rewards/margins": 1.14501953125, "rewards/rejected": -4.83984375, "step": 688 }, { "epoch": 0.88192, "grad_norm": 11.033813131632254, "learning_rate": 2.924796678616297e-08, "logits/chosen": -0.2666015625, "logits/rejected": -0.3145751953125, "logps/chosen": -709.5, "logps/rejected": -838.0, "loss": 0.4804, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.859375, "rewards/margins": 1.21484375, "rewards/rejected": -5.07421875, "step": 689 }, { "epoch": 0.8832, "grad_norm": 9.639277971365505, "learning_rate": 2.862436259190414e-08, "logits/chosen": -0.24395751953125, "logits/rejected": -0.2987060546875, "logps/chosen": -702.0, "logps/rejected": -878.5, "loss": 0.471, "rewards/accuracies": 0.765625, "rewards/chosen": -4.1015625, "rewards/margins": 1.365234375, "rewards/rejected": -5.46484375, "step": 690 }, { "epoch": 0.88448, "grad_norm": 8.604549708205145, "learning_rate": 2.800719471941152e-08, "logits/chosen": -0.26934814453125, "logits/rejected": -0.3045654296875, "logps/chosen": -693.5, "logps/rejected": -802.5, "loss": 0.4883, "rewards/accuracies": 0.796875, "rewards/chosen": -3.83203125, "rewards/margins": 1.09423828125, "rewards/rejected": -4.9296875, "step": 691 }, { "epoch": 0.88576, "grad_norm": 8.224767246318196, "learning_rate": 2.739647552895949e-08, "logits/chosen": -0.24432373046875, "logits/rejected": -0.268463134765625, "logps/chosen": -714.5, "logps/rejected": -807.0, "loss": 0.413, "rewards/accuracies": 0.8515625, "rewards/chosen": -3.919921875, "rewards/margins": 1.251953125, "rewards/rejected": -5.162109375, "step": 692 }, { "epoch": 0.88704, "grad_norm": 10.030626998099564, "learning_rate": 2.6792217251671745e-08, "logits/chosen": -0.239654541015625, "logits/rejected": -0.248565673828125, "logps/chosen": -754.5, "logps/rejected": -814.5, "loss": 0.435, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.072265625, "rewards/margins": 1.16650390625, "rewards/rejected": -5.23828125, "step": 693 }, { "epoch": 0.88832, "grad_norm": 11.888659481208316, "learning_rate": 2.6194431989276773e-08, "logits/chosen": -0.2733154296875, "logits/rejected": -0.331298828125, "logps/chosen": -675.5, "logps/rejected": -841.5, "loss": 0.47, "rewards/accuracies": 0.8203125, "rewards/chosen": -3.80078125, "rewards/margins": 1.370849609375, "rewards/rejected": -5.171875, "step": 694 }, { "epoch": 0.8896, "grad_norm": 9.80399886350075, "learning_rate": 2.5603131713865372e-08, "logits/chosen": -0.26300048828125, "logits/rejected": -0.3204345703125, "logps/chosen": -745.0, "logps/rejected": -841.0, "loss": 0.4927, "rewards/accuracies": 0.75, "rewards/chosen": -4.22265625, "rewards/margins": 1.11181640625, "rewards/rejected": -5.33984375, "step": 695 }, { "epoch": 0.89088, "grad_norm": 11.910638478728464, "learning_rate": 2.5018328267650796e-08, "logits/chosen": -0.264404296875, "logits/rejected": -0.276702880859375, "logps/chosen": -774.5, "logps/rejected": -894.0, "loss": 0.4589, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.185546875, "rewards/margins": 1.25048828125, "rewards/rejected": -5.43359375, "step": 696 }, { "epoch": 0.89216, "grad_norm": 9.464215441247289, "learning_rate": 2.4440033362731626e-08, "logits/chosen": -0.2911376953125, "logits/rejected": -0.3267822265625, "logps/chosen": -739.5, "logps/rejected": -840.5, "loss": 0.4642, "rewards/accuracies": 0.7421875, "rewards/chosen": -4.09375, "rewards/margins": 1.214111328125, "rewards/rejected": -5.30859375, "step": 697 }, { "epoch": 0.89344, "grad_norm": 11.076879696440999, "learning_rate": 2.3868258580857163e-08, "logits/chosen": -0.2603759765625, "logits/rejected": -0.3201904296875, "logps/chosen": -723.0, "logps/rejected": -831.5, "loss": 0.4242, "rewards/accuracies": 0.8125, "rewards/chosen": -3.93359375, "rewards/margins": 1.27734375, "rewards/rejected": -5.2109375, "step": 698 }, { "epoch": 0.89472, "grad_norm": 9.93416968714434, "learning_rate": 2.330301537319571e-08, "logits/chosen": -0.3101806640625, "logits/rejected": -0.36083984375, "logps/chosen": -727.5, "logps/rejected": -926.0, "loss": 0.4007, "rewards/accuracies": 0.796875, "rewards/chosen": -4.072265625, "rewards/margins": 1.513671875, "rewards/rejected": -5.5859375, "step": 699 }, { "epoch": 0.896, "grad_norm": 8.352284052025437, "learning_rate": 2.2744315060104845e-08, "logits/chosen": -0.173828125, "logits/rejected": -0.23614501953125, "logps/chosen": -693.0, "logps/rejected": -882.5, "loss": 0.3667, "rewards/accuracies": 0.828125, "rewards/chosen": -3.837890625, "rewards/margins": 1.92333984375, "rewards/rejected": -5.76171875, "step": 700 }, { "epoch": 0.896, "eval_logits/chosen": -0.2226715087890625, "eval_logits/rejected": -0.30279541015625, "eval_logps/chosen": -703.0, "eval_logps/rejected": -800.5, "eval_loss": 0.4712187647819519, "eval_rewards/accuracies": 0.74609375, "eval_rewards/chosen": -3.8193359375, "eval_rewards/margins": 1.1708984375, "eval_rewards/rejected": -4.990234375, "eval_runtime": 27.3686, "eval_samples_per_second": 18.269, "eval_steps_per_second": 0.585, "step": 700 }, { "epoch": 0.89728, "grad_norm": 9.595567585238719, "learning_rate": 2.2192168830904962e-08, "logits/chosen": -0.2698974609375, "logits/rejected": -0.294189453125, "logps/chosen": -675.5, "logps/rejected": -832.0, "loss": 0.4001, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.697265625, "rewards/margins": 1.458984375, "rewards/rejected": -5.15625, "step": 701 }, { "epoch": 0.89856, "grad_norm": 12.777276327167845, "learning_rate": 2.164658774365529e-08, "logits/chosen": -0.28564453125, "logits/rejected": -0.334320068359375, "logps/chosen": -748.5, "logps/rejected": -879.5, "loss": 0.4901, "rewards/accuracies": 0.765625, "rewards/chosen": -4.021484375, "rewards/margins": 1.2333984375, "rewards/rejected": -5.25390625, "step": 702 }, { "epoch": 0.89984, "grad_norm": 9.224895376105101, "learning_rate": 2.1107582724932088e-08, "logits/chosen": -0.229522705078125, "logits/rejected": -0.2841796875, "logps/chosen": -706.0, "logps/rejected": -798.5, "loss": 0.483, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.982421875, "rewards/margins": 1.16064453125, "rewards/rejected": -5.14453125, "step": 703 }, { "epoch": 0.90112, "grad_norm": 9.774698117520298, "learning_rate": 2.0575164569610016e-08, "logits/chosen": -0.194671630859375, "logits/rejected": -0.2525634765625, "logps/chosen": -700.0, "logps/rejected": -818.0, "loss": 0.4548, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.830078125, "rewards/margins": 1.3994140625, "rewards/rejected": -5.23046875, "step": 704 }, { "epoch": 0.9024, "grad_norm": 9.309057650942739, "learning_rate": 2.0049343940645937e-08, "logits/chosen": -0.20440673828125, "logits/rejected": -0.270263671875, "logps/chosen": -697.0, "logps/rejected": -826.5, "loss": 0.4319, "rewards/accuracies": 0.765625, "rewards/chosen": -3.767578125, "rewards/margins": 1.33984375, "rewards/rejected": -5.109375, "step": 705 }, { "epoch": 0.90368, "grad_norm": 13.991840106741984, "learning_rate": 1.953013136886541e-08, "logits/chosen": -0.2295684814453125, "logits/rejected": -0.2913818359375, "logps/chosen": -684.5, "logps/rejected": -847.0, "loss": 0.3583, "rewards/accuracies": 0.84375, "rewards/chosen": -3.62890625, "rewards/margins": 1.5439453125, "rewards/rejected": -5.16796875, "step": 706 }, { "epoch": 0.90496, "grad_norm": 8.819520427002814, "learning_rate": 1.901753725275166e-08, "logits/chosen": -0.14373779296875, "logits/rejected": -0.20538330078125, "logps/chosen": -721.5, "logps/rejected": -832.5, "loss": 0.4801, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.9609375, "rewards/margins": 1.1591796875, "rewards/rejected": -5.12109375, "step": 707 }, { "epoch": 0.90624, "grad_norm": 13.348990809308994, "learning_rate": 1.8511571858237356e-08, "logits/chosen": -0.13494873046875, "logits/rejected": -0.16680908203125, "logps/chosen": -714.0, "logps/rejected": -808.5, "loss": 0.4172, "rewards/accuracies": 0.8125, "rewards/chosen": -3.765625, "rewards/margins": 1.22314453125, "rewards/rejected": -4.984375, "step": 708 }, { "epoch": 0.90752, "grad_norm": 14.870724504070937, "learning_rate": 1.801224531849908e-08, "logits/chosen": -0.171142578125, "logits/rejected": -0.218017578125, "logps/chosen": -668.0, "logps/rejected": -773.0, "loss": 0.4573, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.64453125, "rewards/margins": 1.0498046875, "rewards/rejected": -4.69140625, "step": 709 }, { "epoch": 0.9088, "grad_norm": 11.205083148950653, "learning_rate": 1.751956763375435e-08, "logits/chosen": -0.22489166259765625, "logits/rejected": -0.2996826171875, "logps/chosen": -690.5, "logps/rejected": -798.0, "loss": 0.4199, "rewards/accuracies": 0.84375, "rewards/chosen": -3.837890625, "rewards/margins": 1.08837890625, "rewards/rejected": -4.921875, "step": 710 }, { "epoch": 0.91008, "grad_norm": 9.544243168409615, "learning_rate": 1.70335486710614e-08, "logits/chosen": -0.2523193359375, "logits/rejected": -0.2890625, "logps/chosen": -686.5, "logps/rejected": -831.0, "loss": 0.4423, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.88671875, "rewards/margins": 1.15478515625, "rewards/rejected": -5.0390625, "step": 711 }, { "epoch": 0.91136, "grad_norm": 12.6996755573186, "learning_rate": 1.6554198164121263e-08, "logits/chosen": -0.2281494140625, "logits/rejected": -0.2470703125, "logps/chosen": -725.0, "logps/rejected": -819.5, "loss": 0.4172, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.76953125, "rewards/margins": 1.28759765625, "rewards/rejected": -5.0546875, "step": 712 }, { "epoch": 0.91264, "grad_norm": 10.207786072588835, "learning_rate": 1.6081525713083427e-08, "logits/chosen": -0.20416259765625, "logits/rejected": -0.2568359375, "logps/chosen": -692.0, "logps/rejected": -842.0, "loss": 0.4328, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.908203125, "rewards/margins": 1.1376953125, "rewards/rejected": -5.04296875, "step": 713 }, { "epoch": 0.91392, "grad_norm": 10.540319829243698, "learning_rate": 1.561554078435296e-08, "logits/chosen": -0.2947998046875, "logits/rejected": -0.3328857421875, "logps/chosen": -727.0, "logps/rejected": -864.0, "loss": 0.4035, "rewards/accuracies": 0.796875, "rewards/chosen": -4.021484375, "rewards/margins": 1.36328125, "rewards/rejected": -5.38671875, "step": 714 }, { "epoch": 0.9152, "grad_norm": 15.698890404120037, "learning_rate": 1.5156252710401207e-08, "logits/chosen": -0.192352294921875, "logits/rejected": -0.27923583984375, "logps/chosen": -698.0, "logps/rejected": -886.0, "loss": 0.3362, "rewards/accuracies": 0.8671875, "rewards/chosen": -3.923828125, "rewards/margins": 1.6572265625, "rewards/rejected": -5.578125, "step": 715 }, { "epoch": 0.91648, "grad_norm": 13.504679296790746, "learning_rate": 1.4703670689578884e-08, "logits/chosen": -0.232635498046875, "logits/rejected": -0.3350830078125, "logps/chosen": -711.0, "logps/rejected": -889.5, "loss": 0.3787, "rewards/accuracies": 0.859375, "rewards/chosen": -3.85546875, "rewards/margins": 1.46630859375, "rewards/rejected": -5.32421875, "step": 716 }, { "epoch": 0.91776, "grad_norm": 15.869638544760475, "learning_rate": 1.4257803785931927e-08, "logits/chosen": -0.24554443359375, "logits/rejected": -0.268798828125, "logps/chosen": -677.0, "logps/rejected": -788.5, "loss": 0.5024, "rewards/accuracies": 0.8125, "rewards/chosen": -3.693359375, "rewards/margins": 1.0751953125, "rewards/rejected": -4.767578125, "step": 717 }, { "epoch": 0.91904, "grad_norm": 9.319905258380075, "learning_rate": 1.3818660929019715e-08, "logits/chosen": -0.240478515625, "logits/rejected": -0.2955322265625, "logps/chosen": -679.5, "logps/rejected": -822.0, "loss": 0.4519, "rewards/accuracies": 0.7578125, "rewards/chosen": -3.9453125, "rewards/margins": 1.27001953125, "rewards/rejected": -5.2109375, "step": 718 }, { "epoch": 0.92032, "grad_norm": 14.445515309974768, "learning_rate": 1.3386250913736408e-08, "logits/chosen": -0.259307861328125, "logits/rejected": -0.30859375, "logps/chosen": -763.5, "logps/rejected": -907.0, "loss": 0.4477, "rewards/accuracies": 0.78125, "rewards/chosen": -4.28515625, "rewards/margins": 1.2734375, "rewards/rejected": -5.55859375, "step": 719 }, { "epoch": 0.9216, "grad_norm": 13.180277849379051, "learning_rate": 1.2960582400134912e-08, "logits/chosen": -0.3209228515625, "logits/rejected": -0.3797607421875, "logps/chosen": -695.5, "logps/rejected": -893.5, "loss": 0.3956, "rewards/accuracies": 0.8359375, "rewards/chosen": -3.771484375, "rewards/margins": 1.61767578125, "rewards/rejected": -5.390625, "step": 720 }, { "epoch": 0.92288, "grad_norm": 12.337718937891417, "learning_rate": 1.2541663913253191e-08, "logits/chosen": -0.19183349609375, "logits/rejected": -0.258819580078125, "logps/chosen": -740.5, "logps/rejected": -852.5, "loss": 0.5031, "rewards/accuracies": 0.7265625, "rewards/chosen": -4.0703125, "rewards/margins": 1.23486328125, "rewards/rejected": -5.30078125, "step": 721 }, { "epoch": 0.92416, "grad_norm": 9.489271375743723, "learning_rate": 1.2129503842943645e-08, "logits/chosen": -0.22479248046875, "logits/rejected": -0.24298095703125, "logps/chosen": -699.0, "logps/rejected": -837.5, "loss": 0.4083, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.0859375, "rewards/margins": 1.3935546875, "rewards/rejected": -5.48046875, "step": 722 }, { "epoch": 0.92544, "grad_norm": 10.82430093944136, "learning_rate": 1.1724110443705115e-08, "logits/chosen": -0.24761962890625, "logits/rejected": -0.265869140625, "logps/chosen": -704.0, "logps/rejected": -775.0, "loss": 0.5147, "rewards/accuracies": 0.75, "rewards/chosen": -3.76953125, "rewards/margins": 0.96337890625, "rewards/rejected": -4.734375, "step": 723 }, { "epoch": 0.92672, "grad_norm": 10.645094500743772, "learning_rate": 1.1325491834517675e-08, "logits/chosen": -0.246551513671875, "logits/rejected": -0.2755126953125, "logps/chosen": -762.0, "logps/rejected": -847.5, "loss": 0.5002, "rewards/accuracies": 0.75, "rewards/chosen": -4.234375, "rewards/margins": 1.134033203125, "rewards/rejected": -5.3671875, "step": 724 }, { "epoch": 0.928, "grad_norm": 10.724174338748702, "learning_rate": 1.0933655998679653e-08, "logits/chosen": -0.199462890625, "logits/rejected": -0.2744140625, "logps/chosen": -688.0, "logps/rejected": -800.5, "loss": 0.4812, "rewards/accuracies": 0.796875, "rewards/chosen": -3.857421875, "rewards/margins": 1.195068359375, "rewards/rejected": -5.052734375, "step": 725 }, { "epoch": 0.92928, "grad_norm": 9.575511353587679, "learning_rate": 1.0548610783648198e-08, "logits/chosen": -0.1815185546875, "logits/rejected": -0.19659423828125, "logps/chosen": -692.5, "logps/rejected": -795.5, "loss": 0.4822, "rewards/accuracies": 0.78125, "rewards/chosen": -3.77734375, "rewards/margins": 1.10498046875, "rewards/rejected": -4.88671875, "step": 726 }, { "epoch": 0.93056, "grad_norm": 11.150367172489458, "learning_rate": 1.0170363900881796e-08, "logits/chosen": -0.236328125, "logits/rejected": -0.3133544921875, "logps/chosen": -714.0, "logps/rejected": -836.5, "loss": 0.4768, "rewards/accuracies": 0.796875, "rewards/chosen": -4.025390625, "rewards/margins": 1.1396484375, "rewards/rejected": -5.16015625, "step": 727 }, { "epoch": 0.93184, "grad_norm": 9.876466042629044, "learning_rate": 9.798922925685992e-09, "logits/chosen": -0.14385986328125, "logits/rejected": -0.172760009765625, "logps/chosen": -773.0, "logps/rejected": -894.5, "loss": 0.4802, "rewards/accuracies": 0.78125, "rewards/chosen": -4.216796875, "rewards/margins": 1.3310546875, "rewards/rejected": -5.5546875, "step": 728 }, { "epoch": 0.93312, "grad_norm": 13.034111105340324, "learning_rate": 9.434295297061668e-09, "logits/chosen": -0.2009124755859375, "logits/rejected": -0.24066162109375, "logps/chosen": -754.0, "logps/rejected": -856.0, "loss": 0.5133, "rewards/accuracies": 0.765625, "rewards/chosen": -4.19921875, "rewards/margins": 1.14404296875, "rewards/rejected": -5.34765625, "step": 729 }, { "epoch": 0.9344, "grad_norm": 8.571599786583675, "learning_rate": 9.076488317555886e-09, "logits/chosen": -0.32958984375, "logits/rejected": -0.383056640625, "logps/chosen": -708.5, "logps/rejected": -856.5, "loss": 0.3725, "rewards/accuracies": 0.859375, "rewards/chosen": -3.923828125, "rewards/margins": 1.5029296875, "rewards/rejected": -5.43359375, "step": 730 }, { "epoch": 0.93568, "grad_norm": 8.597215059260597, "learning_rate": 8.725509153115918e-09, "logits/chosen": -0.2611083984375, "logits/rejected": -0.32562255859375, "logps/chosen": -733.5, "logps/rejected": -851.5, "loss": 0.4326, "rewards/accuracies": 0.8125, "rewards/chosen": -4.072265625, "rewards/margins": 1.204833984375, "rewards/rejected": -5.275390625, "step": 731 }, { "epoch": 0.93696, "grad_norm": 12.286472292592713, "learning_rate": 8.381364832945459e-09, "logits/chosen": -0.28240966796875, "logits/rejected": -0.30352783203125, "logps/chosen": -752.0, "logps/rejected": -879.5, "loss": 0.4621, "rewards/accuracies": 0.78125, "rewards/chosen": -4.296875, "rewards/margins": 1.1865234375, "rewards/rejected": -5.48046875, "step": 732 }, { "epoch": 0.93824, "grad_norm": 11.941872693164612, "learning_rate": 8.044062249364047e-09, "logits/chosen": -0.18436813354492188, "logits/rejected": -0.243377685546875, "logps/chosen": -772.0, "logps/rejected": -890.5, "loss": 0.4532, "rewards/accuracies": 0.8046875, "rewards/chosen": -4.326171875, "rewards/margins": 1.33349609375, "rewards/rejected": -5.65625, "step": 733 }, { "epoch": 0.93952, "grad_norm": 10.424030696964069, "learning_rate": 7.713608157668921e-09, "logits/chosen": -0.21624755859375, "logits/rejected": -0.2452392578125, "logps/chosen": -746.0, "logps/rejected": -870.0, "loss": 0.4378, "rewards/accuracies": 0.8125, "rewards/chosen": -4.1875, "rewards/margins": 1.3876953125, "rewards/rejected": -5.57421875, "step": 734 }, { "epoch": 0.9408, "grad_norm": 14.949460184750698, "learning_rate": 7.390009175999834e-09, "logits/chosen": -0.2874755859375, "logits/rejected": -0.319091796875, "logps/chosen": -735.0, "logps/rejected": -890.5, "loss": 0.3524, "rewards/accuracies": 0.8828125, "rewards/chosen": -4.1015625, "rewards/margins": 1.51220703125, "rewards/rejected": -5.61328125, "step": 735 }, { "epoch": 0.94208, "grad_norm": 9.527587985419053, "learning_rate": 7.073271785206314e-09, "logits/chosen": -0.279052734375, "logits/rejected": -0.291259765625, "logps/chosen": -720.0, "logps/rejected": -811.5, "loss": 0.4741, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.892578125, "rewards/margins": 1.220703125, "rewards/rejected": -5.11328125, "step": 736 }, { "epoch": 0.94336, "grad_norm": 12.15877893601555, "learning_rate": 6.763402328718115e-09, "logits/chosen": -0.2161865234375, "logits/rejected": -0.2706298828125, "logps/chosen": -712.5, "logps/rejected": -831.0, "loss": 0.3963, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.826171875, "rewards/margins": 1.43603515625, "rewards/rejected": -5.26171875, "step": 737 }, { "epoch": 0.94464, "grad_norm": 8.913274393831173, "learning_rate": 6.460407012417918e-09, "logits/chosen": -0.214202880859375, "logits/rejected": -0.218719482421875, "logps/chosen": -779.0, "logps/rejected": -907.5, "loss": 0.458, "rewards/accuracies": 0.8203125, "rewards/chosen": -4.17578125, "rewards/margins": 1.3076171875, "rewards/rejected": -5.4921875, "step": 738 }, { "epoch": 0.94592, "grad_norm": 13.019357743010502, "learning_rate": 6.164291904517333e-09, "logits/chosen": -0.22491455078125, "logits/rejected": -0.2719573974609375, "logps/chosen": -704.0, "logps/rejected": -834.5, "loss": 0.4764, "rewards/accuracies": 0.71875, "rewards/chosen": -3.91015625, "rewards/margins": 1.169921875, "rewards/rejected": -5.078125, "step": 739 }, { "epoch": 0.9472, "grad_norm": 14.066472266902457, "learning_rate": 5.875062935435121e-09, "logits/chosen": -0.2550048828125, "logits/rejected": -0.27685546875, "logps/chosen": -748.5, "logps/rejected": -869.0, "loss": 0.4442, "rewards/accuracies": 0.7734375, "rewards/chosen": -4.072265625, "rewards/margins": 1.40234375, "rewards/rejected": -5.47265625, "step": 740 }, { "epoch": 0.94848, "grad_norm": 13.888471288481163, "learning_rate": 5.592725897678446e-09, "logits/chosen": -0.2496337890625, "logits/rejected": -0.2772216796875, "logps/chosen": -710.5, "logps/rejected": -837.0, "loss": 0.4921, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.01171875, "rewards/margins": 1.125, "rewards/rejected": -5.1328125, "step": 741 }, { "epoch": 0.94976, "grad_norm": 9.351078828106235, "learning_rate": 5.317286445727193e-09, "logits/chosen": -0.1865997314453125, "logits/rejected": -0.22784423828125, "logps/chosen": -721.5, "logps/rejected": -857.0, "loss": 0.4165, "rewards/accuracies": 0.8515625, "rewards/chosen": -4.095703125, "rewards/margins": 1.375, "rewards/rejected": -5.47265625, "step": 742 }, { "epoch": 0.95104, "grad_norm": 8.64152279502156, "learning_rate": 5.048750095920151e-09, "logits/chosen": -0.2593994140625, "logits/rejected": -0.294921875, "logps/chosen": -705.0, "logps/rejected": -815.5, "loss": 0.4576, "rewards/accuracies": 0.7734375, "rewards/chosen": -3.935546875, "rewards/margins": 1.132080078125, "rewards/rejected": -5.06640625, "step": 743 }, { "epoch": 0.95232, "grad_norm": 14.443814841229473, "learning_rate": 4.787122226345014e-09, "logits/chosen": -0.17755126953125, "logits/rejected": -0.22381591796875, "logps/chosen": -720.0, "logps/rejected": -811.5, "loss": 0.5046, "rewards/accuracies": 0.765625, "rewards/chosen": -3.841796875, "rewards/margins": 1.265869140625, "rewards/rejected": -5.11328125, "step": 744 }, { "epoch": 0.9536, "grad_norm": 10.100416966960784, "learning_rate": 4.532408076730504e-09, "logits/chosen": -0.1763916015625, "logits/rejected": -0.219482421875, "logps/chosen": -746.5, "logps/rejected": -882.0, "loss": 0.4617, "rewards/accuracies": 0.7890625, "rewards/chosen": -4.29296875, "rewards/margins": 1.4052734375, "rewards/rejected": -5.6953125, "step": 745 }, { "epoch": 0.95488, "grad_norm": 16.507849057578323, "learning_rate": 4.284612748341421e-09, "logits/chosen": -0.1527099609375, "logits/rejected": -0.189727783203125, "logps/chosen": -747.5, "logps/rejected": -908.5, "loss": 0.3097, "rewards/accuracies": 0.8828125, "rewards/chosen": -3.98828125, "rewards/margins": 1.7822265625, "rewards/rejected": -5.76953125, "step": 746 }, { "epoch": 0.95616, "grad_norm": 14.09994700103146, "learning_rate": 4.0437412038764826e-09, "logits/chosen": -0.1898193359375, "logits/rejected": -0.2166748046875, "logps/chosen": -748.0, "logps/rejected": -843.0, "loss": 0.409, "rewards/accuracies": 0.796875, "rewards/chosen": -4.23046875, "rewards/margins": 1.23681640625, "rewards/rejected": -5.47265625, "step": 747 }, { "epoch": 0.95744, "grad_norm": 9.05904451158265, "learning_rate": 3.80979826736893e-09, "logits/chosen": -0.25396728515625, "logits/rejected": -0.2950439453125, "logps/chosen": -742.0, "logps/rejected": -919.0, "loss": 0.383, "rewards/accuracies": 0.8046875, "rewards/chosen": -4.208984375, "rewards/margins": 1.5869140625, "rewards/rejected": -5.79296875, "step": 748 }, { "epoch": 0.95872, "grad_norm": 9.091126144194298, "learning_rate": 3.5827886240899998e-09, "logits/chosen": -0.2483062744140625, "logits/rejected": -0.3052978515625, "logps/chosen": -716.0, "logps/rejected": -846.0, "loss": 0.4343, "rewards/accuracies": 0.78125, "rewards/chosen": -3.90625, "rewards/margins": 1.361572265625, "rewards/rejected": -5.265625, "step": 749 }, { "epoch": 0.96, "grad_norm": 10.017266905551178, "learning_rate": 3.3627168204549304e-09, "logits/chosen": -0.2337646484375, "logits/rejected": -0.277587890625, "logps/chosen": -712.5, "logps/rejected": -828.5, "loss": 0.4071, "rewards/accuracies": 0.796875, "rewards/chosen": -3.931640625, "rewards/margins": 1.38330078125, "rewards/rejected": -5.31640625, "step": 750 }, { "epoch": 0.96128, "grad_norm": 14.829233382165352, "learning_rate": 3.149587263932035e-09, "logits/chosen": -0.312744140625, "logits/rejected": -0.321533203125, "logps/chosen": -753.5, "logps/rejected": -829.5, "loss": 0.5308, "rewards/accuracies": 0.75, "rewards/chosen": -4.072265625, "rewards/margins": 1.01171875, "rewards/rejected": -5.08984375, "step": 751 }, { "epoch": 0.96256, "grad_norm": 15.392472528414778, "learning_rate": 2.9434042229544544e-09, "logits/chosen": -0.29705810546875, "logits/rejected": -0.330322265625, "logps/chosen": -716.5, "logps/rejected": -865.5, "loss": 0.3916, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.953125, "rewards/margins": 1.419921875, "rewards/rejected": -5.37109375, "step": 752 }, { "epoch": 0.96384, "grad_norm": 9.893884489429142, "learning_rate": 2.744171826834474e-09, "logits/chosen": -0.2510986328125, "logits/rejected": -0.26873779296875, "logps/chosen": -753.0, "logps/rejected": -834.0, "loss": 0.4875, "rewards/accuracies": 0.7109375, "rewards/chosen": -3.814453125, "rewards/margins": 1.1533203125, "rewards/rejected": -4.9765625, "step": 753 }, { "epoch": 0.96512, "grad_norm": 14.07898237216757, "learning_rate": 2.5518940656811094e-09, "logits/chosen": -0.281005859375, "logits/rejected": -0.3043212890625, "logps/chosen": -718.5, "logps/rejected": -836.0, "loss": 0.5053, "rewards/accuracies": 0.78125, "rewards/chosen": -3.9921875, "rewards/margins": 1.32275390625, "rewards/rejected": -5.3125, "step": 754 }, { "epoch": 0.9664, "grad_norm": 15.140775410792148, "learning_rate": 2.366574790319942e-09, "logits/chosen": -0.1983642578125, "logits/rejected": -0.257568359375, "logps/chosen": -697.5, "logps/rejected": -834.5, "loss": 0.4643, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.939453125, "rewards/margins": 1.399169921875, "rewards/rejected": -5.33984375, "step": 755 }, { "epoch": 0.96768, "grad_norm": 20.58163084682585, "learning_rate": 2.188217712216217e-09, "logits/chosen": -0.274169921875, "logits/rejected": -0.28857421875, "logps/chosen": -736.0, "logps/rejected": -826.0, "loss": 0.3668, "rewards/accuracies": 0.859375, "rewards/chosen": -3.97265625, "rewards/margins": 1.29736328125, "rewards/rejected": -5.2734375, "step": 756 }, { "epoch": 0.96896, "grad_norm": 13.921962866937543, "learning_rate": 2.01682640340024e-09, "logits/chosen": -0.2674560546875, "logits/rejected": -0.28857421875, "logps/chosen": -742.5, "logps/rejected": -847.5, "loss": 0.4826, "rewards/accuracies": 0.765625, "rewards/chosen": -4.158203125, "rewards/margins": 1.14892578125, "rewards/rejected": -5.30859375, "step": 757 }, { "epoch": 0.97024, "grad_norm": 10.786302286042769, "learning_rate": 1.8524042963961096e-09, "logits/chosen": -0.2705078125, "logits/rejected": -0.306640625, "logps/chosen": -766.5, "logps/rejected": -916.5, "loss": 0.4616, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.330078125, "rewards/margins": 1.359375, "rewards/rejected": -5.6796875, "step": 758 }, { "epoch": 0.97152, "grad_norm": 8.73289653841681, "learning_rate": 1.6949546841528607e-09, "logits/chosen": -0.16156005859375, "logits/rejected": -0.21087646484375, "logps/chosen": -673.0, "logps/rejected": -794.5, "loss": 0.4352, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.763671875, "rewards/margins": 1.30078125, "rewards/rejected": -5.064453125, "step": 759 }, { "epoch": 0.9728, "grad_norm": 11.011261520088386, "learning_rate": 1.5444807199784471e-09, "logits/chosen": -0.217681884765625, "logits/rejected": -0.2626953125, "logps/chosen": -699.0, "logps/rejected": -835.5, "loss": 0.4936, "rewards/accuracies": 0.78125, "rewards/chosen": -3.9140625, "rewards/margins": 1.07958984375, "rewards/rejected": -4.9921875, "step": 760 }, { "epoch": 0.97408, "grad_norm": 20.320010738404367, "learning_rate": 1.4009854174767521e-09, "logits/chosen": -0.29150390625, "logits/rejected": -0.3443603515625, "logps/chosen": -717.5, "logps/rejected": -860.5, "loss": 0.5028, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.16796875, "rewards/margins": 1.07177734375, "rewards/rejected": -5.23828125, "step": 761 }, { "epoch": 0.97536, "grad_norm": 12.1440388856, "learning_rate": 1.264471650487009e-09, "logits/chosen": -0.253204345703125, "logits/rejected": -0.306884765625, "logps/chosen": -789.5, "logps/rejected": -921.5, "loss": 0.3703, "rewards/accuracies": 0.8359375, "rewards/chosen": -4.087890625, "rewards/margins": 1.7041015625, "rewards/rejected": -5.80078125, "step": 762 }, { "epoch": 0.97664, "grad_norm": 19.756277043960345, "learning_rate": 1.1349421530265247e-09, "logits/chosen": -0.25432395935058594, "logits/rejected": -0.265899658203125, "logps/chosen": -721.5, "logps/rejected": -802.0, "loss": 0.5194, "rewards/accuracies": 0.7578125, "rewards/chosen": -4.05859375, "rewards/margins": 1.168212890625, "rewards/rejected": -5.2265625, "step": 763 }, { "epoch": 0.97792, "grad_norm": 11.30781557188891, "learning_rate": 1.0123995192356182e-09, "logits/chosen": -0.27215576171875, "logits/rejected": -0.3157958984375, "logps/chosen": -729.5, "logps/rejected": -847.5, "loss": 0.4481, "rewards/accuracies": 0.796875, "rewards/chosen": -4.09375, "rewards/margins": 1.2841796875, "rewards/rejected": -5.37890625, "step": 764 }, { "epoch": 0.9792, "grad_norm": 11.090064368669557, "learning_rate": 8.968462033259405e-10, "logits/chosen": -0.2755126953125, "logits/rejected": -0.30224609375, "logps/chosen": -734.0, "logps/rejected": -863.5, "loss": 0.4139, "rewards/accuracies": 0.8203125, "rewards/chosen": -4.12890625, "rewards/margins": 1.259765625, "rewards/rejected": -5.390625, "step": 765 }, { "epoch": 0.98048, "grad_norm": 8.905708808696662, "learning_rate": 7.882845195312016e-10, "logits/chosen": -0.2078857421875, "logits/rejected": -0.2855224609375, "logps/chosen": -683.25, "logps/rejected": -863.5, "loss": 0.3897, "rewards/accuracies": 0.84375, "rewards/chosen": -4.033203125, "rewards/margins": 1.23291015625, "rewards/rejected": -5.26953125, "step": 766 }, { "epoch": 0.98176, "grad_norm": 13.207336578714601, "learning_rate": 6.867166420607362e-10, "logits/chosen": -0.230712890625, "logits/rejected": -0.24365234375, "logps/chosen": -749.0, "logps/rejected": -829.5, "loss": 0.4359, "rewards/accuracies": 0.8046875, "rewards/chosen": -4.16796875, "rewards/margins": 1.08740234375, "rewards/rejected": -5.2578125, "step": 767 }, { "epoch": 0.98304, "grad_norm": 15.119029537418763, "learning_rate": 5.921446050561385e-10, "logits/chosen": -0.179931640625, "logits/rejected": -0.24912261962890625, "logps/chosen": -678.5, "logps/rejected": -854.0, "loss": 0.3582, "rewards/accuracies": 0.8515625, "rewards/chosen": -3.90625, "rewards/margins": 1.5869140625, "rewards/rejected": -5.49609375, "step": 768 }, { "epoch": 0.98432, "grad_norm": 18.31926688721905, "learning_rate": 5.045703025503834e-10, "logits/chosen": -0.200286865234375, "logits/rejected": -0.2401123046875, "logps/chosen": -715.5, "logps/rejected": -845.5, "loss": 0.5413, "rewards/accuracies": 0.7421875, "rewards/chosen": -3.947265625, "rewards/margins": 1.12841796875, "rewards/rejected": -5.0703125, "step": 769 }, { "epoch": 0.9856, "grad_norm": 9.632067095301219, "learning_rate": 4.2399548842994017e-10, "logits/chosen": -0.2724609375, "logits/rejected": -0.30517578125, "logps/chosen": -731.5, "logps/rejected": -849.5, "loss": 0.4275, "rewards/accuracies": 0.796875, "rewards/chosen": -3.91796875, "rewards/margins": 1.27099609375, "rewards/rejected": -5.1875, "step": 770 }, { "epoch": 0.98688, "grad_norm": 20.57119149988017, "learning_rate": 3.5042177639972304e-10, "logits/chosen": -0.143798828125, "logits/rejected": -0.193359375, "logps/chosen": -734.0, "logps/rejected": -852.0, "loss": 0.5429, "rewards/accuracies": 0.78125, "rewards/chosen": -4.04296875, "rewards/margins": 1.1286468505859375, "rewards/rejected": -5.16796875, "step": 771 }, { "epoch": 0.98816, "grad_norm": 20.50073312922875, "learning_rate": 2.8385063995064463e-10, "logits/chosen": -0.21649169921875, "logits/rejected": -0.2684326171875, "logps/chosen": -775.0, "logps/rejected": -883.5, "loss": 0.4822, "rewards/accuracies": 0.75, "rewards/chosen": -4.328125, "rewards/margins": 1.306640625, "rewards/rejected": -5.6328125, "step": 772 }, { "epoch": 0.98944, "grad_norm": 10.125073496731055, "learning_rate": 2.2428341233012293e-10, "logits/chosen": -0.2061920166015625, "logits/rejected": -0.2589263916015625, "logps/chosen": -753.0, "logps/rejected": -847.0, "loss": 0.4463, "rewards/accuracies": 0.8125, "rewards/chosen": -4.240234375, "rewards/margins": 1.12890625, "rewards/rejected": -5.3671875, "step": 773 }, { "epoch": 0.99072, "grad_norm": 8.724529533275442, "learning_rate": 1.7172128651554151e-10, "logits/chosen": -0.259307861328125, "logits/rejected": -0.2984619140625, "logps/chosen": -720.5, "logps/rejected": -837.0, "loss": 0.4245, "rewards/accuracies": 0.828125, "rewards/chosen": -3.974609375, "rewards/margins": 1.40234375, "rewards/rejected": -5.37890625, "step": 774 }, { "epoch": 0.992, "grad_norm": 10.031507803839778, "learning_rate": 1.2616531519011876e-10, "logits/chosen": -0.2186279296875, "logits/rejected": -0.2698974609375, "logps/chosen": -761.5, "logps/rejected": -877.0, "loss": 0.4012, "rewards/accuracies": 0.8203125, "rewards/chosen": -4.119140625, "rewards/margins": 1.47802734375, "rewards/rejected": -5.58984375, "step": 775 }, { "epoch": 0.99328, "grad_norm": 11.25971739653657, "learning_rate": 8.761641072196346e-11, "logits/chosen": -0.23828125, "logits/rejected": -0.3046875, "logps/chosen": -719.0, "logps/rejected": -844.5, "loss": 0.451, "rewards/accuracies": 0.78125, "rewards/chosen": -4.099609375, "rewards/margins": 1.158203125, "rewards/rejected": -5.2578125, "step": 776 }, { "epoch": 0.99456, "grad_norm": 11.91792314235709, "learning_rate": 5.607534514585066e-11, "logits/chosen": -0.221282958984375, "logits/rejected": -0.26171875, "logps/chosen": -714.0, "logps/rejected": -831.5, "loss": 0.4559, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.826171875, "rewards/margins": 1.36328125, "rewards/rejected": -5.1875, "step": 777 }, { "epoch": 0.99584, "grad_norm": 9.429202223984744, "learning_rate": 3.154275014763952e-11, "logits/chosen": -0.1851806640625, "logits/rejected": -0.23614501953125, "logps/chosen": -694.0, "logps/rejected": -843.0, "loss": 0.3918, "rewards/accuracies": 0.8046875, "rewards/chosen": -3.8984375, "rewards/margins": 1.54296875, "rewards/rejected": -5.4453125, "step": 778 }, { "epoch": 0.99712, "grad_norm": 9.182804511873856, "learning_rate": 1.4019117051683461e-11, "logits/chosen": -0.185791015625, "logits/rejected": -0.255859375, "logps/chosen": -720.5, "logps/rejected": -881.5, "loss": 0.3743, "rewards/accuracies": 0.828125, "rewards/chosen": -3.85546875, "rewards/margins": 1.4609375, "rewards/rejected": -5.3203125, "step": 779 }, { "epoch": 0.9984, "grad_norm": 13.685193571545302, "learning_rate": 3.504796810921418e-12, "logits/chosen": -0.2198486328125, "logits/rejected": -0.26318359375, "logps/chosen": -672.0, "logps/rejected": -777.0, "loss": 0.4135, "rewards/accuracies": 0.7890625, "rewards/chosen": -3.607421875, "rewards/margins": 1.242919921875, "rewards/rejected": -4.845703125, "step": 780 }, { "epoch": 0.99968, "grad_norm": 9.030651964482605, "learning_rate": 0.0, "logits/chosen": -0.1768798828125, "logits/rejected": -0.21502685546875, "logps/chosen": -735.5, "logps/rejected": -832.5, "loss": 0.4697, "rewards/accuracies": 0.7734375, "rewards/chosen": -4.044921875, "rewards/margins": 1.16845703125, "rewards/rejected": -5.21484375, "step": 781 } ], "logging_steps": 1, "max_steps": 781, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }