{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 8826, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.662514156285391e-10, "logits/chosen": 0.2741023600101471, "logits/rejected": 0.6330793499946594, "logps/chosen": -239.01792907714844, "logps/rejected": -789.6390991210938, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 5.66251415628539e-09, "logits/chosen": 0.0353085957467556, "logits/rejected": 0.3788506090641022, "logps/chosen": -509.1393127441406, "logps/rejected": -547.4296264648438, "loss": 0.6978, "rewards/accuracies": 0.5416666865348816, "rewards/chosen": 0.004813813604414463, "rewards/margins": 0.03705020993947983, "rewards/rejected": -0.03223639726638794, "step": 10 }, { "epoch": 0.01, "learning_rate": 1.132502831257078e-08, "logits/chosen": 0.022656653076410294, "logits/rejected": 0.6099055409431458, "logps/chosen": -258.1075744628906, "logps/rejected": -445.50732421875, "loss": 0.7258, "rewards/accuracies": 0.36250001192092896, "rewards/chosen": -0.023073118180036545, "rewards/margins": -0.04769294336438179, "rewards/rejected": 0.024619827046990395, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.698754246885617e-08, "logits/chosen": 0.0007772177341394126, "logits/rejected": 0.5340547561645508, "logps/chosen": -407.92626953125, "logps/rejected": -440.33837890625, "loss": 0.7081, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0010988533031195402, "rewards/margins": 0.008491093292832375, "rewards/rejected": -0.009589947760105133, "step": 30 }, { "epoch": 0.01, "learning_rate": 2.265005662514156e-08, "logits/chosen": 0.09200962632894516, "logits/rejected": 0.5520201921463013, "logps/chosen": -275.111328125, "logps/rejected": -472.977783203125, "loss": 0.7004, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.010260184295475483, "rewards/margins": 0.024815965443849564, "rewards/rejected": -0.03507614880800247, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.8312570781426952e-08, "logits/chosen": 0.10228805243968964, "logits/rejected": 0.5600369572639465, "logps/chosen": -220.8819580078125, "logps/rejected": -583.55859375, "loss": 0.6943, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.015742352232336998, "rewards/margins": 0.005521730519831181, "rewards/rejected": 0.010220622643828392, "step": 50 }, { "epoch": 0.02, "learning_rate": 3.397508493771234e-08, "logits/chosen": -0.010508568957448006, "logits/rejected": 0.5727511644363403, "logps/chosen": -297.95880126953125, "logps/rejected": -328.6632080078125, "loss": 0.6908, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.001281719421967864, "rewards/margins": 0.007822146639227867, "rewards/rejected": -0.006540427450090647, "step": 60 }, { "epoch": 0.02, "learning_rate": 3.9637599093997736e-08, "logits/chosen": 0.10829267650842667, "logits/rejected": 0.5243688821792603, "logps/chosen": -284.8822937011719, "logps/rejected": -531.0701293945312, "loss": 0.7119, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.013552923686802387, "rewards/margins": -0.08166860044002533, "rewards/rejected": 0.09522150456905365, "step": 70 }, { "epoch": 0.03, "learning_rate": 4.530011325028312e-08, "logits/chosen": 0.015305752865970135, "logits/rejected": 0.6524732708930969, "logps/chosen": -424.30621337890625, "logps/rejected": -382.9802551269531, "loss": 0.6732, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.010578184388577938, "rewards/margins": -0.006389158312231302, "rewards/rejected": 0.01696733944118023, "step": 80 }, { "epoch": 0.03, "learning_rate": 5.096262740656852e-08, "logits/chosen": -0.02590624988079071, "logits/rejected": 0.527054488658905, "logps/chosen": -247.3843536376953, "logps/rejected": -273.6214599609375, "loss": 0.6662, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.010751227848231792, "rewards/margins": 0.041849274188280106, "rewards/rejected": -0.03109804354608059, "step": 90 }, { "epoch": 0.03, "learning_rate": 5.6625141562853904e-08, "logits/chosen": 0.08191641420125961, "logits/rejected": 0.6547849774360657, "logps/chosen": -217.2135467529297, "logps/rejected": -414.7909240722656, "loss": 0.6535, "rewards/accuracies": 0.625, "rewards/chosen": 0.07561233639717102, "rewards/margins": 0.10900970548391342, "rewards/rejected": -0.0333973728120327, "step": 100 }, { "epoch": 0.03, "eval_logits/chosen": 0.10562418401241302, "eval_logits/rejected": 0.6266094446182251, "eval_logps/chosen": -273.5692138671875, "eval_logps/rejected": -450.52972412109375, "eval_loss": 0.640408456325531, "eval_rewards/accuracies": 0.6936026811599731, "eval_rewards/chosen": 0.09124113619327545, "eval_rewards/margins": 0.1345468908548355, "eval_rewards/rejected": -0.04330575466156006, "eval_runtime": 513.2575, "eval_samples_per_second": 18.509, "eval_steps_per_second": 0.579, "step": 100 }, { "epoch": 0.04, "learning_rate": 6.22876557191393e-08, "logits/chosen": 0.15774545073509216, "logits/rejected": 0.45926588773727417, "logps/chosen": -219.19619750976562, "logps/rejected": -647.7750244140625, "loss": 0.6183, "rewards/accuracies": 0.8125, "rewards/chosen": 0.14751197397708893, "rewards/margins": 0.24957728385925293, "rewards/rejected": -0.1020653024315834, "step": 110 }, { "epoch": 0.04, "learning_rate": 6.795016987542468e-08, "logits/chosen": 0.02892078459262848, "logits/rejected": 0.4678385257720947, "logps/chosen": -241.78323364257812, "logps/rejected": -441.6668395996094, "loss": 0.6067, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.22012750804424286, "rewards/margins": 0.30746933817863464, "rewards/rejected": -0.08734184503555298, "step": 120 }, { "epoch": 0.04, "learning_rate": 7.361268403171007e-08, "logits/chosen": 0.035242605954408646, "logits/rejected": 0.6198612451553345, "logps/chosen": -286.7660217285156, "logps/rejected": -483.2093200683594, "loss": 0.5817, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.21134242415428162, "rewards/margins": 0.2862333357334137, "rewards/rejected": -0.0748908668756485, "step": 130 }, { "epoch": 0.05, "learning_rate": 7.927519818799547e-08, "logits/chosen": 0.102170929312706, "logits/rejected": 0.48904991149902344, "logps/chosen": -193.81484985351562, "logps/rejected": -513.2350463867188, "loss": 0.5752, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.15708835422992706, "rewards/margins": 0.2403644621372223, "rewards/rejected": -0.08327607810497284, "step": 140 }, { "epoch": 0.05, "learning_rate": 8.493771234428086e-08, "logits/chosen": 0.09552840888500214, "logits/rejected": 0.4914703965187073, "logps/chosen": -225.55941772460938, "logps/rejected": -459.7232971191406, "loss": 0.5588, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.22048036754131317, "rewards/margins": 0.36154818534851074, "rewards/rejected": -0.14106786251068115, "step": 150 }, { "epoch": 0.05, "learning_rate": 9.060022650056625e-08, "logits/chosen": -0.019463256001472473, "logits/rejected": 0.4858437180519104, "logps/chosen": -209.8123321533203, "logps/rejected": -439.5926818847656, "loss": 0.5406, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.2236533910036087, "rewards/margins": 0.446943998336792, "rewards/rejected": -0.22329063713550568, "step": 160 }, { "epoch": 0.06, "learning_rate": 9.626274065685163e-08, "logits/chosen": 0.13270524144172668, "logits/rejected": 0.7370689511299133, "logps/chosen": -322.776123046875, "logps/rejected": -396.0201416015625, "loss": 0.4876, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.2954064905643463, "rewards/margins": 0.48141050338745117, "rewards/rejected": -0.18600401282310486, "step": 170 }, { "epoch": 0.06, "learning_rate": 1.0192525481313703e-07, "logits/chosen": -0.03725407272577286, "logits/rejected": 0.577723503112793, "logps/chosen": -353.58685302734375, "logps/rejected": -492.463134765625, "loss": 0.4638, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.3955172002315521, "rewards/margins": 0.6526498794555664, "rewards/rejected": -0.25713270902633667, "step": 180 }, { "epoch": 0.06, "learning_rate": 1.0758776896942241e-07, "logits/chosen": 0.08514745533466339, "logits/rejected": 0.5692216157913208, "logps/chosen": -225.74462890625, "logps/rejected": -450.30548095703125, "loss": 0.4202, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5971451997756958, "rewards/margins": 0.9192669987678528, "rewards/rejected": -0.3221217691898346, "step": 190 }, { "epoch": 0.07, "learning_rate": 1.1325028312570781e-07, "logits/chosen": 0.014504434540867805, "logits/rejected": 0.594152569770813, "logps/chosen": -275.9012451171875, "logps/rejected": -355.86077880859375, "loss": 0.4293, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.4155961871147156, "rewards/margins": 0.7575788497924805, "rewards/rejected": -0.3419826924800873, "step": 200 }, { "epoch": 0.07, "eval_logits/chosen": 0.13245438039302826, "eval_logits/rejected": 0.6425116658210754, "eval_logps/chosen": -269.52484130859375, "eval_logps/rejected": -454.1286926269531, "eval_loss": 0.4021265208721161, "eval_rewards/accuracies": 0.8998316526412964, "eval_rewards/chosen": 0.4956779479980469, "eval_rewards/margins": 0.8988770842552185, "eval_rewards/rejected": -0.40319907665252686, "eval_runtime": 515.758, "eval_samples_per_second": 18.419, "eval_steps_per_second": 0.576, "step": 200 }, { "epoch": 0.07, "learning_rate": 1.189127972819932e-07, "logits/chosen": 0.08997168391942978, "logits/rejected": 0.34949326515197754, "logps/chosen": -212.4152374267578, "logps/rejected": -612.3704833984375, "loss": 0.3802, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6735544800758362, "rewards/margins": 1.0593321323394775, "rewards/rejected": -0.38577771186828613, "step": 210 }, { "epoch": 0.07, "learning_rate": 1.245753114382786e-07, "logits/chosen": 0.08834132552146912, "logits/rejected": 0.5508908033370972, "logps/chosen": -333.9996032714844, "logps/rejected": -380.7149963378906, "loss": 0.3575, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.6490951776504517, "rewards/margins": 0.9705212712287903, "rewards/rejected": -0.32142606377601624, "step": 220 }, { "epoch": 0.08, "learning_rate": 1.3023782559456398e-07, "logits/chosen": 0.07101895660161972, "logits/rejected": 0.4958924353122711, "logps/chosen": -287.037353515625, "logps/rejected": -473.2584533691406, "loss": 0.3496, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.685560405254364, "rewards/margins": 1.0336828231811523, "rewards/rejected": -0.3481225371360779, "step": 230 }, { "epoch": 0.08, "learning_rate": 1.3590033975084937e-07, "logits/chosen": 0.09266692399978638, "logits/rejected": 0.6038914918899536, "logps/chosen": -273.4285583496094, "logps/rejected": -597.8888549804688, "loss": 0.3553, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.5568755865097046, "rewards/margins": 1.1175185441970825, "rewards/rejected": -0.5606428384780884, "step": 240 }, { "epoch": 0.08, "learning_rate": 1.4156285390713476e-07, "logits/chosen": 0.10122789442539215, "logits/rejected": 0.6531853675842285, "logps/chosen": -215.95217895507812, "logps/rejected": -432.19482421875, "loss": 0.3496, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.7165228128433228, "rewards/margins": 1.299408197402954, "rewards/rejected": -0.5828853249549866, "step": 250 }, { "epoch": 0.09, "learning_rate": 1.4722536806342014e-07, "logits/chosen": 0.006140911485999823, "logits/rejected": 0.41502633690834045, "logps/chosen": -275.1268615722656, "logps/rejected": -469.67364501953125, "loss": 0.331, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6666760444641113, "rewards/margins": 1.2736904621124268, "rewards/rejected": -0.6070144176483154, "step": 260 }, { "epoch": 0.09, "learning_rate": 1.5288788221970556e-07, "logits/chosen": 0.13538256287574768, "logits/rejected": 0.7110094428062439, "logps/chosen": -254.85971069335938, "logps/rejected": -326.21075439453125, "loss": 0.3095, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.725599467754364, "rewards/margins": 1.6757152080535889, "rewards/rejected": -0.9501156806945801, "step": 270 }, { "epoch": 0.1, "learning_rate": 1.5855039637599094e-07, "logits/chosen": 0.08335123211145401, "logits/rejected": 0.8205634355545044, "logps/chosen": -220.60079956054688, "logps/rejected": -302.47802734375, "loss": 0.3141, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6893803477287292, "rewards/margins": 1.6645898818969727, "rewards/rejected": -0.9752095937728882, "step": 280 }, { "epoch": 0.1, "learning_rate": 1.642129105322763e-07, "logits/chosen": 0.09892099350690842, "logits/rejected": 0.6533292531967163, "logps/chosen": -192.48194885253906, "logps/rejected": -457.379150390625, "loss": 0.2871, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6650089025497437, "rewards/margins": 1.6284023523330688, "rewards/rejected": -0.9633933901786804, "step": 290 }, { "epoch": 0.1, "learning_rate": 1.6987542468856172e-07, "logits/chosen": 0.1017051488161087, "logits/rejected": 0.7949822545051575, "logps/chosen": -211.03762817382812, "logps/rejected": -551.3566284179688, "loss": 0.2685, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8636614680290222, "rewards/margins": 2.1299757957458496, "rewards/rejected": -1.2663145065307617, "step": 300 }, { "epoch": 0.1, "eval_logits/chosen": 0.15333408117294312, "eval_logits/rejected": 0.6699573993682861, "eval_logps/chosen": -265.7525634765625, "eval_logps/rejected": -460.72113037109375, "eval_loss": 0.24641236662864685, "eval_rewards/accuracies": 0.93855220079422, "eval_rewards/chosen": 0.8729060888290405, "eval_rewards/margins": 1.9353501796722412, "eval_rewards/rejected": -1.0624442100524902, "eval_runtime": 517.1156, "eval_samples_per_second": 18.371, "eval_steps_per_second": 0.574, "step": 300 }, { "epoch": 0.11, "learning_rate": 1.755379388448471e-07, "logits/chosen": 0.18220114707946777, "logits/rejected": 0.5355657935142517, "logps/chosen": -217.20333862304688, "logps/rejected": -443.2105407714844, "loss": 0.2324, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.9907919764518738, "rewards/margins": 2.091484785079956, "rewards/rejected": -1.1006931066513062, "step": 310 }, { "epoch": 0.11, "learning_rate": 1.812004530011325e-07, "logits/chosen": -0.040408771485090256, "logits/rejected": 0.5952903628349304, "logps/chosen": -406.0402526855469, "logps/rejected": -369.89453125, "loss": 0.2251, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.1152513027191162, "rewards/margins": 2.2655866146087646, "rewards/rejected": -1.150335431098938, "step": 320 }, { "epoch": 0.11, "learning_rate": 1.868629671574179e-07, "logits/chosen": 0.12294594943523407, "logits/rejected": 0.6529209017753601, "logps/chosen": -277.2166748046875, "logps/rejected": -557.4232177734375, "loss": 0.229, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1751363277435303, "rewards/margins": 2.7529709339141846, "rewards/rejected": -1.5778348445892334, "step": 330 }, { "epoch": 0.12, "learning_rate": 1.9252548131370327e-07, "logits/chosen": 0.06886116415262222, "logits/rejected": 0.5941012501716614, "logps/chosen": -198.62258911132812, "logps/rejected": -442.09649658203125, "loss": 0.2082, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.1945065259933472, "rewards/margins": 2.538296699523926, "rewards/rejected": -1.343790054321289, "step": 340 }, { "epoch": 0.12, "learning_rate": 1.9818799546998865e-07, "logits/chosen": 0.07824553549289703, "logits/rejected": 0.4021661877632141, "logps/chosen": -255.095703125, "logps/rejected": -552.4793090820312, "loss": 0.2129, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.264046311378479, "rewards/margins": 2.7580389976501465, "rewards/rejected": -1.4939930438995361, "step": 350 }, { "epoch": 0.12, "learning_rate": 2.0385050962627407e-07, "logits/chosen": 0.019232342019677162, "logits/rejected": 0.6419030427932739, "logps/chosen": -196.41619873046875, "logps/rejected": -380.11358642578125, "loss": 0.1799, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.1300694942474365, "rewards/margins": 2.9319968223571777, "rewards/rejected": -1.8019275665283203, "step": 360 }, { "epoch": 0.13, "learning_rate": 2.0951302378255946e-07, "logits/chosen": 0.26184606552124023, "logits/rejected": 0.43805593252182007, "logps/chosen": -202.53578186035156, "logps/rejected": -713.5509033203125, "loss": 0.198, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.0671663284301758, "rewards/margins": 2.6391656398773193, "rewards/rejected": -1.5719993114471436, "step": 370 }, { "epoch": 0.13, "learning_rate": 2.1517553793884482e-07, "logits/chosen": 0.1692897528409958, "logits/rejected": 0.5646194815635681, "logps/chosen": -226.83740234375, "logps/rejected": -600.69775390625, "loss": 0.1884, "rewards/accuracies": 0.9375, "rewards/chosen": 1.344738245010376, "rewards/margins": 2.8761115074157715, "rewards/rejected": -1.531373381614685, "step": 380 }, { "epoch": 0.13, "learning_rate": 2.2083805209513023e-07, "logits/chosen": 0.0928935557603836, "logits/rejected": 0.5613464117050171, "logps/chosen": -280.67047119140625, "logps/rejected": -367.80291748046875, "loss": 0.1997, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.0904172658920288, "rewards/margins": 2.5656912326812744, "rewards/rejected": -1.4752737283706665, "step": 390 }, { "epoch": 0.14, "learning_rate": 2.2650056625141562e-07, "logits/chosen": 0.16066524386405945, "logits/rejected": 0.7405991554260254, "logps/chosen": -210.985595703125, "logps/rejected": -547.41259765625, "loss": 0.1918, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.4568512439727783, "rewards/margins": 3.595226764678955, "rewards/rejected": -2.1383750438690186, "step": 400 }, { "epoch": 0.14, "eval_logits/chosen": 0.171905517578125, "eval_logits/rejected": 0.6917754411697388, "eval_logps/chosen": -262.5002746582031, "eval_logps/rejected": -467.8528137207031, "eval_loss": 0.17915955185890198, "eval_rewards/accuracies": 0.945286214351654, "eval_rewards/chosen": 1.1981322765350342, "eval_rewards/margins": 2.973737955093384, "eval_rewards/rejected": -1.77560555934906, "eval_runtime": 515.8236, "eval_samples_per_second": 18.417, "eval_steps_per_second": 0.576, "step": 400 }, { "epoch": 0.14, "learning_rate": 2.32163080407701e-07, "logits/chosen": 0.2574116885662079, "logits/rejected": 0.40326786041259766, "logps/chosen": -208.1060791015625, "logps/rejected": -455.02459716796875, "loss": 0.1761, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5561015605926514, "rewards/margins": 2.8797719478607178, "rewards/rejected": -1.3236703872680664, "step": 410 }, { "epoch": 0.14, "learning_rate": 2.378255945639864e-07, "logits/chosen": 0.2431601732969284, "logits/rejected": 0.4062012732028961, "logps/chosen": -230.39535522460938, "logps/rejected": -639.6707763671875, "loss": 0.2169, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.2638490200042725, "rewards/margins": 2.9768500328063965, "rewards/rejected": -1.7130008935928345, "step": 420 }, { "epoch": 0.15, "learning_rate": 2.434881087202718e-07, "logits/chosen": 0.16272658109664917, "logits/rejected": 0.509204089641571, "logps/chosen": -197.6181182861328, "logps/rejected": -555.8047485351562, "loss": 0.1608, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.5779691934585571, "rewards/margins": 3.402552843093872, "rewards/rejected": -1.824583649635315, "step": 430 }, { "epoch": 0.15, "learning_rate": 2.491506228765572e-07, "logits/chosen": 0.0665513426065445, "logits/rejected": 0.7031861543655396, "logps/chosen": -202.7735595703125, "logps/rejected": -411.5433654785156, "loss": 0.1573, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.4196579456329346, "rewards/margins": 3.5814902782440186, "rewards/rejected": -2.161832332611084, "step": 440 }, { "epoch": 0.15, "learning_rate": 2.548131370328426e-07, "logits/chosen": 0.011431233957409859, "logits/rejected": 0.8608293533325195, "logps/chosen": -248.83590698242188, "logps/rejected": -343.0068359375, "loss": 0.1695, "rewards/accuracies": 1.0, "rewards/chosen": 1.1369214057922363, "rewards/margins": 3.6994431018829346, "rewards/rejected": -2.5625216960906982, "step": 450 }, { "epoch": 0.16, "learning_rate": 2.6047565118912797e-07, "logits/chosen": 0.07334133982658386, "logits/rejected": 0.67508465051651, "logps/chosen": -275.1880798339844, "logps/rejected": -381.44189453125, "loss": 0.1505, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.4296824932098389, "rewards/margins": 3.8819026947021484, "rewards/rejected": -2.4522202014923096, "step": 460 }, { "epoch": 0.16, "learning_rate": 2.6613816534541335e-07, "logits/chosen": -0.00823259074240923, "logits/rejected": 0.6633954048156738, "logps/chosen": -352.98504638671875, "logps/rejected": -232.8203125, "loss": 0.1588, "rewards/accuracies": 0.9375, "rewards/chosen": 1.5089671611785889, "rewards/margins": 3.4772255420684814, "rewards/rejected": -1.9682585000991821, "step": 470 }, { "epoch": 0.16, "learning_rate": 2.7180067950169874e-07, "logits/chosen": 0.24960267543792725, "logits/rejected": 0.3265121877193451, "logps/chosen": -199.93234252929688, "logps/rejected": -574.55029296875, "loss": 0.1462, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.5233027935028076, "rewards/margins": 3.691254138946533, "rewards/rejected": -2.1679508686065674, "step": 480 }, { "epoch": 0.17, "learning_rate": 2.7746319365798413e-07, "logits/chosen": 0.20512044429779053, "logits/rejected": 0.5327206254005432, "logps/chosen": -217.4558563232422, "logps/rejected": -463.5848083496094, "loss": 0.147, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.0032362937927246, "rewards/margins": 4.386998176574707, "rewards/rejected": -2.3837618827819824, "step": 490 }, { "epoch": 0.17, "learning_rate": 2.831257078142695e-07, "logits/chosen": 0.04758840054273605, "logits/rejected": 0.6948806047439575, "logps/chosen": -342.4734802246094, "logps/rejected": -271.96844482421875, "loss": 0.1409, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.3732922077178955, "rewards/margins": 3.609480619430542, "rewards/rejected": -2.2361884117126465, "step": 500 }, { "epoch": 0.17, "eval_logits/chosen": 0.17720533907413483, "eval_logits/rejected": 0.7139342427253723, "eval_logps/chosen": -260.4028015136719, "eval_logps/rejected": -474.3183898925781, "eval_loss": 0.14233975112438202, "eval_rewards/accuracies": 0.9503366947174072, "eval_rewards/chosen": 1.4078826904296875, "eval_rewards/margins": 3.830057144165039, "eval_rewards/rejected": -2.4221744537353516, "eval_runtime": 516.0981, "eval_samples_per_second": 18.407, "eval_steps_per_second": 0.575, "step": 500 }, { "epoch": 0.17, "learning_rate": 2.887882219705549e-07, "logits/chosen": 0.01903020776808262, "logits/rejected": 0.5553046464920044, "logps/chosen": -308.3576965332031, "logps/rejected": -595.6212158203125, "loss": 0.1539, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 1.6416337490081787, "rewards/margins": 3.939124345779419, "rewards/rejected": -2.297490119934082, "step": 510 }, { "epoch": 0.18, "learning_rate": 2.944507361268403e-07, "logits/chosen": 0.11918652057647705, "logits/rejected": 0.586240291595459, "logps/chosen": -213.510986328125, "logps/rejected": -494.881103515625, "loss": 0.1194, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.59457528591156, "rewards/margins": 3.994372606277466, "rewards/rejected": -2.399796962738037, "step": 520 }, { "epoch": 0.18, "learning_rate": 3.001132502831257e-07, "logits/chosen": 0.23530828952789307, "logits/rejected": 0.6257737874984741, "logps/chosen": -202.90969848632812, "logps/rejected": -540.7300415039062, "loss": 0.1505, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6795040369033813, "rewards/margins": 4.440317630767822, "rewards/rejected": -2.7608137130737305, "step": 530 }, { "epoch": 0.18, "learning_rate": 3.057757644394111e-07, "logits/chosen": 0.054154396057128906, "logits/rejected": 0.674737811088562, "logps/chosen": -193.75775146484375, "logps/rejected": -381.66485595703125, "loss": 0.1236, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5483448505401611, "rewards/margins": 4.259047031402588, "rewards/rejected": -2.710702419281006, "step": 540 }, { "epoch": 0.19, "learning_rate": 3.114382785956965e-07, "logits/chosen": 0.1885976493358612, "logits/rejected": 0.5661032199859619, "logps/chosen": -204.5492706298828, "logps/rejected": -449.87567138671875, "loss": 0.1453, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.608193039894104, "rewards/margins": 4.289790153503418, "rewards/rejected": -2.6815969944000244, "step": 550 }, { "epoch": 0.19, "learning_rate": 3.171007927519819e-07, "logits/chosen": 0.0016541056102141738, "logits/rejected": 0.6388057470321655, "logps/chosen": -320.69488525390625, "logps/rejected": -383.7098388671875, "loss": 0.1215, "rewards/accuracies": 0.9375, "rewards/chosen": 1.575046420097351, "rewards/margins": 4.163811683654785, "rewards/rejected": -2.5887656211853027, "step": 560 }, { "epoch": 0.19, "learning_rate": 3.227633069082673e-07, "logits/chosen": 0.00697109242901206, "logits/rejected": 0.6750127077102661, "logps/chosen": -373.73822021484375, "logps/rejected": -289.76300048828125, "loss": 0.1023, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.7383673191070557, "rewards/margins": 4.731043815612793, "rewards/rejected": -2.9926772117614746, "step": 570 }, { "epoch": 0.2, "learning_rate": 3.284258210645526e-07, "logits/chosen": -0.003050741506740451, "logits/rejected": 0.6652949452400208, "logps/chosen": -332.9758605957031, "logps/rejected": -460.3736877441406, "loss": 0.1497, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8108749389648438, "rewards/margins": 4.913354873657227, "rewards/rejected": -3.102479934692383, "step": 580 }, { "epoch": 0.2, "learning_rate": 3.34088335220838e-07, "logits/chosen": 0.15606285631656647, "logits/rejected": 0.6190292239189148, "logps/chosen": -203.41131591796875, "logps/rejected": -347.80413818359375, "loss": 0.1309, "rewards/accuracies": 1.0, "rewards/chosen": 1.704292893409729, "rewards/margins": 4.940404891967773, "rewards/rejected": -3.236112117767334, "step": 590 }, { "epoch": 0.2, "learning_rate": 3.3975084937712344e-07, "logits/chosen": 0.054466117173433304, "logits/rejected": 0.644476592540741, "logps/chosen": -325.1722412109375, "logps/rejected": -600.5335693359375, "loss": 0.1437, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.6468417644500732, "rewards/margins": 4.704212188720703, "rewards/rejected": -3.057370662689209, "step": 600 }, { "epoch": 0.2, "eval_logits/chosen": 0.15705275535583496, "eval_logits/rejected": 0.7178868651390076, "eval_logps/chosen": -258.42791748046875, "eval_logps/rejected": -481.10595703125, "eval_loss": 0.12315016239881516, "eval_rewards/accuracies": 0.9537037014961243, "eval_rewards/chosen": 1.6053673028945923, "eval_rewards/margins": 4.70629358291626, "eval_rewards/rejected": -3.100926160812378, "eval_runtime": 519.4664, "eval_samples_per_second": 18.288, "eval_steps_per_second": 0.572, "step": 600 }, { "epoch": 0.21, "learning_rate": 3.454133635334088e-07, "logits/chosen": 0.04422450810670853, "logits/rejected": 0.6172502636909485, "logps/chosen": -251.5245361328125, "logps/rejected": -414.25921630859375, "loss": 0.1222, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 1.2418068647384644, "rewards/margins": 4.487616539001465, "rewards/rejected": -3.245809555053711, "step": 610 }, { "epoch": 0.21, "learning_rate": 3.510758776896942e-07, "logits/chosen": 0.08818056434392929, "logits/rejected": 0.4936336576938629, "logps/chosen": -341.56683349609375, "logps/rejected": -546.2991943359375, "loss": 0.1196, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.90375554561615, "rewards/margins": 4.793130397796631, "rewards/rejected": -2.8893752098083496, "step": 620 }, { "epoch": 0.21, "learning_rate": 3.567383918459796e-07, "logits/chosen": -0.04182681068778038, "logits/rejected": 0.7070735096931458, "logps/chosen": -304.26953125, "logps/rejected": -262.75164794921875, "loss": 0.1312, "rewards/accuracies": 0.9375, "rewards/chosen": 1.4209303855895996, "rewards/margins": 4.760873317718506, "rewards/rejected": -3.3399434089660645, "step": 630 }, { "epoch": 0.22, "learning_rate": 3.62400906002265e-07, "logits/chosen": 0.03215979412198067, "logits/rejected": 0.7907129526138306, "logps/chosen": -266.03741455078125, "logps/rejected": -524.769775390625, "loss": 0.0988, "rewards/accuracies": 1.0, "rewards/chosen": 1.821491003036499, "rewards/margins": 5.66542911529541, "rewards/rejected": -3.843938112258911, "step": 640 }, { "epoch": 0.22, "learning_rate": 3.6806342015855037e-07, "logits/chosen": -0.03710964322090149, "logits/rejected": 0.8121824264526367, "logps/chosen": -256.49951171875, "logps/rejected": -369.80438232421875, "loss": 0.1001, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6975791454315186, "rewards/margins": 5.688068389892578, "rewards/rejected": -3.9904892444610596, "step": 650 }, { "epoch": 0.22, "learning_rate": 3.737259343148358e-07, "logits/chosen": 0.1600877046585083, "logits/rejected": 0.7602616548538208, "logps/chosen": -207.41317749023438, "logps/rejected": -497.0277404785156, "loss": 0.0957, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 1.784257173538208, "rewards/margins": 5.802565097808838, "rewards/rejected": -4.018307685852051, "step": 660 }, { "epoch": 0.23, "learning_rate": 3.7938844847112115e-07, "logits/chosen": 0.05264540761709213, "logits/rejected": 0.7355621457099915, "logps/chosen": -242.3369598388672, "logps/rejected": -578.5082397460938, "loss": 0.1312, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.3727463483810425, "rewards/margins": 5.323803901672363, "rewards/rejected": -3.9510574340820312, "step": 670 }, { "epoch": 0.23, "learning_rate": 3.8505096262740653e-07, "logits/chosen": 0.0809575766324997, "logits/rejected": 0.8300803899765015, "logps/chosen": -214.95285034179688, "logps/rejected": -521.3463134765625, "loss": 0.0727, "rewards/accuracies": 1.0, "rewards/chosen": 1.7277336120605469, "rewards/margins": 5.684216022491455, "rewards/rejected": -3.95648193359375, "step": 680 }, { "epoch": 0.23, "learning_rate": 3.907134767836919e-07, "logits/chosen": 0.12036246061325073, "logits/rejected": 0.6469853520393372, "logps/chosen": -297.1856689453125, "logps/rejected": -526.7928466796875, "loss": 0.1245, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.0920958518981934, "rewards/margins": 5.928999423980713, "rewards/rejected": -3.8369040489196777, "step": 690 }, { "epoch": 0.24, "learning_rate": 3.963759909399773e-07, "logits/chosen": 0.08231475949287415, "logits/rejected": 0.6511551141738892, "logps/chosen": -252.63674926757812, "logps/rejected": -549.3499145507812, "loss": 0.0747, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1775269508361816, "rewards/margins": 6.12094783782959, "rewards/rejected": -3.9434211254119873, "step": 700 }, { "epoch": 0.24, "eval_logits/chosen": 0.1324591040611267, "eval_logits/rejected": 0.7179297804832458, "eval_logps/chosen": -257.2913513183594, "eval_logps/rejected": -489.81787109375, "eval_loss": 0.10570324212312698, "eval_rewards/accuracies": 0.9612794518470764, "eval_rewards/chosen": 1.7190241813659668, "eval_rewards/margins": 5.691140651702881, "eval_rewards/rejected": -3.972116708755493, "eval_runtime": 518.7578, "eval_samples_per_second": 18.313, "eval_steps_per_second": 0.573, "step": 700 }, { "epoch": 0.24, "learning_rate": 4.0203850509626275e-07, "logits/chosen": -0.03588678687810898, "logits/rejected": 0.6093950867652893, "logps/chosen": -252.34799194335938, "logps/rejected": -381.96868896484375, "loss": 0.1055, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.915358304977417, "rewards/margins": 5.615798473358154, "rewards/rejected": -3.7004406452178955, "step": 710 }, { "epoch": 0.24, "learning_rate": 4.0770101925254814e-07, "logits/chosen": 0.060346126556396484, "logits/rejected": 0.6727169752120972, "logps/chosen": -219.79354858398438, "logps/rejected": -416.0438537597656, "loss": 0.1041, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.6099984645843506, "rewards/margins": 5.61789083480835, "rewards/rejected": -4.007891654968262, "step": 720 }, { "epoch": 0.25, "learning_rate": 4.133635334088335e-07, "logits/chosen": 0.00713381776586175, "logits/rejected": 0.6316781044006348, "logps/chosen": -281.59881591796875, "logps/rejected": -460.47637939453125, "loss": 0.1044, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.0221705436706543, "rewards/margins": 5.6202592849731445, "rewards/rejected": -3.5980887413024902, "step": 730 }, { "epoch": 0.25, "learning_rate": 4.190260475651189e-07, "logits/chosen": 0.015939956530928612, "logits/rejected": 0.5051840543746948, "logps/chosen": -280.57171630859375, "logps/rejected": -403.70526123046875, "loss": 0.1052, "rewards/accuracies": 0.9375, "rewards/chosen": 1.6558685302734375, "rewards/margins": 5.355403900146484, "rewards/rejected": -3.699535369873047, "step": 740 }, { "epoch": 0.25, "learning_rate": 4.2468856172140424e-07, "logits/chosen": 0.15295177698135376, "logits/rejected": 0.664870023727417, "logps/chosen": -191.7448272705078, "logps/rejected": -523.5913696289062, "loss": 0.0744, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8847191333770752, "rewards/margins": 6.07841682434082, "rewards/rejected": -4.193697929382324, "step": 750 }, { "epoch": 0.26, "learning_rate": 4.3035107587768963e-07, "logits/chosen": 0.02958027645945549, "logits/rejected": 0.5981519222259521, "logps/chosen": -204.9509735107422, "logps/rejected": -372.2116394042969, "loss": 0.1231, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.7095353603363037, "rewards/margins": 5.460729598999023, "rewards/rejected": -3.7511940002441406, "step": 760 }, { "epoch": 0.26, "learning_rate": 4.3601359003397507e-07, "logits/chosen": -0.01379103772342205, "logits/rejected": 0.551000714302063, "logps/chosen": -322.20343017578125, "logps/rejected": -477.31060791015625, "loss": 0.0876, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.0060315132141113, "rewards/margins": 5.686766624450684, "rewards/rejected": -3.6807351112365723, "step": 770 }, { "epoch": 0.27, "learning_rate": 4.4167610419026046e-07, "logits/chosen": 0.05719362571835518, "logits/rejected": 0.677726686000824, "logps/chosen": -331.1451416015625, "logps/rejected": -588.7894287109375, "loss": 0.0997, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.004178524017334, "rewards/margins": 6.221012115478516, "rewards/rejected": -4.216833591461182, "step": 780 }, { "epoch": 0.27, "learning_rate": 4.4733861834654585e-07, "logits/chosen": 0.0018184438813477755, "logits/rejected": 0.6527267694473267, "logps/chosen": -258.1215515136719, "logps/rejected": -324.6481018066406, "loss": 0.0926, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0702271461486816, "rewards/margins": 5.982466220855713, "rewards/rejected": -3.9122390747070312, "step": 790 }, { "epoch": 0.27, "learning_rate": 4.5300113250283123e-07, "logits/chosen": -0.01676514372229576, "logits/rejected": 0.49802160263061523, "logps/chosen": -263.2022399902344, "logps/rejected": -446.6065979003906, "loss": 0.0848, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.9270559549331665, "rewards/margins": 5.6302690505981445, "rewards/rejected": -3.7032134532928467, "step": 800 }, { "epoch": 0.27, "eval_logits/chosen": 0.10244535654783249, "eval_logits/rejected": 0.7047103643417358, "eval_logps/chosen": -255.94149780273438, "eval_logps/rejected": -493.3857727050781, "eval_loss": 0.08630378544330597, "eval_rewards/accuracies": 0.9688552021980286, "eval_rewards/chosen": 1.854011058807373, "eval_rewards/margins": 6.182919502258301, "eval_rewards/rejected": -4.328908443450928, "eval_runtime": 518.4883, "eval_samples_per_second": 18.322, "eval_steps_per_second": 0.573, "step": 800 }, { "epoch": 0.28, "learning_rate": 4.586636466591166e-07, "logits/chosen": 0.04462115466594696, "logits/rejected": 0.6387485265731812, "logps/chosen": -227.82565307617188, "logps/rejected": -586.64892578125, "loss": 0.0801, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.0227081775665283, "rewards/margins": 6.601919651031494, "rewards/rejected": -4.579211235046387, "step": 810 }, { "epoch": 0.28, "learning_rate": 4.64326160815402e-07, "logits/chosen": 0.05858244374394417, "logits/rejected": 0.6435064077377319, "logps/chosen": -357.1573181152344, "logps/rejected": -354.05987548828125, "loss": 0.067, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.7805042266845703, "rewards/margins": 5.7817182540893555, "rewards/rejected": -4.001214504241943, "step": 820 }, { "epoch": 0.28, "learning_rate": 4.6998867497168745e-07, "logits/chosen": 0.10970975458621979, "logits/rejected": 0.5290058851242065, "logps/chosen": -187.33468627929688, "logps/rejected": -536.2308959960938, "loss": 0.0996, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.000274181365967, "rewards/margins": 6.211495399475098, "rewards/rejected": -4.211220741271973, "step": 830 }, { "epoch": 0.29, "learning_rate": 4.756511891279728e-07, "logits/chosen": -0.0515323169529438, "logits/rejected": 0.6379265785217285, "logps/chosen": -268.2336120605469, "logps/rejected": -396.9682922363281, "loss": 0.0678, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.4188361167907715, "rewards/margins": 6.625235557556152, "rewards/rejected": -4.206398963928223, "step": 840 }, { "epoch": 0.29, "learning_rate": 4.813137032842582e-07, "logits/chosen": 0.0643363893032074, "logits/rejected": 0.4893170893192291, "logps/chosen": -278.02386474609375, "logps/rejected": -675.4198608398438, "loss": 0.0897, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 2.0273776054382324, "rewards/margins": 5.812685966491699, "rewards/rejected": -3.7853081226348877, "step": 850 }, { "epoch": 0.29, "learning_rate": 4.869762174405436e-07, "logits/chosen": 0.01620934158563614, "logits/rejected": 0.5817543864250183, "logps/chosen": -254.25173950195312, "logps/rejected": -488.99609375, "loss": 0.0793, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.993406057357788, "rewards/margins": 6.334799289703369, "rewards/rejected": -4.341392517089844, "step": 860 }, { "epoch": 0.3, "learning_rate": 4.92638731596829e-07, "logits/chosen": 0.01186728198081255, "logits/rejected": 0.47720083594322205, "logps/chosen": -347.5317077636719, "logps/rejected": -561.3574829101562, "loss": 0.0797, "rewards/accuracies": 0.9375, "rewards/chosen": 2.4026613235473633, "rewards/margins": 6.6899285316467285, "rewards/rejected": -4.287266731262207, "step": 870 }, { "epoch": 0.3, "learning_rate": 4.983012457531144e-07, "logits/chosen": 0.023154649883508682, "logits/rejected": 0.5905860662460327, "logps/chosen": -179.84628295898438, "logps/rejected": -422.66986083984375, "loss": 0.0574, "rewards/accuracies": 1.0, "rewards/chosen": 2.2599828243255615, "rewards/margins": 6.9955339431762695, "rewards/rejected": -4.735550880432129, "step": 880 }, { "epoch": 0.3, "learning_rate": 4.995593604431575e-07, "logits/chosen": -0.022603686898946762, "logits/rejected": 0.5722736716270447, "logps/chosen": -255.6490478515625, "logps/rejected": -341.39581298828125, "loss": 0.0725, "rewards/accuracies": 1.0, "rewards/chosen": 1.8452646732330322, "rewards/margins": 6.381416320800781, "rewards/rejected": -4.536151885986328, "step": 890 }, { "epoch": 0.31, "learning_rate": 4.989298753619539e-07, "logits/chosen": -0.008167101070284843, "logits/rejected": 0.7301871180534363, "logps/chosen": -210.7258758544922, "logps/rejected": -428.51025390625, "loss": 0.0853, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 2.0360567569732666, "rewards/margins": 7.382386207580566, "rewards/rejected": -5.346329212188721, "step": 900 }, { "epoch": 0.31, "eval_logits/chosen": 0.05288419499993324, "eval_logits/rejected": 0.6746042966842651, "eval_logps/chosen": -256.15350341796875, "eval_logps/rejected": -500.3665771484375, "eval_loss": 0.07442650943994522, "eval_rewards/accuracies": 0.9722222089767456, "eval_rewards/chosen": 1.8328105211257935, "eval_rewards/margins": 6.859799385070801, "eval_rewards/rejected": -5.026988983154297, "eval_runtime": 518.6828, "eval_samples_per_second": 18.316, "eval_steps_per_second": 0.573, "step": 900 }, { "epoch": 0.31, "learning_rate": 4.983003902807503e-07, "logits/chosen": -0.007518824189901352, "logits/rejected": 0.5138329863548279, "logps/chosen": -303.4407958984375, "logps/rejected": -336.9670715332031, "loss": 0.0676, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.9263694286346436, "rewards/margins": 6.365667343139648, "rewards/rejected": -4.439297676086426, "step": 910 }, { "epoch": 0.31, "learning_rate": 4.976709051995467e-07, "logits/chosen": 0.03634124994277954, "logits/rejected": 0.5339456796646118, "logps/chosen": -202.24874877929688, "logps/rejected": -427.1283264160156, "loss": 0.0706, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1109328269958496, "rewards/margins": 6.955789089202881, "rewards/rejected": -4.844857215881348, "step": 920 }, { "epoch": 0.32, "learning_rate": 4.970414201183432e-07, "logits/chosen": 0.03343547135591507, "logits/rejected": 0.596768319606781, "logps/chosen": -246.8701629638672, "logps/rejected": -527.07373046875, "loss": 0.0894, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.0986328125, "rewards/margins": 7.681967735290527, "rewards/rejected": -5.583334922790527, "step": 930 }, { "epoch": 0.32, "learning_rate": 4.964119350371396e-07, "logits/chosen": -0.005236419849097729, "logits/rejected": 0.4905567169189453, "logps/chosen": -199.27334594726562, "logps/rejected": -492.15252685546875, "loss": 0.06, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.0151419639587402, "rewards/margins": 7.270242214202881, "rewards/rejected": -5.255100250244141, "step": 940 }, { "epoch": 0.32, "learning_rate": 4.95782449955936e-07, "logits/chosen": -0.08705534040927887, "logits/rejected": 0.40858906507492065, "logps/chosen": -254.8382110595703, "logps/rejected": -414.87261962890625, "loss": 0.0652, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.139592409133911, "rewards/margins": 7.791099548339844, "rewards/rejected": -5.651506423950195, "step": 950 }, { "epoch": 0.33, "learning_rate": 4.951529648747325e-07, "logits/chosen": -0.08291283249855042, "logits/rejected": 0.6057056188583374, "logps/chosen": -311.377685546875, "logps/rejected": -480.71295166015625, "loss": 0.0758, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.979466438293457, "rewards/margins": 7.607062339782715, "rewards/rejected": -5.627596855163574, "step": 960 }, { "epoch": 0.33, "learning_rate": 4.945234797935289e-07, "logits/chosen": 0.033677466213703156, "logits/rejected": 0.49075764417648315, "logps/chosen": -190.84664916992188, "logps/rejected": -651.8924560546875, "loss": 0.0709, "rewards/accuracies": 1.0, "rewards/chosen": 1.9710493087768555, "rewards/margins": 7.248508453369141, "rewards/rejected": -5.277459144592285, "step": 970 }, { "epoch": 0.33, "learning_rate": 4.938939947123252e-07, "logits/chosen": -0.00282309646718204, "logits/rejected": 0.5748778581619263, "logps/chosen": -327.4038391113281, "logps/rejected": -638.7828369140625, "loss": 0.0842, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.8632694482803345, "rewards/margins": 8.335084915161133, "rewards/rejected": -6.471815586090088, "step": 980 }, { "epoch": 0.34, "learning_rate": 4.932645096311217e-07, "logits/chosen": -0.059016309678554535, "logits/rejected": 0.5663169026374817, "logps/chosen": -183.36831665039062, "logps/rejected": -577.4052734375, "loss": 0.0652, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.1564135551452637, "rewards/margins": 7.945901393890381, "rewards/rejected": -5.789486885070801, "step": 990 }, { "epoch": 0.34, "learning_rate": 4.926350245499181e-07, "logits/chosen": -0.021897226572036743, "logits/rejected": 0.5751781463623047, "logps/chosen": -279.1343078613281, "logps/rejected": -440.008544921875, "loss": 0.0561, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.0208611488342285, "rewards/margins": 7.442690849304199, "rewards/rejected": -5.421829700469971, "step": 1000 }, { "epoch": 0.34, "eval_logits/chosen": 0.018468094989657402, "eval_logits/rejected": 0.6517335176467896, "eval_logps/chosen": -256.7454528808594, "eval_logps/rejected": -504.651611328125, "eval_loss": 0.06387168914079666, "eval_rewards/accuracies": 0.9747474789619446, "eval_rewards/chosen": 1.773616909980774, "eval_rewards/margins": 7.229104042053223, "eval_rewards/rejected": -5.45548677444458, "eval_runtime": 519.7389, "eval_samples_per_second": 18.278, "eval_steps_per_second": 0.571, "step": 1000 }, { "epoch": 0.34, "learning_rate": 4.920055394687146e-07, "logits/chosen": -0.04988214373588562, "logits/rejected": 0.5359185338020325, "logps/chosen": -286.5663757324219, "logps/rejected": -417.20635986328125, "loss": 0.0771, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.8345654010772705, "rewards/margins": 6.427069664001465, "rewards/rejected": -4.592504501342773, "step": 1010 }, { "epoch": 0.35, "learning_rate": 4.91376054387511e-07, "logits/chosen": -0.021457133814692497, "logits/rejected": 0.47527965903282166, "logps/chosen": -181.7171173095703, "logps/rejected": -455.453857421875, "loss": 0.06, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.9185177087783813, "rewards/margins": 6.828257083892822, "rewards/rejected": -4.9097394943237305, "step": 1020 }, { "epoch": 0.35, "learning_rate": 4.907465693063074e-07, "logits/chosen": -0.031804513186216354, "logits/rejected": 0.4869725704193115, "logps/chosen": -189.09429931640625, "logps/rejected": -426.1416015625, "loss": 0.055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.01943302154541, "rewards/margins": 7.674594879150391, "rewards/rejected": -5.655160903930664, "step": 1030 }, { "epoch": 0.35, "learning_rate": 4.901170842251039e-07, "logits/chosen": 0.022681767120957375, "logits/rejected": 0.5010225772857666, "logps/chosen": -369.4875793457031, "logps/rejected": -640.638671875, "loss": 0.0615, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3496134281158447, "rewards/margins": 7.590904235839844, "rewards/rejected": -6.241290092468262, "step": 1040 }, { "epoch": 0.36, "learning_rate": 4.894875991439003e-07, "logits/chosen": -0.06667724996805191, "logits/rejected": 0.6818445324897766, "logps/chosen": -348.4339599609375, "logps/rejected": -438.24493408203125, "loss": 0.0546, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.5926023721694946, "rewards/margins": 7.723822116851807, "rewards/rejected": -6.131219387054443, "step": 1050 }, { "epoch": 0.36, "learning_rate": 4.888581140626966e-07, "logits/chosen": -0.026232430711388588, "logits/rejected": 0.6214309930801392, "logps/chosen": -267.76837158203125, "logps/rejected": -492.09222412109375, "loss": 0.0689, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.645049810409546, "rewards/margins": 7.67995548248291, "rewards/rejected": -6.034905433654785, "step": 1060 }, { "epoch": 0.36, "learning_rate": 4.882286289814931e-07, "logits/chosen": -0.08297820389270782, "logits/rejected": 0.5151987671852112, "logps/chosen": -320.90972900390625, "logps/rejected": -395.3104553222656, "loss": 0.0521, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7994956970214844, "rewards/margins": 7.020452976226807, "rewards/rejected": -5.220957279205322, "step": 1070 }, { "epoch": 0.37, "learning_rate": 4.875991439002896e-07, "logits/chosen": -0.0382964126765728, "logits/rejected": 0.6021534204483032, "logps/chosen": -267.7812194824219, "logps/rejected": -387.43341064453125, "loss": 0.0578, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8110764026641846, "rewards/margins": 7.906289100646973, "rewards/rejected": -6.095212936401367, "step": 1080 }, { "epoch": 0.37, "learning_rate": 4.869696588190859e-07, "logits/chosen": -0.2253701388835907, "logits/rejected": 0.629709780216217, "logps/chosen": -260.7735595703125, "logps/rejected": -291.59173583984375, "loss": 0.0618, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.780935525894165, "rewards/margins": 7.167150974273682, "rewards/rejected": -5.386216163635254, "step": 1090 }, { "epoch": 0.37, "learning_rate": 4.863401737378824e-07, "logits/chosen": -0.0485679991543293, "logits/rejected": 0.6353902816772461, "logps/chosen": -266.6210632324219, "logps/rejected": -513.0389404296875, "loss": 0.0657, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.0133330821990967, "rewards/margins": 8.027952194213867, "rewards/rejected": -6.01461935043335, "step": 1100 }, { "epoch": 0.37, "eval_logits/chosen": 0.01677827164530754, "eval_logits/rejected": 0.6737815141677856, "eval_logps/chosen": -256.5156555175781, "eval_logps/rejected": -506.5500183105469, "eval_loss": 0.0546191930770874, "eval_rewards/accuracies": 0.9840067625045776, "eval_rewards/chosen": 1.7965947389602661, "eval_rewards/margins": 7.441928386688232, "eval_rewards/rejected": -5.645333766937256, "eval_runtime": 521.7327, "eval_samples_per_second": 18.209, "eval_steps_per_second": 0.569, "step": 1100 }, { "epoch": 0.38, "learning_rate": 4.857106886566788e-07, "logits/chosen": 0.09238362312316895, "logits/rejected": 0.5716501474380493, "logps/chosen": -177.93408203125, "logps/rejected": -589.8180541992188, "loss": 0.0607, "rewards/accuracies": 1.0, "rewards/chosen": 1.6404987573623657, "rewards/margins": 7.8634514808654785, "rewards/rejected": -6.222952842712402, "step": 1110 }, { "epoch": 0.38, "learning_rate": 4.850812035754753e-07, "logits/chosen": -0.024067429825663567, "logits/rejected": 0.6025240421295166, "logps/chosen": -205.9295196533203, "logps/rejected": -492.132568359375, "loss": 0.0565, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8846168518066406, "rewards/margins": 7.4638848304748535, "rewards/rejected": -5.579268932342529, "step": 1120 }, { "epoch": 0.38, "learning_rate": 4.844517184942716e-07, "logits/chosen": -0.1049378365278244, "logits/rejected": 0.6088601350784302, "logps/chosen": -289.8595275878906, "logps/rejected": -337.29681396484375, "loss": 0.0526, "rewards/accuracies": 1.0, "rewards/chosen": 1.5747127532958984, "rewards/margins": 7.475668430328369, "rewards/rejected": -5.900955677032471, "step": 1130 }, { "epoch": 0.39, "learning_rate": 4.838222334130681e-07, "logits/chosen": 0.033387623727321625, "logits/rejected": 0.6091621518135071, "logps/chosen": -261.9830017089844, "logps/rejected": -439.41961669921875, "loss": 0.0528, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 1.7111966609954834, "rewards/margins": 8.268998146057129, "rewards/rejected": -6.557801723480225, "step": 1140 }, { "epoch": 0.39, "learning_rate": 4.831927483318645e-07, "logits/chosen": -0.07990173995494843, "logits/rejected": 0.6025005578994751, "logps/chosen": -296.7312316894531, "logps/rejected": -335.607421875, "loss": 0.052, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 2.1426470279693604, "rewards/margins": 8.553141593933105, "rewards/rejected": -6.410494804382324, "step": 1150 }, { "epoch": 0.39, "learning_rate": 4.82563263250661e-07, "logits/chosen": -0.1709839403629303, "logits/rejected": 0.659946620464325, "logps/chosen": -342.8404235839844, "logps/rejected": -438.43017578125, "loss": 0.0744, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4545644521713257, "rewards/margins": 7.707846164703369, "rewards/rejected": -6.2532806396484375, "step": 1160 }, { "epoch": 0.4, "learning_rate": 4.819337781694573e-07, "logits/chosen": -0.06764785945415497, "logits/rejected": 0.6222678422927856, "logps/chosen": -195.7085418701172, "logps/rejected": -484.5032653808594, "loss": 0.0553, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.4105724096298218, "rewards/margins": 7.690183162689209, "rewards/rejected": -6.279610633850098, "step": 1170 }, { "epoch": 0.4, "learning_rate": 4.813042930882538e-07, "logits/chosen": 0.02858917787671089, "logits/rejected": 0.5144532322883606, "logps/chosen": -249.46682739257812, "logps/rejected": -653.155517578125, "loss": 0.0412, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.6215696334838867, "rewards/margins": 7.4202775955200195, "rewards/rejected": -5.798707485198975, "step": 1180 }, { "epoch": 0.4, "learning_rate": 4.806748080070503e-07, "logits/chosen": -0.043406177312135696, "logits/rejected": 0.714196503162384, "logps/chosen": -277.3466491699219, "logps/rejected": -380.5605773925781, "loss": 0.0677, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.5416382551193237, "rewards/margins": 7.223520755767822, "rewards/rejected": -5.681882381439209, "step": 1190 }, { "epoch": 0.41, "learning_rate": 4.800453229258466e-07, "logits/chosen": -0.10575856268405914, "logits/rejected": 0.6795112490653992, "logps/chosen": -348.7967834472656, "logps/rejected": -458.8675842285156, "loss": 0.0445, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5695960521697998, "rewards/margins": 7.275958061218262, "rewards/rejected": -5.706362724304199, "step": 1200 }, { "epoch": 0.41, "eval_logits/chosen": 0.017646372318267822, "eval_logits/rejected": 0.6757264733314514, "eval_logps/chosen": -257.23687744140625, "eval_logps/rejected": -510.7413635253906, "eval_loss": 0.04986022040247917, "eval_rewards/accuracies": 0.9823232293128967, "eval_rewards/chosen": 1.7244741916656494, "eval_rewards/margins": 7.788939952850342, "eval_rewards/rejected": -6.06446647644043, "eval_runtime": 514.6701, "eval_samples_per_second": 18.458, "eval_steps_per_second": 0.577, "step": 1200 }, { "epoch": 0.41, "learning_rate": 4.79415837844643e-07, "logits/chosen": -0.11129943281412125, "logits/rejected": 0.763250470161438, "logps/chosen": -272.175537109375, "logps/rejected": -307.0052795410156, "loss": 0.0594, "rewards/accuracies": 1.0, "rewards/chosen": 1.6549009084701538, "rewards/margins": 8.168081283569336, "rewards/rejected": -6.513180732727051, "step": 1210 }, { "epoch": 0.41, "learning_rate": 4.787863527634395e-07, "logits/chosen": 0.11017324775457382, "logits/rejected": 0.5948671102523804, "logps/chosen": -208.8245086669922, "logps/rejected": -578.9046630859375, "loss": 0.0554, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.086965799331665, "rewards/margins": 7.619833469390869, "rewards/rejected": -6.532868385314941, "step": 1220 }, { "epoch": 0.42, "learning_rate": 4.781568676822359e-07, "logits/chosen": 0.07215871661901474, "logits/rejected": 0.4904249608516693, "logps/chosen": -189.17166137695312, "logps/rejected": -740.416015625, "loss": 0.0394, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9633619785308838, "rewards/margins": 8.144128799438477, "rewards/rejected": -6.1807661056518555, "step": 1230 }, { "epoch": 0.42, "learning_rate": 4.775273826010323e-07, "logits/chosen": -0.02005598321557045, "logits/rejected": 0.5616487264633179, "logps/chosen": -275.2275390625, "logps/rejected": -521.8006591796875, "loss": 0.0564, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.714172124862671, "rewards/margins": 7.829100608825684, "rewards/rejected": -6.11492919921875, "step": 1240 }, { "epoch": 0.42, "learning_rate": 4.768978975198288e-07, "logits/chosen": 0.00589663814753294, "logits/rejected": 0.40184587240219116, "logps/chosen": -260.290771484375, "logps/rejected": -705.8692626953125, "loss": 0.0357, "rewards/accuracies": 1.0, "rewards/chosen": 1.664227843284607, "rewards/margins": 7.495697021484375, "rewards/rejected": -5.831470012664795, "step": 1250 }, { "epoch": 0.43, "learning_rate": 4.762684124386252e-07, "logits/chosen": -0.03081933781504631, "logits/rejected": 0.5352996587753296, "logps/chosen": -283.6715393066406, "logps/rejected": -607.3733520507812, "loss": 0.0518, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.5777918100357056, "rewards/margins": 8.674604415893555, "rewards/rejected": -7.096814155578613, "step": 1260 }, { "epoch": 0.43, "learning_rate": 4.756389273574216e-07, "logits/chosen": -0.11301298439502716, "logits/rejected": 0.6701822876930237, "logps/chosen": -315.54205322265625, "logps/rejected": -569.5267944335938, "loss": 0.0453, "rewards/accuracies": 1.0, "rewards/chosen": 1.6960986852645874, "rewards/margins": 8.628989219665527, "rewards/rejected": -6.932890892028809, "step": 1270 }, { "epoch": 0.44, "learning_rate": 4.7500944227621803e-07, "logits/chosen": -0.08425600826740265, "logits/rejected": 0.575507640838623, "logps/chosen": -219.59005737304688, "logps/rejected": -383.9698486328125, "loss": 0.0398, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9790904521942139, "rewards/margins": 8.621831893920898, "rewards/rejected": -6.6427412033081055, "step": 1280 }, { "epoch": 0.44, "learning_rate": 4.7437995719501445e-07, "logits/chosen": 0.03841705247759819, "logits/rejected": 0.7392519116401672, "logps/chosen": -202.97564697265625, "logps/rejected": -566.29345703125, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": 1.5167877674102783, "rewards/margins": 9.495674133300781, "rewards/rejected": -7.978886604309082, "step": 1290 }, { "epoch": 0.44, "learning_rate": 4.737504721138109e-07, "logits/chosen": 0.050668250769376755, "logits/rejected": 0.40587228536605835, "logps/chosen": -270.19049072265625, "logps/rejected": -565.6331787109375, "loss": 0.0498, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.8828514218330383, "rewards/margins": 7.327982425689697, "rewards/rejected": -6.445130825042725, "step": 1300 }, { "epoch": 0.44, "eval_logits/chosen": 0.022831527516245842, "eval_logits/rejected": 0.6718878746032715, "eval_logps/chosen": -257.6771545410156, "eval_logps/rejected": -516.8956298828125, "eval_loss": 0.04593642055988312, "eval_rewards/accuracies": 0.9848484992980957, "eval_rewards/chosen": 1.6804441213607788, "eval_rewards/margins": 8.360334396362305, "eval_rewards/rejected": -6.679889678955078, "eval_runtime": 518.1321, "eval_samples_per_second": 18.335, "eval_steps_per_second": 0.573, "step": 1300 }, { "epoch": 0.45, "learning_rate": 4.7312098703260735e-07, "logits/chosen": -0.05303068086504936, "logits/rejected": 0.5573102831840515, "logps/chosen": -198.84896850585938, "logps/rejected": -431.00927734375, "loss": 0.0489, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.371403455734253, "rewards/margins": 7.985485076904297, "rewards/rejected": -6.614081382751465, "step": 1310 }, { "epoch": 0.45, "learning_rate": 4.724915019514038e-07, "logits/chosen": -0.03413159400224686, "logits/rejected": 0.42138418555259705, "logps/chosen": -256.59295654296875, "logps/rejected": -580.3004150390625, "loss": 0.0494, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6062533855438232, "rewards/margins": 8.08342170715332, "rewards/rejected": -6.47716760635376, "step": 1320 }, { "epoch": 0.45, "learning_rate": 4.7186201687020014e-07, "logits/chosen": -0.12694311141967773, "logits/rejected": 0.7129651308059692, "logps/chosen": -252.7335662841797, "logps/rejected": -378.3775939941406, "loss": 0.0391, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3313204050064087, "rewards/margins": 8.52730655670166, "rewards/rejected": -7.195985317230225, "step": 1330 }, { "epoch": 0.46, "learning_rate": 4.7123253178899657e-07, "logits/chosen": -0.008433983661234379, "logits/rejected": 0.5761882066726685, "logps/chosen": -186.95590209960938, "logps/rejected": -464.9042053222656, "loss": 0.0297, "rewards/accuracies": 1.0, "rewards/chosen": 2.166475534439087, "rewards/margins": 9.565309524536133, "rewards/rejected": -7.39883279800415, "step": 1340 }, { "epoch": 0.46, "learning_rate": 4.70603046707793e-07, "logits/chosen": -0.01723262295126915, "logits/rejected": 0.543562650680542, "logps/chosen": -199.55226135253906, "logps/rejected": -616.6301879882812, "loss": 0.0296, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8424361944198608, "rewards/margins": 9.643768310546875, "rewards/rejected": -7.80133056640625, "step": 1350 }, { "epoch": 0.46, "learning_rate": 4.699735616265894e-07, "logits/chosen": 0.004501981660723686, "logits/rejected": 0.507594883441925, "logps/chosen": -266.8397216796875, "logps/rejected": -446.079345703125, "loss": 0.0617, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 2.264777660369873, "rewards/margins": 8.615518569946289, "rewards/rejected": -6.350740432739258, "step": 1360 }, { "epoch": 0.47, "learning_rate": 4.693440765453859e-07, "logits/chosen": 0.07373420149087906, "logits/rejected": 0.5399583578109741, "logps/chosen": -218.79269409179688, "logps/rejected": -689.6273803710938, "loss": 0.0549, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9704105257987976, "rewards/margins": 6.958341121673584, "rewards/rejected": -5.987931251525879, "step": 1370 }, { "epoch": 0.47, "learning_rate": 4.687145914641823e-07, "logits/chosen": -0.12151205539703369, "logits/rejected": 0.603783905506134, "logps/chosen": -220.7379608154297, "logps/rejected": -542.1465454101562, "loss": 0.0366, "rewards/accuracies": 1.0, "rewards/chosen": 1.5063287019729614, "rewards/margins": 7.9898552894592285, "rewards/rejected": -6.483526706695557, "step": 1380 }, { "epoch": 0.47, "learning_rate": 4.6808510638297873e-07, "logits/chosen": -0.02185133472084999, "logits/rejected": 0.5403569936752319, "logps/chosen": -221.1869354248047, "logps/rejected": -561.3621826171875, "loss": 0.0282, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.8545684814453125, "rewards/margins": 7.839407444000244, "rewards/rejected": -5.98483943939209, "step": 1390 }, { "epoch": 0.48, "learning_rate": 4.674556213017751e-07, "logits/chosen": -0.08111375570297241, "logits/rejected": 0.7392144203186035, "logps/chosen": -184.41592407226562, "logps/rejected": -483.13665771484375, "loss": 0.0554, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.9428634643554688, "rewards/margins": 8.943255424499512, "rewards/rejected": -7.000391483306885, "step": 1400 }, { "epoch": 0.48, "eval_logits/chosen": 0.015122265554964542, "eval_logits/rejected": 0.6684333086013794, "eval_logps/chosen": -258.4871520996094, "eval_logps/rejected": -518.8279418945312, "eval_loss": 0.054929982870817184, "eval_rewards/accuracies": 0.9848484992980957, "eval_rewards/chosen": 1.5994449853897095, "eval_rewards/margins": 8.472569465637207, "eval_rewards/rejected": -6.873123645782471, "eval_runtime": 514.6871, "eval_samples_per_second": 18.458, "eval_steps_per_second": 0.577, "step": 1400 }, { "epoch": 0.48, "learning_rate": 4.668261362205715e-07, "logits/chosen": -0.011925707571208477, "logits/rejected": 0.7000913619995117, "logps/chosen": -289.9590759277344, "logps/rejected": -530.9601440429688, "loss": 0.0358, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.7594867944717407, "rewards/margins": 9.126184463500977, "rewards/rejected": -7.366697788238525, "step": 1410 }, { "epoch": 0.48, "learning_rate": 4.6619665113936795e-07, "logits/chosen": -0.002680667443200946, "logits/rejected": 0.5044655203819275, "logps/chosen": -270.6023254394531, "logps/rejected": -561.531494140625, "loss": 0.0357, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3575317859649658, "rewards/margins": 8.141082763671875, "rewards/rejected": -6.7835516929626465, "step": 1420 }, { "epoch": 0.49, "learning_rate": 4.6556716605816437e-07, "logits/chosen": -0.02368699200451374, "logits/rejected": 0.4800760746002197, "logps/chosen": -258.1476745605469, "logps/rejected": -514.6412353515625, "loss": 0.04, "rewards/accuracies": 1.0, "rewards/chosen": 1.6494758129119873, "rewards/margins": 8.596821784973145, "rewards/rejected": -6.947345733642578, "step": 1430 }, { "epoch": 0.49, "learning_rate": 4.6493768097696085e-07, "logits/chosen": 0.003919348120689392, "logits/rejected": 0.6637479066848755, "logps/chosen": -351.2317199707031, "logps/rejected": -422.57501220703125, "loss": 0.0405, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.227037787437439, "rewards/margins": 7.992938995361328, "rewards/rejected": -6.765902042388916, "step": 1440 }, { "epoch": 0.49, "learning_rate": 4.6430819589575727e-07, "logits/chosen": -0.015313789248466492, "logits/rejected": 0.6413368582725525, "logps/chosen": -207.24813842773438, "logps/rejected": -365.9192199707031, "loss": 0.0431, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.1080700159072876, "rewards/margins": 8.427752494812012, "rewards/rejected": -7.3196821212768555, "step": 1450 }, { "epoch": 0.5, "learning_rate": 4.636787108145537e-07, "logits/chosen": -0.002808505203574896, "logits/rejected": 0.5014025568962097, "logps/chosen": -329.23492431640625, "logps/rejected": -556.1297607421875, "loss": 0.0306, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3372336626052856, "rewards/margins": 8.180625915527344, "rewards/rejected": -6.843391418457031, "step": 1460 }, { "epoch": 0.5, "learning_rate": 4.630492257333501e-07, "logits/chosen": -0.09513185918331146, "logits/rejected": 0.6777242422103882, "logps/chosen": -192.978515625, "logps/rejected": -392.2290954589844, "loss": 0.0918, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2192718982696533, "rewards/margins": 8.314642906188965, "rewards/rejected": -7.095371246337891, "step": 1470 }, { "epoch": 0.5, "learning_rate": 4.624197406521465e-07, "logits/chosen": -0.047873690724372864, "logits/rejected": 0.6871291399002075, "logps/chosen": -191.26036071777344, "logps/rejected": -490.68231201171875, "loss": 0.0412, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.9298769235610962, "rewards/margins": 8.993932723999023, "rewards/rejected": -8.064054489135742, "step": 1480 }, { "epoch": 0.51, "learning_rate": 4.617902555709429e-07, "logits/chosen": -0.07442308962345123, "logits/rejected": 0.538407027721405, "logps/chosen": -245.76644897460938, "logps/rejected": -311.0569152832031, "loss": 0.0479, "rewards/accuracies": 1.0, "rewards/chosen": 1.179579734802246, "rewards/margins": 8.840651512145996, "rewards/rejected": -7.66107177734375, "step": 1490 }, { "epoch": 0.51, "learning_rate": 4.611607704897394e-07, "logits/chosen": -0.1269553303718567, "logits/rejected": 0.5249725580215454, "logps/chosen": -335.9045715332031, "logps/rejected": -509.41357421875, "loss": 0.0368, "rewards/accuracies": 1.0, "rewards/chosen": 1.5279067754745483, "rewards/margins": 8.755719184875488, "rewards/rejected": -7.227812767028809, "step": 1500 }, { "epoch": 0.51, "eval_logits/chosen": -0.022291265428066254, "eval_logits/rejected": 0.6464473009109497, "eval_logps/chosen": -260.9311218261719, "eval_logps/rejected": -528.1129760742188, "eval_loss": 0.03651096299290657, "eval_rewards/accuracies": 0.9865319728851318, "eval_rewards/chosen": 1.3550506830215454, "eval_rewards/margins": 9.156670570373535, "eval_rewards/rejected": -7.801619529724121, "eval_runtime": 517.7672, "eval_samples_per_second": 18.348, "eval_steps_per_second": 0.574, "step": 1500 }, { "epoch": 0.51, "learning_rate": 4.605312854085358e-07, "logits/chosen": -0.08384579420089722, "logits/rejected": 0.5313757658004761, "logps/chosen": -301.20111083984375, "logps/rejected": -446.17169189453125, "loss": 0.0455, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 1.124419093132019, "rewards/margins": 8.705448150634766, "rewards/rejected": -7.581029415130615, "step": 1510 }, { "epoch": 0.52, "learning_rate": 4.5990180032733223e-07, "logits/chosen": -0.1201632022857666, "logits/rejected": 0.5541719198226929, "logps/chosen": -266.7043762207031, "logps/rejected": -586.0890502929688, "loss": 0.0388, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.5352497100830078, "rewards/margins": 8.533283233642578, "rewards/rejected": -6.9980340003967285, "step": 1520 }, { "epoch": 0.52, "learning_rate": 4.5927231524612865e-07, "logits/chosen": -0.12296725809574127, "logits/rejected": 0.7490689754486084, "logps/chosen": -240.1829071044922, "logps/rejected": -377.25164794921875, "loss": 0.0307, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4602634906768799, "rewards/margins": 9.460953712463379, "rewards/rejected": -8.000689506530762, "step": 1530 }, { "epoch": 0.52, "learning_rate": 4.586428301649251e-07, "logits/chosen": 0.0010828435188159347, "logits/rejected": 0.33955293893814087, "logps/chosen": -205.18276977539062, "logps/rejected": -607.7073974609375, "loss": 0.0375, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.6731287240982056, "rewards/margins": 7.909235954284668, "rewards/rejected": -6.23610782623291, "step": 1540 }, { "epoch": 0.53, "learning_rate": 4.5801334508372145e-07, "logits/chosen": -0.03397984802722931, "logits/rejected": 0.6582716107368469, "logps/chosen": -278.14239501953125, "logps/rejected": -428.67840576171875, "loss": 0.0344, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2025349140167236, "rewards/margins": 8.241921424865723, "rewards/rejected": -7.039385795593262, "step": 1550 }, { "epoch": 0.53, "learning_rate": 4.573838600025179e-07, "logits/chosen": -0.0032762468326836824, "logits/rejected": 0.5959766507148743, "logps/chosen": -361.1002502441406, "logps/rejected": -589.371826171875, "loss": 0.0343, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0029118061065674, "rewards/margins": 8.154011726379395, "rewards/rejected": -7.151100158691406, "step": 1560 }, { "epoch": 0.53, "learning_rate": 4.5675437492131434e-07, "logits/chosen": -0.07828334718942642, "logits/rejected": 0.5826660394668579, "logps/chosen": -238.6009979248047, "logps/rejected": -543.2061767578125, "loss": 0.035, "rewards/accuracies": 1.0, "rewards/chosen": 1.6205368041992188, "rewards/margins": 8.451268196105957, "rewards/rejected": -6.8307318687438965, "step": 1570 }, { "epoch": 0.54, "learning_rate": 4.5612488984011077e-07, "logits/chosen": -0.13771258294582367, "logits/rejected": 0.6276491284370422, "logps/chosen": -262.6397705078125, "logps/rejected": -475.828369140625, "loss": 0.0456, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3252559900283813, "rewards/margins": 8.569852828979492, "rewards/rejected": -7.2445969581604, "step": 1580 }, { "epoch": 0.54, "learning_rate": 4.554954047589072e-07, "logits/chosen": -0.09512496739625931, "logits/rejected": 0.7345348596572876, "logps/chosen": -267.2898254394531, "logps/rejected": -466.9762268066406, "loss": 0.0367, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.301556944847107, "rewards/margins": 10.29693603515625, "rewards/rejected": -8.995379447937012, "step": 1590 }, { "epoch": 0.54, "learning_rate": 4.548659196777036e-07, "logits/chosen": -0.12127195298671722, "logits/rejected": 0.6262275576591492, "logps/chosen": -407.61053466796875, "logps/rejected": -431.717529296875, "loss": 0.0514, "rewards/accuracies": 1.0, "rewards/chosen": 1.6713097095489502, "rewards/margins": 8.706266403198242, "rewards/rejected": -7.034958839416504, "step": 1600 }, { "epoch": 0.54, "eval_logits/chosen": -0.01591184362769127, "eval_logits/rejected": 0.661719560623169, "eval_logps/chosen": -261.0510559082031, "eval_logps/rejected": -525.8816528320312, "eval_loss": 0.03326256945729256, "eval_rewards/accuracies": 0.9890572428703308, "eval_rewards/chosen": 1.3430562019348145, "eval_rewards/margins": 8.921546936035156, "eval_rewards/rejected": -7.578491687774658, "eval_runtime": 521.1573, "eval_samples_per_second": 18.229, "eval_steps_per_second": 0.57, "step": 1600 }, { "epoch": 0.55, "learning_rate": 4.5423643459650003e-07, "logits/chosen": -0.03308238834142685, "logits/rejected": 0.5057162046432495, "logps/chosen": -278.1869812011719, "logps/rejected": -630.75146484375, "loss": 0.032, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.217278242111206, "rewards/margins": 8.419373512268066, "rewards/rejected": -7.202095031738281, "step": 1610 }, { "epoch": 0.55, "learning_rate": 4.536069495152965e-07, "logits/chosen": -0.043737608939409256, "logits/rejected": 0.5046564340591431, "logps/chosen": -262.81781005859375, "logps/rejected": -588.3793334960938, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 1.381829857826233, "rewards/margins": 9.302871704101562, "rewards/rejected": -7.921041965484619, "step": 1620 }, { "epoch": 0.55, "learning_rate": 4.529774644340929e-07, "logits/chosen": -0.1623125970363617, "logits/rejected": 0.6118518114089966, "logps/chosen": -279.07891845703125, "logps/rejected": -451.875, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 1.096649408340454, "rewards/margins": 8.925345420837402, "rewards/rejected": -7.828696250915527, "step": 1630 }, { "epoch": 0.56, "learning_rate": 4.523479793528893e-07, "logits/chosen": -0.1671612411737442, "logits/rejected": 0.5162423849105835, "logps/chosen": -259.68548583984375, "logps/rejected": -467.79107666015625, "loss": 0.0225, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1437369585037231, "rewards/margins": 9.201498031616211, "rewards/rejected": -8.057761192321777, "step": 1640 }, { "epoch": 0.56, "learning_rate": 4.517184942716857e-07, "logits/chosen": -0.11009261757135391, "logits/rejected": 0.593260645866394, "logps/chosen": -334.7881774902344, "logps/rejected": -406.8807067871094, "loss": 0.034, "rewards/accuracies": 1.0, "rewards/chosen": 0.8425920605659485, "rewards/margins": 8.46872615814209, "rewards/rejected": -7.626134395599365, "step": 1650 }, { "epoch": 0.56, "learning_rate": 4.5108900919048215e-07, "logits/chosen": -0.11647365987300873, "logits/rejected": 0.5646840929985046, "logps/chosen": -279.81732177734375, "logps/rejected": -471.8079528808594, "loss": 0.0325, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.9721755981445312, "rewards/margins": 9.051806449890137, "rewards/rejected": -8.079630851745605, "step": 1660 }, { "epoch": 0.57, "learning_rate": 4.5045952410927857e-07, "logits/chosen": -0.05266667157411575, "logits/rejected": 0.5460548996925354, "logps/chosen": -240.8248291015625, "logps/rejected": -506.22113037109375, "loss": 0.0285, "rewards/accuracies": 1.0, "rewards/chosen": 1.7817230224609375, "rewards/margins": 10.135636329650879, "rewards/rejected": -8.353912353515625, "step": 1670 }, { "epoch": 0.57, "learning_rate": 4.4983003902807505e-07, "logits/chosen": -0.07833122462034225, "logits/rejected": 0.5011191964149475, "logps/chosen": -352.9286193847656, "logps/rejected": -716.1649780273438, "loss": 0.0282, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.3859103918075562, "rewards/margins": 10.49589729309082, "rewards/rejected": -9.109986305236816, "step": 1680 }, { "epoch": 0.57, "learning_rate": 4.4920055394687147e-07, "logits/chosen": -0.11079414188861847, "logits/rejected": 0.607758641242981, "logps/chosen": -209.4288330078125, "logps/rejected": -430.60235595703125, "loss": 0.0361, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0271971225738525, "rewards/margins": 8.727540969848633, "rewards/rejected": -7.700344085693359, "step": 1690 }, { "epoch": 0.58, "learning_rate": 4.485710688656679e-07, "logits/chosen": -0.035323236137628555, "logits/rejected": 0.533891499042511, "logps/chosen": -223.5799102783203, "logps/rejected": -483.94659423828125, "loss": 0.0365, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.4450551271438599, "rewards/margins": 10.5841646194458, "rewards/rejected": -9.139111518859863, "step": 1700 }, { "epoch": 0.58, "eval_logits/chosen": -0.02343106083571911, "eval_logits/rejected": 0.635417640209198, "eval_logps/chosen": -263.7386474609375, "eval_logps/rejected": -538.2263793945312, "eval_loss": 0.03101719357073307, "eval_rewards/accuracies": 0.9932659864425659, "eval_rewards/chosen": 1.0742944478988647, "eval_rewards/margins": 9.887263298034668, "eval_rewards/rejected": -8.812970161437988, "eval_runtime": 526.9326, "eval_samples_per_second": 18.029, "eval_steps_per_second": 0.564, "step": 1700 }, { "epoch": 0.58, "learning_rate": 4.4794158378446426e-07, "logits/chosen": -0.051509879529476166, "logits/rejected": 0.46857118606567383, "logps/chosen": -331.1856994628906, "logps/rejected": -709.3829345703125, "loss": 0.0311, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.2430858612060547, "rewards/margins": 9.418615341186523, "rewards/rejected": -8.175528526306152, "step": 1710 }, { "epoch": 0.58, "learning_rate": 4.473120987032607e-07, "logits/chosen": 0.02388383448123932, "logits/rejected": 0.3967018127441406, "logps/chosen": -252.25729370117188, "logps/rejected": -886.15576171875, "loss": 0.0205, "rewards/accuracies": 1.0, "rewards/chosen": 1.1064679622650146, "rewards/margins": 9.701570510864258, "rewards/rejected": -8.595102310180664, "step": 1720 }, { "epoch": 0.59, "learning_rate": 4.466826136220571e-07, "logits/chosen": -0.10306843370199203, "logits/rejected": 0.7018638849258423, "logps/chosen": -214.75192260742188, "logps/rejected": -436.7806701660156, "loss": 0.0323, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.2659380435943604, "rewards/margins": 10.531064987182617, "rewards/rejected": -9.265128135681152, "step": 1730 }, { "epoch": 0.59, "learning_rate": 4.460531285408536e-07, "logits/chosen": -0.15345972776412964, "logits/rejected": 0.6240254044532776, "logps/chosen": -400.036865234375, "logps/rejected": -448.048095703125, "loss": 0.0528, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8120933771133423, "rewards/margins": 9.125635147094727, "rewards/rejected": -8.313541412353516, "step": 1740 }, { "epoch": 0.59, "learning_rate": 4.4542364345965e-07, "logits/chosen": -0.15340320765972137, "logits/rejected": 0.5751463174819946, "logps/chosen": -316.0746154785156, "logps/rejected": -429.846435546875, "loss": 0.1749, "rewards/accuracies": 1.0, "rewards/chosen": 1.2138341665267944, "rewards/margins": 11.596688270568848, "rewards/rejected": -10.382854461669922, "step": 1750 }, { "epoch": 0.6, "learning_rate": 4.4479415837844643e-07, "logits/chosen": -0.16663026809692383, "logits/rejected": 0.6081364154815674, "logps/chosen": -326.77410888671875, "logps/rejected": -431.6424865722656, "loss": 0.0263, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7780649065971375, "rewards/margins": 9.530843734741211, "rewards/rejected": -8.752779006958008, "step": 1760 }, { "epoch": 0.6, "learning_rate": 4.4416467329724285e-07, "logits/chosen": -0.04580571502447128, "logits/rejected": 0.40343135595321655, "logps/chosen": -197.31143188476562, "logps/rejected": -672.2965087890625, "loss": 0.0172, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.432607889175415, "rewards/margins": 10.84279727935791, "rewards/rejected": -9.410189628601074, "step": 1770 }, { "epoch": 0.61, "learning_rate": 4.435351882160392e-07, "logits/chosen": -0.12161228805780411, "logits/rejected": 0.5618101358413696, "logps/chosen": -282.4912109375, "logps/rejected": -463.66619873046875, "loss": 0.0309, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.4986404478549957, "rewards/margins": 9.744178771972656, "rewards/rejected": -9.245538711547852, "step": 1780 }, { "epoch": 0.61, "learning_rate": 4.4290570313483564e-07, "logits/chosen": -0.0592464916408062, "logits/rejected": 0.6122828722000122, "logps/chosen": -407.3721618652344, "logps/rejected": -479.8359375, "loss": 0.0281, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8558034896850586, "rewards/margins": 11.5916748046875, "rewards/rejected": -10.735872268676758, "step": 1790 }, { "epoch": 0.61, "learning_rate": 4.422762180536321e-07, "logits/chosen": 0.05196399614214897, "logits/rejected": 0.4721229076385498, "logps/chosen": -219.16555786132812, "logps/rejected": -597.8582763671875, "loss": 0.0214, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.3525173664093018, "rewards/margins": 9.861231803894043, "rewards/rejected": -8.50871467590332, "step": 1800 }, { "epoch": 0.61, "eval_logits/chosen": -0.019948190078139305, "eval_logits/rejected": 0.627830982208252, "eval_logps/chosen": -263.172119140625, "eval_logps/rejected": -541.729248046875, "eval_loss": 0.03000798262655735, "eval_rewards/accuracies": 0.9924242496490479, "eval_rewards/chosen": 1.130950689315796, "eval_rewards/margins": 10.29420280456543, "eval_rewards/rejected": -9.163251876831055, "eval_runtime": 527.7581, "eval_samples_per_second": 18.001, "eval_steps_per_second": 0.563, "step": 1800 }, { "epoch": 0.62, "learning_rate": 4.4164673297242854e-07, "logits/chosen": -0.07379848510026932, "logits/rejected": 0.6002097129821777, "logps/chosen": -330.8468933105469, "logps/rejected": -606.0767822265625, "loss": 0.0244, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.185977578163147, "rewards/margins": 11.918807983398438, "rewards/rejected": -10.732831954956055, "step": 1810 }, { "epoch": 0.62, "learning_rate": 4.4101724789122497e-07, "logits/chosen": -0.05416034907102585, "logits/rejected": 0.41344794631004333, "logps/chosen": -278.5623779296875, "logps/rejected": -772.2142333984375, "loss": 0.0267, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0470502376556396, "rewards/margins": 9.277294158935547, "rewards/rejected": -8.230244636535645, "step": 1820 }, { "epoch": 0.62, "learning_rate": 4.403877628100214e-07, "logits/chosen": -0.0519573912024498, "logits/rejected": 0.5700843334197998, "logps/chosen": -271.1733703613281, "logps/rejected": -501.96783447265625, "loss": 0.0198, "rewards/accuracies": 1.0, "rewards/chosen": 0.959564208984375, "rewards/margins": 9.517755508422852, "rewards/rejected": -8.558189392089844, "step": 1830 }, { "epoch": 0.63, "learning_rate": 4.397582777288178e-07, "logits/chosen": -0.01639259047806263, "logits/rejected": 0.6458941698074341, "logps/chosen": -322.2823791503906, "logps/rejected": -637.6697387695312, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": 1.3698785305023193, "rewards/margins": 10.687684059143066, "rewards/rejected": -9.317805290222168, "step": 1840 }, { "epoch": 0.63, "learning_rate": 4.3912879264761423e-07, "logits/chosen": -0.1477237492799759, "logits/rejected": 0.4823497235774994, "logps/chosen": -274.5985412597656, "logps/rejected": -512.8140869140625, "loss": 0.0462, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7998732924461365, "rewards/margins": 9.184606552124023, "rewards/rejected": -8.384733200073242, "step": 1850 }, { "epoch": 0.63, "learning_rate": 4.3849930756641066e-07, "logits/chosen": -0.005030305590480566, "logits/rejected": 0.389387845993042, "logps/chosen": -264.0657043457031, "logps/rejected": -617.393310546875, "loss": 0.0319, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.0277444124221802, "rewards/margins": 10.148255348205566, "rewards/rejected": -9.12051010131836, "step": 1860 }, { "epoch": 0.64, "learning_rate": 4.378698224852071e-07, "logits/chosen": -0.1255815178155899, "logits/rejected": 0.5196479558944702, "logps/chosen": -270.0270080566406, "logps/rejected": -481.51776123046875, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": 0.7426668405532837, "rewards/margins": 10.62120246887207, "rewards/rejected": -9.878534317016602, "step": 1870 }, { "epoch": 0.64, "learning_rate": 4.372403374040035e-07, "logits/chosen": -0.004232077859342098, "logits/rejected": 0.603295087814331, "logps/chosen": -208.4076690673828, "logps/rejected": -526.0924072265625, "loss": 0.0214, "rewards/accuracies": 1.0, "rewards/chosen": 0.554468035697937, "rewards/margins": 10.9672269821167, "rewards/rejected": -10.412758827209473, "step": 1880 }, { "epoch": 0.64, "learning_rate": 4.366108523227999e-07, "logits/chosen": -0.09426625072956085, "logits/rejected": 0.491260290145874, "logps/chosen": -294.0018005371094, "logps/rejected": -508.4173889160156, "loss": 0.0342, "rewards/accuracies": 1.0, "rewards/chosen": 0.7776670455932617, "rewards/margins": 10.024995803833008, "rewards/rejected": -9.247330665588379, "step": 1890 }, { "epoch": 0.65, "learning_rate": 4.3598136724159635e-07, "logits/chosen": -0.13394729793071747, "logits/rejected": 0.48483777046203613, "logps/chosen": -343.94171142578125, "logps/rejected": -538.5118408203125, "loss": 0.0321, "rewards/accuracies": 1.0, "rewards/chosen": 1.1678273677825928, "rewards/margins": 10.829117774963379, "rewards/rejected": -9.661290168762207, "step": 1900 }, { "epoch": 0.65, "eval_logits/chosen": -0.026832714676856995, "eval_logits/rejected": 0.6044366359710693, "eval_logps/chosen": -264.9674987792969, "eval_logps/rejected": -549.9811401367188, "eval_loss": 0.026685014367103577, "eval_rewards/accuracies": 0.9924242496490479, "eval_rewards/chosen": 0.9514113664627075, "eval_rewards/margins": 10.939852714538574, "eval_rewards/rejected": -9.988442420959473, "eval_runtime": 529.9954, "eval_samples_per_second": 17.925, "eval_steps_per_second": 0.56, "step": 1900 }, { "epoch": 0.65, "learning_rate": 4.3535188216039277e-07, "logits/chosen": -0.02342187985777855, "logits/rejected": 0.5200868248939514, "logps/chosen": -251.0133056640625, "logps/rejected": -575.5413818359375, "loss": 0.0376, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.0311676263809204, "rewards/margins": 11.647043228149414, "rewards/rejected": -10.615877151489258, "step": 1910 }, { "epoch": 0.65, "learning_rate": 4.3472239707918925e-07, "logits/chosen": -0.08540651947259903, "logits/rejected": 0.5662746429443359, "logps/chosen": -195.35794067382812, "logps/rejected": -423.41455078125, "loss": 0.0247, "rewards/accuracies": 1.0, "rewards/chosen": 1.068160057067871, "rewards/margins": 12.050028800964355, "rewards/rejected": -10.981868743896484, "step": 1920 }, { "epoch": 0.66, "learning_rate": 4.3409291199798567e-07, "logits/chosen": -0.08700753003358841, "logits/rejected": 0.4414942264556885, "logps/chosen": -215.6087646484375, "logps/rejected": -652.0645751953125, "loss": 0.0282, "rewards/accuracies": 1.0, "rewards/chosen": 1.0443105697631836, "rewards/margins": 9.376866340637207, "rewards/rejected": -8.33255672454834, "step": 1930 }, { "epoch": 0.66, "learning_rate": 4.3346342691678204e-07, "logits/chosen": -0.0039008245803415775, "logits/rejected": 0.5577886700630188, "logps/chosen": -216.0365753173828, "logps/rejected": -721.1570434570312, "loss": 0.0272, "rewards/accuracies": 1.0, "rewards/chosen": 0.7645665407180786, "rewards/margins": 10.75326919555664, "rewards/rejected": -9.988702774047852, "step": 1940 }, { "epoch": 0.66, "learning_rate": 4.3283394183557846e-07, "logits/chosen": -0.13362696766853333, "logits/rejected": 0.5607558488845825, "logps/chosen": -396.74713134765625, "logps/rejected": -483.5771484375, "loss": 0.024, "rewards/accuracies": 1.0, "rewards/chosen": 1.4111171960830688, "rewards/margins": 10.623318672180176, "rewards/rejected": -9.212201118469238, "step": 1950 }, { "epoch": 0.67, "learning_rate": 4.322044567543749e-07, "logits/chosen": -0.15538975596427917, "logits/rejected": 0.5421515703201294, "logps/chosen": -272.4572448730469, "logps/rejected": -478.591796875, "loss": 0.0277, "rewards/accuracies": 1.0, "rewards/chosen": 1.3946647644042969, "rewards/margins": 11.960134506225586, "rewards/rejected": -10.565469741821289, "step": 1960 }, { "epoch": 0.67, "learning_rate": 4.315749716731713e-07, "logits/chosen": -0.14188618957996368, "logits/rejected": 0.4542626738548279, "logps/chosen": -293.08477783203125, "logps/rejected": -656.4564208984375, "loss": 0.0264, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6020262837409973, "rewards/margins": 8.616676330566406, "rewards/rejected": -8.014650344848633, "step": 1970 }, { "epoch": 0.67, "learning_rate": 4.309454865919678e-07, "logits/chosen": -0.10316536575555801, "logits/rejected": 0.4347095489501953, "logps/chosen": -282.9707946777344, "logps/rejected": -482.3162536621094, "loss": 0.0197, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8463128209114075, "rewards/margins": 9.63262939453125, "rewards/rejected": -8.78631591796875, "step": 1980 }, { "epoch": 0.68, "learning_rate": 4.303160015107642e-07, "logits/chosen": -0.2067209780216217, "logits/rejected": 0.536677360534668, "logps/chosen": -325.4396057128906, "logps/rejected": -371.20941162109375, "loss": 0.0263, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6606660485267639, "rewards/margins": 10.188016891479492, "rewards/rejected": -9.527350425720215, "step": 1990 }, { "epoch": 0.68, "learning_rate": 4.2968651642956063e-07, "logits/chosen": -0.12377657741308212, "logits/rejected": 0.49681028723716736, "logps/chosen": -271.9918518066406, "logps/rejected": -460.09820556640625, "loss": 0.0239, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6046313047409058, "rewards/margins": 10.295214653015137, "rewards/rejected": -9.690584182739258, "step": 2000 }, { "epoch": 0.68, "eval_logits/chosen": -0.06276055425405502, "eval_logits/rejected": 0.5955268740653992, "eval_logps/chosen": -264.85736083984375, "eval_logps/rejected": -550.7046508789062, "eval_loss": 0.024904990568757057, "eval_rewards/accuracies": 0.9932659864425659, "eval_rewards/chosen": 0.9624245166778564, "eval_rewards/margins": 11.023221969604492, "eval_rewards/rejected": -10.060798645019531, "eval_runtime": 528.9073, "eval_samples_per_second": 17.962, "eval_steps_per_second": 0.562, "step": 2000 }, { "epoch": 0.68, "learning_rate": 4.29057031348357e-07, "logits/chosen": -0.11695262044668198, "logits/rejected": 0.5953128933906555, "logps/chosen": -206.17529296875, "logps/rejected": -538.4225463867188, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": 0.8213122487068176, "rewards/margins": 11.205498695373535, "rewards/rejected": -10.384185791015625, "step": 2010 }, { "epoch": 0.69, "learning_rate": 4.284275462671534e-07, "logits/chosen": -0.11838328838348389, "logits/rejected": 0.5601052045822144, "logps/chosen": -211.7301025390625, "logps/rejected": -633.0421752929688, "loss": 0.0161, "rewards/accuracies": 1.0, "rewards/chosen": 1.0992175340652466, "rewards/margins": 12.040093421936035, "rewards/rejected": -10.940874099731445, "step": 2020 }, { "epoch": 0.69, "learning_rate": 4.2779806118594984e-07, "logits/chosen": -0.1722133457660675, "logits/rejected": 0.44787830114364624, "logps/chosen": -323.5675048828125, "logps/rejected": -427.1105041503906, "loss": 0.0218, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7270190715789795, "rewards/margins": 9.452925682067871, "rewards/rejected": -8.725908279418945, "step": 2030 }, { "epoch": 0.69, "learning_rate": 4.271685761047463e-07, "logits/chosen": -0.045999616384506226, "logits/rejected": 0.5689800977706909, "logps/chosen": -202.95169067382812, "logps/rejected": -507.082763671875, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": 1.022501826286316, "rewards/margins": 11.852930068969727, "rewards/rejected": -10.830429077148438, "step": 2040 }, { "epoch": 0.7, "learning_rate": 4.2653909102354274e-07, "logits/chosen": -0.05863869935274124, "logits/rejected": 0.6385976076126099, "logps/chosen": -287.4092712402344, "logps/rejected": -474.9805603027344, "loss": 0.0249, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2029474675655365, "rewards/margins": 10.286785125732422, "rewards/rejected": -10.083837509155273, "step": 2050 }, { "epoch": 0.7, "learning_rate": 4.2590960594233917e-07, "logits/chosen": -0.14040683209896088, "logits/rejected": 0.6000669002532959, "logps/chosen": -287.12750244140625, "logps/rejected": -500.1000061035156, "loss": 0.0207, "rewards/accuracies": 1.0, "rewards/chosen": 0.8683013916015625, "rewards/margins": 11.111690521240234, "rewards/rejected": -10.243389129638672, "step": 2060 }, { "epoch": 0.7, "learning_rate": 4.252801208611356e-07, "logits/chosen": -0.16264764964580536, "logits/rejected": 0.5074498057365417, "logps/chosen": -265.7417907714844, "logps/rejected": -485.18634033203125, "loss": 0.0249, "rewards/accuracies": 1.0, "rewards/chosen": 1.4339065551757812, "rewards/margins": 10.857864379882812, "rewards/rejected": -9.423957824707031, "step": 2070 }, { "epoch": 0.71, "learning_rate": 4.24650635779932e-07, "logits/chosen": -0.16662411391735077, "logits/rejected": 0.6092788577079773, "logps/chosen": -195.45689392089844, "logps/rejected": -440.0008239746094, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": 0.700150191783905, "rewards/margins": 14.03722095489502, "rewards/rejected": -13.337069511413574, "step": 2080 }, { "epoch": 0.71, "learning_rate": 4.240211506987284e-07, "logits/chosen": -0.11359413713216782, "logits/rejected": 0.4685317575931549, "logps/chosen": -308.27484130859375, "logps/rejected": -560.80810546875, "loss": 0.0233, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9943892359733582, "rewards/margins": 12.600744247436523, "rewards/rejected": -11.606355667114258, "step": 2090 }, { "epoch": 0.71, "learning_rate": 4.233916656175248e-07, "logits/chosen": -0.047763921320438385, "logits/rejected": 0.37557467818260193, "logps/chosen": -325.2818603515625, "logps/rejected": -771.1813354492188, "loss": 0.0484, "rewards/accuracies": 1.0, "rewards/chosen": 0.7997843027114868, "rewards/margins": 11.725061416625977, "rewards/rejected": -10.925277709960938, "step": 2100 }, { "epoch": 0.71, "eval_logits/chosen": -0.03383903577923775, "eval_logits/rejected": 0.6042855381965637, "eval_logps/chosen": -265.3055419921875, "eval_logps/rejected": -560.390869140625, "eval_loss": 0.022230619564652443, "eval_rewards/accuracies": 0.994107723236084, "eval_rewards/chosen": 0.9176083207130432, "eval_rewards/margins": 11.947029113769531, "eval_rewards/rejected": -11.0294189453125, "eval_runtime": 529.0193, "eval_samples_per_second": 17.958, "eval_steps_per_second": 0.561, "step": 2100 }, { "epoch": 0.72, "learning_rate": 4.227621805363213e-07, "logits/chosen": -0.063340924680233, "logits/rejected": 0.49716949462890625, "logps/chosen": -242.68826293945312, "logps/rejected": -522.4921264648438, "loss": 0.0363, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5536531209945679, "rewards/margins": 12.109762191772461, "rewards/rejected": -11.556108474731445, "step": 2110 }, { "epoch": 0.72, "learning_rate": 4.221326954551177e-07, "logits/chosen": -0.07148043811321259, "logits/rejected": 0.3612940013408661, "logps/chosen": -219.7454376220703, "logps/rejected": -670.3692626953125, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 1.2621829509735107, "rewards/margins": 12.583165168762207, "rewards/rejected": -11.320981979370117, "step": 2120 }, { "epoch": 0.72, "learning_rate": 4.215032103739141e-07, "logits/chosen": -0.062252093106508255, "logits/rejected": 0.4583914279937744, "logps/chosen": -276.1160583496094, "logps/rejected": -633.4683227539062, "loss": 0.0181, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.791365921497345, "rewards/margins": 12.759963035583496, "rewards/rejected": -11.968597412109375, "step": 2130 }, { "epoch": 0.73, "learning_rate": 4.2087372529271055e-07, "logits/chosen": -0.09728838503360748, "logits/rejected": 0.5470895767211914, "logps/chosen": -233.1765899658203, "logps/rejected": -440.2936096191406, "loss": 0.0276, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.84349524974823, "rewards/margins": 11.547700881958008, "rewards/rejected": -10.704206466674805, "step": 2140 }, { "epoch": 0.73, "learning_rate": 4.2024424021150697e-07, "logits/chosen": -0.13178148865699768, "logits/rejected": 0.46155649423599243, "logps/chosen": -213.68038940429688, "logps/rejected": -627.6820068359375, "loss": 0.0277, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7167500853538513, "rewards/margins": 12.361598014831543, "rewards/rejected": -11.644847869873047, "step": 2150 }, { "epoch": 0.73, "learning_rate": 4.1961475513030334e-07, "logits/chosen": -0.22656993567943573, "logits/rejected": 0.5808902382850647, "logps/chosen": -349.2641906738281, "logps/rejected": -335.78857421875, "loss": 0.0261, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4508140981197357, "rewards/margins": 12.311933517456055, "rewards/rejected": -11.861119270324707, "step": 2160 }, { "epoch": 0.74, "learning_rate": 4.189852700490998e-07, "logits/chosen": -0.1902860403060913, "logits/rejected": 0.4535021185874939, "logps/chosen": -275.20977783203125, "logps/rejected": -458.49658203125, "loss": 0.0186, "rewards/accuracies": 1.0, "rewards/chosen": 0.6169565320014954, "rewards/margins": 12.614381790161133, "rewards/rejected": -11.997424125671387, "step": 2170 }, { "epoch": 0.74, "learning_rate": 4.1835578496789624e-07, "logits/chosen": -0.14257676899433136, "logits/rejected": 0.4507673382759094, "logps/chosen": -286.69482421875, "logps/rejected": -543.22265625, "loss": 0.0279, "rewards/accuracies": 1.0, "rewards/chosen": 0.7822535634040833, "rewards/margins": 12.46099853515625, "rewards/rejected": -11.678746223449707, "step": 2180 }, { "epoch": 0.74, "learning_rate": 4.1772629988669266e-07, "logits/chosen": -0.1451343595981598, "logits/rejected": 0.4870292544364929, "logps/chosen": -279.1175842285156, "logps/rejected": -531.2913208007812, "loss": 0.0174, "rewards/accuracies": 1.0, "rewards/chosen": 0.9259557723999023, "rewards/margins": 13.197964668273926, "rewards/rejected": -12.272008895874023, "step": 2190 }, { "epoch": 0.75, "learning_rate": 4.170968148054891e-07, "logits/chosen": -0.18290169537067413, "logits/rejected": 0.648645281791687, "logps/chosen": -527.1409912109375, "logps/rejected": -509.85125732421875, "loss": 0.0239, "rewards/accuracies": 1.0, "rewards/chosen": 0.1451863795518875, "rewards/margins": 10.679492950439453, "rewards/rejected": -10.534305572509766, "step": 2200 }, { "epoch": 0.75, "eval_logits/chosen": -0.06477193534374237, "eval_logits/rejected": 0.5912536382675171, "eval_logps/chosen": -266.2344665527344, "eval_logps/rejected": -555.1663208007812, "eval_loss": 0.020906928926706314, "eval_rewards/accuracies": 0.994107723236084, "eval_rewards/chosen": 0.824713945388794, "eval_rewards/margins": 11.331680297851562, "eval_rewards/rejected": -10.506965637207031, "eval_runtime": 528.8842, "eval_samples_per_second": 17.962, "eval_steps_per_second": 0.562, "step": 2200 }, { "epoch": 0.75, "learning_rate": 4.164673297242855e-07, "logits/chosen": -0.20305442810058594, "logits/rejected": 0.4324674606323242, "logps/chosen": -445.89459228515625, "logps/rejected": -562.4218139648438, "loss": 0.0313, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5024852156639099, "rewards/margins": 9.698598861694336, "rewards/rejected": -9.196114540100098, "step": 2210 }, { "epoch": 0.75, "learning_rate": 4.1583784464308193e-07, "logits/chosen": -0.06496913731098175, "logits/rejected": 0.3307177722454071, "logps/chosen": -193.982666015625, "logps/rejected": -496.25555419921875, "loss": 0.0317, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7482525706291199, "rewards/margins": 11.48645305633545, "rewards/rejected": -10.738200187683105, "step": 2220 }, { "epoch": 0.76, "learning_rate": 4.152083595618784e-07, "logits/chosen": -0.21572966873645782, "logits/rejected": 0.5808408260345459, "logps/chosen": -199.22390747070312, "logps/rejected": -430.3890686035156, "loss": 0.0254, "rewards/accuracies": 1.0, "rewards/chosen": 0.8668169975280762, "rewards/margins": 10.849908828735352, "rewards/rejected": -9.98309326171875, "step": 2230 }, { "epoch": 0.76, "learning_rate": 4.145788744806748e-07, "logits/chosen": -0.04192575067281723, "logits/rejected": 0.43682247400283813, "logps/chosen": -212.11679077148438, "logps/rejected": -512.9453735351562, "loss": 0.0133, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9113252758979797, "rewards/margins": 10.751002311706543, "rewards/rejected": -9.839677810668945, "step": 2240 }, { "epoch": 0.76, "learning_rate": 4.139493893994712e-07, "logits/chosen": -0.20572011172771454, "logits/rejected": 0.6226992011070251, "logps/chosen": -228.646484375, "logps/rejected": -395.66900634765625, "loss": 0.022, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5656083822250366, "rewards/margins": 11.009600639343262, "rewards/rejected": -10.44399356842041, "step": 2250 }, { "epoch": 0.77, "learning_rate": 4.133199043182676e-07, "logits/chosen": -0.11542689800262451, "logits/rejected": 0.6017035245895386, "logps/chosen": -213.2447509765625, "logps/rejected": -542.9639892578125, "loss": 0.0206, "rewards/accuracies": 1.0, "rewards/chosen": 0.7725807428359985, "rewards/margins": 11.702407836914062, "rewards/rejected": -10.929826736450195, "step": 2260 }, { "epoch": 0.77, "learning_rate": 4.1269041923706404e-07, "logits/chosen": -0.15504463016986847, "logits/rejected": 0.4869251847267151, "logps/chosen": -319.1248779296875, "logps/rejected": -489.453125, "loss": 0.019, "rewards/accuracies": 1.0, "rewards/chosen": 0.782381534576416, "rewards/margins": 12.115377426147461, "rewards/rejected": -11.33299446105957, "step": 2270 }, { "epoch": 0.77, "learning_rate": 4.1206093415586047e-07, "logits/chosen": -0.14119437336921692, "logits/rejected": 0.32113099098205566, "logps/chosen": -223.0038299560547, "logps/rejected": -492.20916748046875, "loss": 0.03, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 1.375575304031372, "rewards/margins": 11.673541069030762, "rewards/rejected": -10.297967910766602, "step": 2280 }, { "epoch": 0.78, "learning_rate": 4.1143144907465694e-07, "logits/chosen": -0.07326892763376236, "logits/rejected": 0.43182024359703064, "logps/chosen": -222.5634002685547, "logps/rejected": -558.0598754882812, "loss": 0.0395, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7403125166893005, "rewards/margins": 10.397378921508789, "rewards/rejected": -9.657066345214844, "step": 2290 }, { "epoch": 0.78, "learning_rate": 4.1080196399345336e-07, "logits/chosen": -0.15113991498947144, "logits/rejected": 0.4731000065803528, "logps/chosen": -257.635986328125, "logps/rejected": -565.606201171875, "loss": 0.0219, "rewards/accuracies": 1.0, "rewards/chosen": 0.8646310567855835, "rewards/margins": 10.384620666503906, "rewards/rejected": -9.519989967346191, "step": 2300 }, { "epoch": 0.78, "eval_logits/chosen": -0.08079363405704498, "eval_logits/rejected": 0.5941591858863831, "eval_logps/chosen": -267.14739990234375, "eval_logps/rejected": -555.2272338867188, "eval_loss": 0.022073844447731972, "eval_rewards/accuracies": 0.9924242496490479, "eval_rewards/chosen": 0.733421266078949, "eval_rewards/margins": 11.246471405029297, "eval_rewards/rejected": -10.513050079345703, "eval_runtime": 527.9937, "eval_samples_per_second": 17.993, "eval_steps_per_second": 0.563, "step": 2300 }, { "epoch": 0.79, "learning_rate": 4.101724789122498e-07, "logits/chosen": -0.09357740730047226, "logits/rejected": 0.4840284287929535, "logps/chosen": -354.61181640625, "logps/rejected": -553.3482055664062, "loss": 0.0275, "rewards/accuracies": 1.0, "rewards/chosen": 0.3784427344799042, "rewards/margins": 10.83348560333252, "rewards/rejected": -10.455041885375977, "step": 2310 }, { "epoch": 0.79, "learning_rate": 4.0954299383104616e-07, "logits/chosen": -0.2171735018491745, "logits/rejected": 0.35860520601272583, "logps/chosen": -251.4418182373047, "logps/rejected": -466.4185485839844, "loss": 0.0105, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7791914939880371, "rewards/margins": 10.050416946411133, "rewards/rejected": -9.271225929260254, "step": 2320 }, { "epoch": 0.79, "learning_rate": 4.089135087498426e-07, "logits/chosen": -0.2093077003955841, "logits/rejected": 0.6616267561912537, "logps/chosen": -454.29266357421875, "logps/rejected": -494.3580627441406, "loss": 0.0274, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1907956749200821, "rewards/margins": 10.962419509887695, "rewards/rejected": -10.771623611450195, "step": 2330 }, { "epoch": 0.8, "learning_rate": 4.08284023668639e-07, "logits/chosen": -0.12679234147071838, "logits/rejected": 0.49535074830055237, "logps/chosen": -267.88836669921875, "logps/rejected": -868.2401123046875, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 0.8079283833503723, "rewards/margins": 12.063997268676758, "rewards/rejected": -11.25606918334961, "step": 2340 }, { "epoch": 0.8, "learning_rate": 4.076545385874355e-07, "logits/chosen": -0.17443695664405823, "logits/rejected": 0.527366042137146, "logps/chosen": -248.3028106689453, "logps/rejected": -424.52581787109375, "loss": 0.0196, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.728929340839386, "rewards/margins": 12.667831420898438, "rewards/rejected": -11.938901901245117, "step": 2350 }, { "epoch": 0.8, "learning_rate": 4.070250535062319e-07, "logits/chosen": -0.15069898962974548, "logits/rejected": 0.5786216855049133, "logps/chosen": -202.11904907226562, "logps/rejected": -487.2919006347656, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": 1.2083404064178467, "rewards/margins": 13.486035346984863, "rewards/rejected": -12.27769660949707, "step": 2360 }, { "epoch": 0.81, "learning_rate": 4.063955684250283e-07, "logits/chosen": -0.10182011127471924, "logits/rejected": 0.3610193431377411, "logps/chosen": -255.92507934570312, "logps/rejected": -636.9708862304688, "loss": 0.024, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7685899138450623, "rewards/margins": 10.659513473510742, "rewards/rejected": -9.890922546386719, "step": 2370 }, { "epoch": 0.81, "learning_rate": 4.0576608334382475e-07, "logits/chosen": -0.15635620057582855, "logits/rejected": 0.5533707737922668, "logps/chosen": -284.06561279296875, "logps/rejected": -480.6683654785156, "loss": 0.0163, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4815060496330261, "rewards/margins": 12.727840423583984, "rewards/rejected": -12.246333122253418, "step": 2380 }, { "epoch": 0.81, "learning_rate": 4.051365982626211e-07, "logits/chosen": -0.0062142787501215935, "logits/rejected": 0.4701073169708252, "logps/chosen": -282.1827392578125, "logps/rejected": -660.7125244140625, "loss": 0.0393, "rewards/accuracies": 1.0, "rewards/chosen": 0.625975489616394, "rewards/margins": 12.382850646972656, "rewards/rejected": -11.756875991821289, "step": 2390 }, { "epoch": 0.82, "learning_rate": 4.0450711318141754e-07, "logits/chosen": -0.06439328193664551, "logits/rejected": 0.4563215374946594, "logps/chosen": -232.7467041015625, "logps/rejected": -568.8533325195312, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": 1.0737766027450562, "rewards/margins": 11.858481407165527, "rewards/rejected": -10.78470516204834, "step": 2400 }, { "epoch": 0.82, "eval_logits/chosen": -0.07696208357810974, "eval_logits/rejected": 0.6005997061729431, "eval_logps/chosen": -265.5723876953125, "eval_logps/rejected": -560.9146118164062, "eval_loss": 0.02003282867372036, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.8909235000610352, "eval_rewards/margins": 11.972716331481934, "eval_rewards/rejected": -11.081791877746582, "eval_runtime": 530.2819, "eval_samples_per_second": 17.915, "eval_steps_per_second": 0.56, "step": 2400 }, { "epoch": 0.82, "learning_rate": 4.03877628100214e-07, "logits/chosen": -0.08212999999523163, "logits/rejected": 0.5023364424705505, "logps/chosen": -231.1219024658203, "logps/rejected": -461.01702880859375, "loss": 0.0165, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8222377896308899, "rewards/margins": 11.51823616027832, "rewards/rejected": -10.695998191833496, "step": 2410 }, { "epoch": 0.82, "learning_rate": 4.0324814301901044e-07, "logits/chosen": -0.2340635061264038, "logits/rejected": 0.439828097820282, "logps/chosen": -314.5412292480469, "logps/rejected": -462.94866943359375, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": 0.9882681965827942, "rewards/margins": 11.283441543579102, "rewards/rejected": -10.295174598693848, "step": 2420 }, { "epoch": 0.83, "learning_rate": 4.0261865793780686e-07, "logits/chosen": -0.12157144397497177, "logits/rejected": 0.3841712474822998, "logps/chosen": -269.59613037109375, "logps/rejected": -610.4569702148438, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": 0.7235780954360962, "rewards/margins": 11.661900520324707, "rewards/rejected": -10.938322067260742, "step": 2430 }, { "epoch": 0.83, "learning_rate": 4.019891728566033e-07, "logits/chosen": -0.15745995938777924, "logits/rejected": 0.5135104060173035, "logps/chosen": -284.58233642578125, "logps/rejected": -659.0477294921875, "loss": 0.023, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6467440128326416, "rewards/margins": 11.165446281433105, "rewards/rejected": -10.518701553344727, "step": 2440 }, { "epoch": 0.83, "learning_rate": 4.013596877753997e-07, "logits/chosen": -0.02436182275414467, "logits/rejected": 0.42639461159706116, "logps/chosen": -217.4248046875, "logps/rejected": -623.3473510742188, "loss": 0.0245, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9206333160400391, "rewards/margins": 12.200538635253906, "rewards/rejected": -11.279905319213867, "step": 2450 }, { "epoch": 0.84, "learning_rate": 4.0073020269419613e-07, "logits/chosen": -0.09613429009914398, "logits/rejected": 0.3641251027584076, "logps/chosen": -202.6272735595703, "logps/rejected": -550.1214599609375, "loss": 0.0276, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7099416851997375, "rewards/margins": 11.553439140319824, "rewards/rejected": -10.843497276306152, "step": 2460 }, { "epoch": 0.84, "learning_rate": 4.0010071761299255e-07, "logits/chosen": -0.14863647520542145, "logits/rejected": 0.49279046058654785, "logps/chosen": -275.4486389160156, "logps/rejected": -533.4655151367188, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": 0.8163439035415649, "rewards/margins": 12.271845817565918, "rewards/rejected": -11.4555025100708, "step": 2470 }, { "epoch": 0.84, "learning_rate": 3.99471232531789e-07, "logits/chosen": -0.09219856560230255, "logits/rejected": 0.4974042475223541, "logps/chosen": -275.95166015625, "logps/rejected": -512.1917724609375, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": 0.648658037185669, "rewards/margins": 11.59749984741211, "rewards/rejected": -10.948843002319336, "step": 2480 }, { "epoch": 0.85, "learning_rate": 3.988417474505854e-07, "logits/chosen": -0.25812116265296936, "logits/rejected": 0.48852747678756714, "logps/chosen": -247.64901733398438, "logps/rejected": -413.83673095703125, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": 0.8571916818618774, "rewards/margins": 12.030542373657227, "rewards/rejected": -11.173351287841797, "step": 2490 }, { "epoch": 0.85, "learning_rate": 3.982122623693818e-07, "logits/chosen": -0.20920786261558533, "logits/rejected": 0.6405856609344482, "logps/chosen": -327.2900695800781, "logps/rejected": -434.55718994140625, "loss": 0.0135, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7203938364982605, "rewards/margins": 13.17688274383545, "rewards/rejected": -12.456488609313965, "step": 2500 }, { "epoch": 0.85, "eval_logits/chosen": -0.07112111151218414, "eval_logits/rejected": 0.5947590470314026, "eval_logps/chosen": -267.20843505859375, "eval_logps/rejected": -563.2269287109375, "eval_loss": 0.01874653436243534, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 0.7273170351982117, "eval_rewards/margins": 12.040336608886719, "eval_rewards/rejected": -11.313019752502441, "eval_runtime": 528.3051, "eval_samples_per_second": 17.982, "eval_steps_per_second": 0.562, "step": 2500 }, { "epoch": 0.85, "learning_rate": 3.9758277728817824e-07, "logits/chosen": -0.18794643878936768, "logits/rejected": 0.46528005599975586, "logps/chosen": -288.76458740234375, "logps/rejected": -552.2510375976562, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": 0.5760281085968018, "rewards/margins": 12.057085037231445, "rewards/rejected": -11.481058120727539, "step": 2510 }, { "epoch": 0.86, "learning_rate": 3.9695329220697467e-07, "logits/chosen": -0.00617934251204133, "logits/rejected": 0.3865601122379303, "logps/chosen": -209.2038116455078, "logps/rejected": -775.068115234375, "loss": 0.0188, "rewards/accuracies": 1.0, "rewards/chosen": 0.7824047207832336, "rewards/margins": 13.376935005187988, "rewards/rejected": -12.59453010559082, "step": 2520 }, { "epoch": 0.86, "learning_rate": 3.9632380712577114e-07, "logits/chosen": -0.11167693138122559, "logits/rejected": 0.36250877380371094, "logps/chosen": -286.03155517578125, "logps/rejected": -571.5970458984375, "loss": 0.0228, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7994006276130676, "rewards/margins": 11.978608131408691, "rewards/rejected": -11.179207801818848, "step": 2530 }, { "epoch": 0.86, "learning_rate": 3.9569432204456756e-07, "logits/chosen": -0.11374542862176895, "logits/rejected": 0.4673793911933899, "logps/chosen": -217.7483367919922, "logps/rejected": -435.0228576660156, "loss": 0.0139, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3813634514808655, "rewards/margins": 11.404729843139648, "rewards/rejected": -11.023366928100586, "step": 2540 }, { "epoch": 0.87, "learning_rate": 3.9506483696336393e-07, "logits/chosen": -0.14110806584358215, "logits/rejected": 0.48759546875953674, "logps/chosen": -266.063232421875, "logps/rejected": -575.5397338867188, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": 0.5659262537956238, "rewards/margins": 13.308695793151855, "rewards/rejected": -12.74277114868164, "step": 2550 }, { "epoch": 0.87, "learning_rate": 3.9443535188216036e-07, "logits/chosen": -0.07576151192188263, "logits/rejected": 0.5750542879104614, "logps/chosen": -215.49169921875, "logps/rejected": -641.0803833007812, "loss": 0.0178, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7046786546707153, "rewards/margins": 13.290704727172852, "rewards/rejected": -12.586023330688477, "step": 2560 }, { "epoch": 0.87, "learning_rate": 3.938058668009568e-07, "logits/chosen": -0.14013835787773132, "logits/rejected": 0.531544029712677, "logps/chosen": -287.74176025390625, "logps/rejected": -603.5052490234375, "loss": 0.0162, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19667640328407288, "rewards/margins": 12.567159652709961, "rewards/rejected": -12.370481491088867, "step": 2570 }, { "epoch": 0.88, "learning_rate": 3.931763817197532e-07, "logits/chosen": -0.24508266150951385, "logits/rejected": 0.5287877321243286, "logps/chosen": -272.97491455078125, "logps/rejected": -348.9820556640625, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": 0.4897540509700775, "rewards/margins": 11.445518493652344, "rewards/rejected": -10.955763816833496, "step": 2580 }, { "epoch": 0.88, "learning_rate": 3.925468966385497e-07, "logits/chosen": -0.21008244156837463, "logits/rejected": 0.5857201218605042, "logps/chosen": -327.2457580566406, "logps/rejected": -374.1681213378906, "loss": 0.0269, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9311743974685669, "rewards/margins": 12.178866386413574, "rewards/rejected": -11.247692108154297, "step": 2590 }, { "epoch": 0.88, "learning_rate": 3.919174115573461e-07, "logits/chosen": -0.09638357907533646, "logits/rejected": 0.564531683921814, "logps/chosen": -214.39315795898438, "logps/rejected": -595.1641845703125, "loss": 0.0211, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8528212308883667, "rewards/margins": 14.073602676391602, "rewards/rejected": -13.220781326293945, "step": 2600 }, { "epoch": 0.88, "eval_logits/chosen": -0.07191111147403717, "eval_logits/rejected": 0.5818330645561218, "eval_logps/chosen": -269.0913391113281, "eval_logps/rejected": -568.5889892578125, "eval_loss": 0.019040482118725777, "eval_rewards/accuracies": 0.9949495196342468, "eval_rewards/chosen": 0.5390294194221497, "eval_rewards/margins": 12.388264656066895, "eval_rewards/rejected": -11.849235534667969, "eval_runtime": 529.8521, "eval_samples_per_second": 17.93, "eval_steps_per_second": 0.561, "step": 2600 }, { "epoch": 0.89, "learning_rate": 3.912879264761425e-07, "logits/chosen": -0.030538285151124, "logits/rejected": 0.3915198743343353, "logps/chosen": -280.9195251464844, "logps/rejected": -605.1884765625, "loss": 0.0167, "rewards/accuracies": 1.0, "rewards/chosen": 0.4870836138725281, "rewards/margins": 11.2864408493042, "rewards/rejected": -10.799357414245605, "step": 2610 }, { "epoch": 0.89, "learning_rate": 3.906584413949389e-07, "logits/chosen": -0.1368604302406311, "logits/rejected": 0.4227638840675354, "logps/chosen": -361.6731872558594, "logps/rejected": -452.4187927246094, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 0.18623751401901245, "rewards/margins": 10.402421951293945, "rewards/rejected": -10.216184616088867, "step": 2620 }, { "epoch": 0.89, "learning_rate": 3.900289563137353e-07, "logits/chosen": -0.06431153416633606, "logits/rejected": 0.4335872530937195, "logps/chosen": -235.37985229492188, "logps/rejected": -594.444580078125, "loss": 0.0148, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.46397948265075684, "rewards/margins": 13.44219970703125, "rewards/rejected": -12.978219985961914, "step": 2630 }, { "epoch": 0.9, "learning_rate": 3.8939947123253174e-07, "logits/chosen": -0.12103205919265747, "logits/rejected": 0.5143983960151672, "logps/chosen": -263.63702392578125, "logps/rejected": -484.204345703125, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": -0.16590793430805206, "rewards/margins": 10.881414413452148, "rewards/rejected": -11.047323226928711, "step": 2640 }, { "epoch": 0.9, "learning_rate": 3.887699861513282e-07, "logits/chosen": -0.14152146875858307, "logits/rejected": 0.46752557158470154, "logps/chosen": -194.6835479736328, "logps/rejected": -461.307861328125, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": 0.4606015086174011, "rewards/margins": 13.556489944458008, "rewards/rejected": -13.095888137817383, "step": 2650 }, { "epoch": 0.9, "learning_rate": 3.8814050107012464e-07, "logits/chosen": -0.167997807264328, "logits/rejected": 0.5616825222969055, "logps/chosen": -278.96868896484375, "logps/rejected": -413.35125732421875, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": 0.629269003868103, "rewards/margins": 13.446261405944824, "rewards/rejected": -12.816993713378906, "step": 2660 }, { "epoch": 0.91, "learning_rate": 3.8751101598892106e-07, "logits/chosen": -0.07479087263345718, "logits/rejected": 0.32740721106529236, "logps/chosen": -220.5128173828125, "logps/rejected": -683.3924560546875, "loss": 0.0134, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.142855167388916, "rewards/margins": 13.543848037719727, "rewards/rejected": -12.400991439819336, "step": 2670 }, { "epoch": 0.91, "learning_rate": 3.868815309077175e-07, "logits/chosen": -0.07700560986995697, "logits/rejected": 0.4741830825805664, "logps/chosen": -275.77886962890625, "logps/rejected": -727.8414916992188, "loss": 0.0131, "rewards/accuracies": 1.0, "rewards/chosen": 0.7903385162353516, "rewards/margins": 12.989718437194824, "rewards/rejected": -12.199380874633789, "step": 2680 }, { "epoch": 0.91, "learning_rate": 3.862520458265139e-07, "logits/chosen": -0.07967878878116608, "logits/rejected": 0.587452232837677, "logps/chosen": -309.2009582519531, "logps/rejected": -789.654541015625, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": 0.5250208973884583, "rewards/margins": 15.006256103515625, "rewards/rejected": -14.481234550476074, "step": 2690 }, { "epoch": 0.92, "learning_rate": 3.856225607453103e-07, "logits/chosen": -0.09481174498796463, "logits/rejected": 0.3942771553993225, "logps/chosen": -177.1461639404297, "logps/rejected": -469.56939697265625, "loss": 0.0209, "rewards/accuracies": 1.0, "rewards/chosen": 0.848396897315979, "rewards/margins": 12.999656677246094, "rewards/rejected": -12.15125846862793, "step": 2700 }, { "epoch": 0.92, "eval_logits/chosen": -0.09120472520589828, "eval_logits/rejected": 0.5537270307540894, "eval_logps/chosen": -269.5248718261719, "eval_logps/rejected": -573.2724609375, "eval_loss": 0.01702161878347397, "eval_rewards/accuracies": 0.9949495196342468, "eval_rewards/chosen": 0.49567750096321106, "eval_rewards/margins": 12.813257217407227, "eval_rewards/rejected": -12.317580223083496, "eval_runtime": 528.8104, "eval_samples_per_second": 17.965, "eval_steps_per_second": 0.562, "step": 2700 }, { "epoch": 0.92, "learning_rate": 3.8499307566410675e-07, "logits/chosen": -0.14772407710552216, "logits/rejected": 0.39948028326034546, "logps/chosen": -215.0373077392578, "logps/rejected": -532.5383911132812, "loss": 0.0203, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.513881504535675, "rewards/margins": 13.959579467773438, "rewards/rejected": -13.445696830749512, "step": 2710 }, { "epoch": 0.92, "learning_rate": 3.843635905829032e-07, "logits/chosen": -0.13698230683803558, "logits/rejected": 0.49852800369262695, "logps/chosen": -344.6234436035156, "logps/rejected": -655.9090576171875, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": 0.3518791198730469, "rewards/margins": 12.87934684753418, "rewards/rejected": -12.52746868133545, "step": 2720 }, { "epoch": 0.93, "learning_rate": 3.837341055016996e-07, "logits/chosen": -0.057786840945482254, "logits/rejected": 0.42463135719299316, "logps/chosen": -211.4300079345703, "logps/rejected": -646.4425048828125, "loss": 0.0155, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.45557403564453125, "rewards/margins": 11.349102020263672, "rewards/rejected": -10.89352798461914, "step": 2730 }, { "epoch": 0.93, "learning_rate": 3.83104620420496e-07, "logits/chosen": -0.0599808394908905, "logits/rejected": 0.309297651052475, "logps/chosen": -318.2600402832031, "logps/rejected": -848.4415893554688, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.2335256040096283, "rewards/margins": 12.227529525756836, "rewards/rejected": -11.994003295898438, "step": 2740 }, { "epoch": 0.93, "learning_rate": 3.8247513533929244e-07, "logits/chosen": -0.1070268377661705, "logits/rejected": 0.48339715600013733, "logps/chosen": -215.1229705810547, "logps/rejected": -696.1378784179688, "loss": 0.0132, "rewards/accuracies": 1.0, "rewards/chosen": 0.5925717949867249, "rewards/margins": 13.109349250793457, "rewards/rejected": -12.516777038574219, "step": 2750 }, { "epoch": 0.94, "learning_rate": 3.8184565025808887e-07, "logits/chosen": -0.12322517484426498, "logits/rejected": 0.5763725638389587, "logps/chosen": -302.0684509277344, "logps/rejected": -596.053466796875, "loss": 0.0148, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.44381818175315857, "rewards/margins": 12.98021125793457, "rewards/rejected": -12.536392211914062, "step": 2760 }, { "epoch": 0.94, "learning_rate": 3.8121616517688534e-07, "logits/chosen": -0.08368263393640518, "logits/rejected": 0.3957024812698364, "logps/chosen": -203.96292114257812, "logps/rejected": -580.15283203125, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 1.186550259590149, "rewards/margins": 13.399798393249512, "rewards/rejected": -12.213247299194336, "step": 2770 }, { "epoch": 0.94, "learning_rate": 3.805866800956817e-07, "logits/chosen": -0.18107816576957703, "logits/rejected": 0.6798295974731445, "logps/chosen": -209.6468048095703, "logps/rejected": -356.6151123046875, "loss": 0.0159, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.6069614291191101, "rewards/margins": 13.541903495788574, "rewards/rejected": -12.934942245483398, "step": 2780 }, { "epoch": 0.95, "learning_rate": 3.7995719501447813e-07, "logits/chosen": -0.14779521524906158, "logits/rejected": 0.4904769957065582, "logps/chosen": -226.6512908935547, "logps/rejected": -525.0828857421875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 0.7781469821929932, "rewards/margins": 15.461952209472656, "rewards/rejected": -14.683804512023926, "step": 2790 }, { "epoch": 0.95, "learning_rate": 3.7932770993327456e-07, "logits/chosen": -0.1434634029865265, "logits/rejected": 0.521730363368988, "logps/chosen": -218.3976593017578, "logps/rejected": -800.5870361328125, "loss": 0.0168, "rewards/accuracies": 1.0, "rewards/chosen": 0.5224558711051941, "rewards/margins": 13.158914566040039, "rewards/rejected": -12.636457443237305, "step": 2800 }, { "epoch": 0.95, "eval_logits/chosen": -0.08695169538259506, "eval_logits/rejected": 0.5471464991569519, "eval_logps/chosen": -268.6321105957031, "eval_logps/rejected": -572.8340454101562, "eval_loss": 0.017498521134257317, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 0.5849502682685852, "eval_rewards/margins": 12.858681678771973, "eval_rewards/rejected": -12.27373218536377, "eval_runtime": 529.8039, "eval_samples_per_second": 17.931, "eval_steps_per_second": 0.561, "step": 2800 }, { "epoch": 0.96, "learning_rate": 3.78698224852071e-07, "logits/chosen": -0.13975068926811218, "logits/rejected": 0.5137497782707214, "logps/chosen": -207.9899139404297, "logps/rejected": -403.18524169921875, "loss": 0.0212, "rewards/accuracies": 1.0, "rewards/chosen": 0.6654854416847229, "rewards/margins": 13.079936027526855, "rewards/rejected": -12.414449691772461, "step": 2810 }, { "epoch": 0.96, "learning_rate": 3.780687397708674e-07, "logits/chosen": -0.15226922929286957, "logits/rejected": 0.4262031614780426, "logps/chosen": -231.24429321289062, "logps/rejected": -414.29412841796875, "loss": 0.0131, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.35701635479927063, "rewards/margins": 12.19134521484375, "rewards/rejected": -11.834329605102539, "step": 2820 }, { "epoch": 0.96, "learning_rate": 3.774392546896638e-07, "logits/chosen": -0.10910400003194809, "logits/rejected": 0.3090236186981201, "logps/chosen": -297.28118896484375, "logps/rejected": -696.0444946289062, "loss": 0.0158, "rewards/accuracies": 1.0, "rewards/chosen": 0.4488081932067871, "rewards/margins": 12.045207977294922, "rewards/rejected": -11.596399307250977, "step": 2830 }, { "epoch": 0.97, "learning_rate": 3.768097696084603e-07, "logits/chosen": -0.1400688737630844, "logits/rejected": 0.5129746198654175, "logps/chosen": -261.764892578125, "logps/rejected": -601.9446411132812, "loss": 0.0193, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.8136240839958191, "rewards/margins": 14.573637008666992, "rewards/rejected": -13.760014533996582, "step": 2840 }, { "epoch": 0.97, "learning_rate": 3.761802845272567e-07, "logits/chosen": -0.11414362490177155, "logits/rejected": 0.5449014902114868, "logps/chosen": -273.9658508300781, "logps/rejected": -512.8095703125, "loss": 0.0176, "rewards/accuracies": 1.0, "rewards/chosen": 0.8329913020133972, "rewards/margins": 14.56764030456543, "rewards/rejected": -13.734649658203125, "step": 2850 }, { "epoch": 0.97, "learning_rate": 3.755507994460531e-07, "logits/chosen": -0.12390995025634766, "logits/rejected": 0.4711190164089203, "logps/chosen": -229.75350952148438, "logps/rejected": -448.346923828125, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 0.35145479440689087, "rewards/margins": 12.791833877563477, "rewards/rejected": -12.440378189086914, "step": 2860 }, { "epoch": 0.98, "learning_rate": 3.749213143648495e-07, "logits/chosen": -0.15647295117378235, "logits/rejected": 0.3714851140975952, "logps/chosen": -344.99505615234375, "logps/rejected": -641.271240234375, "loss": 0.0304, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2010762244462967, "rewards/margins": 12.048068046569824, "rewards/rejected": -11.846990585327148, "step": 2870 }, { "epoch": 0.98, "learning_rate": 3.7429182928364594e-07, "logits/chosen": -0.0512438528239727, "logits/rejected": 0.2887820601463318, "logps/chosen": -281.89935302734375, "logps/rejected": -831.63916015625, "loss": 0.0222, "rewards/accuracies": 1.0, "rewards/chosen": 0.10141458362340927, "rewards/margins": 12.418639183044434, "rewards/rejected": -12.317224502563477, "step": 2880 }, { "epoch": 0.98, "learning_rate": 3.7366234420244236e-07, "logits/chosen": -0.14991262555122375, "logits/rejected": 0.4734499454498291, "logps/chosen": -422.3702697753906, "logps/rejected": -665.525634765625, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": 0.45316439867019653, "rewards/margins": 13.74336051940918, "rewards/rejected": -13.290196418762207, "step": 2890 }, { "epoch": 0.99, "learning_rate": 3.7303285912123884e-07, "logits/chosen": -0.15185262262821198, "logits/rejected": 0.4371206760406494, "logps/chosen": -221.96261596679688, "logps/rejected": -533.8972778320312, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": 0.38257449865341187, "rewards/margins": 13.530128479003906, "rewards/rejected": -13.147554397583008, "step": 2900 }, { "epoch": 0.99, "eval_logits/chosen": -0.08102447539567947, "eval_logits/rejected": 0.5440705418586731, "eval_logps/chosen": -268.79095458984375, "eval_logps/rejected": -577.8817138671875, "eval_loss": 0.017108755186200142, "eval_rewards/accuracies": 0.9949495196342468, "eval_rewards/chosen": 0.5690661072731018, "eval_rewards/margins": 13.347569465637207, "eval_rewards/rejected": -12.778502464294434, "eval_runtime": 528.1707, "eval_samples_per_second": 17.987, "eval_steps_per_second": 0.562, "step": 2900 }, { "epoch": 0.99, "learning_rate": 3.7240337404003526e-07, "logits/chosen": -0.15038302540779114, "logits/rejected": 0.29201313853263855, "logps/chosen": -219.22592163085938, "logps/rejected": -706.9136352539062, "loss": 0.0142, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.18758581578731537, "rewards/margins": 13.352384567260742, "rewards/rejected": -13.16479778289795, "step": 2910 }, { "epoch": 0.99, "learning_rate": 3.717738889588317e-07, "logits/chosen": -0.1067178025841713, "logits/rejected": 0.382688045501709, "logps/chosen": -231.7379608154297, "logps/rejected": -739.9605712890625, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": 0.3437434136867523, "rewards/margins": 14.248052597045898, "rewards/rejected": -13.90431022644043, "step": 2920 }, { "epoch": 1.0, "learning_rate": 3.7114440387762805e-07, "logits/chosen": -0.15428930521011353, "logits/rejected": 0.4265620708465576, "logps/chosen": -216.2051239013672, "logps/rejected": -411.7529296875, "loss": 0.012, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.01843712292611599, "rewards/margins": 13.127111434936523, "rewards/rejected": -13.145548820495605, "step": 2930 }, { "epoch": 1.0, "learning_rate": 3.705149187964245e-07, "logits/chosen": -0.18938064575195312, "logits/rejected": 0.47105178236961365, "logps/chosen": -291.40057373046875, "logps/rejected": -532.2966918945312, "loss": 0.0126, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.24124033749103546, "rewards/margins": 13.044621467590332, "rewards/rejected": -12.803380966186523, "step": 2940 }, { "epoch": 1.0, "learning_rate": 3.698854337152209e-07, "logits/chosen": -0.23888206481933594, "logits/rejected": 0.5088266134262085, "logps/chosen": -212.10311889648438, "logps/rejected": -485.43585205078125, "loss": 0.0156, "rewards/accuracies": 1.0, "rewards/chosen": 0.7945457696914673, "rewards/margins": 14.172282218933105, "rewards/rejected": -13.37773609161377, "step": 2950 }, { "epoch": 1.01, "learning_rate": 3.692559486340174e-07, "logits/chosen": -0.05864056944847107, "logits/rejected": 0.40118637681007385, "logps/chosen": -264.44854736328125, "logps/rejected": -896.8850708007812, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 0.4027208685874939, "rewards/margins": 13.660585403442383, "rewards/rejected": -13.25786304473877, "step": 2960 }, { "epoch": 1.01, "learning_rate": 3.686264635528138e-07, "logits/chosen": -0.16464586555957794, "logits/rejected": 0.35519278049468994, "logps/chosen": -205.8061981201172, "logps/rejected": -313.51873779296875, "loss": 0.0556, "rewards/accuracies": 1.0, "rewards/chosen": 0.4512391984462738, "rewards/margins": 13.165997505187988, "rewards/rejected": -12.71475887298584, "step": 2970 }, { "epoch": 1.01, "learning_rate": 3.679969784716102e-07, "logits/chosen": -0.13240697979927063, "logits/rejected": 0.41653507947921753, "logps/chosen": -195.60409545898438, "logps/rejected": -484.6581115722656, "loss": 0.0152, "rewards/accuracies": 1.0, "rewards/chosen": 0.4566887319087982, "rewards/margins": 13.147418022155762, "rewards/rejected": -12.690730094909668, "step": 2980 }, { "epoch": 1.02, "learning_rate": 3.6736749339040664e-07, "logits/chosen": -0.1693662703037262, "logits/rejected": 0.3362937569618225, "logps/chosen": -286.00408935546875, "logps/rejected": -626.2353515625, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": 0.6935654878616333, "rewards/margins": 13.607297897338867, "rewards/rejected": -12.913732528686523, "step": 2990 }, { "epoch": 1.02, "learning_rate": 3.6673800830920307e-07, "logits/chosen": -0.17739418148994446, "logits/rejected": 0.3879055082798004, "logps/chosen": -258.3529052734375, "logps/rejected": -545.0477294921875, "loss": 0.0134, "rewards/accuracies": 1.0, "rewards/chosen": 0.7862340211868286, "rewards/margins": 12.397263526916504, "rewards/rejected": -11.611030578613281, "step": 3000 }, { "epoch": 1.02, "eval_logits/chosen": -0.11625607311725616, "eval_logits/rejected": 0.5261458158493042, "eval_logps/chosen": -270.76910400390625, "eval_logps/rejected": -580.3504028320312, "eval_loss": 0.018808143213391304, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.37125134468078613, "eval_rewards/margins": 13.396617889404297, "eval_rewards/rejected": -13.025364875793457, "eval_runtime": 527.9787, "eval_samples_per_second": 17.993, "eval_steps_per_second": 0.563, "step": 3000 }, { "epoch": 1.02, "learning_rate": 3.6610852322799943e-07, "logits/chosen": -0.15386778116226196, "logits/rejected": 0.41680389642715454, "logps/chosen": -332.1883850097656, "logps/rejected": -596.311279296875, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.5304964184761047, "rewards/margins": 13.764398574829102, "rewards/rejected": -13.233901977539062, "step": 3010 }, { "epoch": 1.03, "learning_rate": 3.654790381467959e-07, "logits/chosen": -0.21962925791740417, "logits/rejected": 0.5407239198684692, "logps/chosen": -228.4311981201172, "logps/rejected": -388.72271728515625, "loss": 0.0098, "rewards/accuracies": 1.0, "rewards/chosen": 0.274773508310318, "rewards/margins": 13.626386642456055, "rewards/rejected": -13.35161304473877, "step": 3020 }, { "epoch": 1.03, "learning_rate": 3.6484955306559233e-07, "logits/chosen": -0.2430342137813568, "logits/rejected": 0.31790828704833984, "logps/chosen": -355.12738037109375, "logps/rejected": -470.6015625, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 0.6480463743209839, "rewards/margins": 13.547224044799805, "rewards/rejected": -12.899177551269531, "step": 3030 }, { "epoch": 1.03, "learning_rate": 3.6422006798438876e-07, "logits/chosen": -0.12265034765005112, "logits/rejected": 0.4889611303806305, "logps/chosen": -251.908447265625, "logps/rejected": -509.55157470703125, "loss": 0.0133, "rewards/accuracies": 1.0, "rewards/chosen": 0.31434959173202515, "rewards/margins": 13.672518730163574, "rewards/rejected": -13.358169555664062, "step": 3040 }, { "epoch": 1.04, "learning_rate": 3.635905829031852e-07, "logits/chosen": -0.18725822865962982, "logits/rejected": 0.4391081929206848, "logps/chosen": -341.0368957519531, "logps/rejected": -439.4134826660156, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.43673276901245117, "rewards/margins": 12.897595405578613, "rewards/rejected": -12.46086311340332, "step": 3050 }, { "epoch": 1.04, "learning_rate": 3.629610978219816e-07, "logits/chosen": -0.12597785890102386, "logits/rejected": 0.35728976130485535, "logps/chosen": -237.11074829101562, "logps/rejected": -732.3026733398438, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 0.47060489654541016, "rewards/margins": 12.454336166381836, "rewards/rejected": -11.983729362487793, "step": 3060 }, { "epoch": 1.04, "learning_rate": 3.62331612740778e-07, "logits/chosen": -0.22195923328399658, "logits/rejected": 0.4338354170322418, "logps/chosen": -335.032470703125, "logps/rejected": -573.9718627929688, "loss": 0.0115, "rewards/accuracies": 1.0, "rewards/chosen": 0.5400988459587097, "rewards/margins": 14.206143379211426, "rewards/rejected": -13.666044235229492, "step": 3070 }, { "epoch": 1.05, "learning_rate": 3.617021276595745e-07, "logits/chosen": -0.20756149291992188, "logits/rejected": 0.3465970754623413, "logps/chosen": -457.8055114746094, "logps/rejected": -465.6961975097656, "loss": 0.0138, "rewards/accuracies": 1.0, "rewards/chosen": 0.19909584522247314, "rewards/margins": 12.776103019714355, "rewards/rejected": -12.577006340026855, "step": 3080 }, { "epoch": 1.05, "learning_rate": 3.6107264257837087e-07, "logits/chosen": -0.08912526071071625, "logits/rejected": 0.358253538608551, "logps/chosen": -272.1973571777344, "logps/rejected": -681.8186645507812, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 0.3679356276988983, "rewards/margins": 15.020917892456055, "rewards/rejected": -14.652981758117676, "step": 3090 }, { "epoch": 1.05, "learning_rate": 3.604431574971673e-07, "logits/chosen": -0.1170891523361206, "logits/rejected": 0.36317330598831177, "logps/chosen": -266.10736083984375, "logps/rejected": -564.6534423828125, "loss": 0.0127, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.003286123275756836, "rewards/margins": 12.691465377807617, "rewards/rejected": -12.694753646850586, "step": 3100 }, { "epoch": 1.05, "eval_logits/chosen": -0.11285170167684555, "eval_logits/rejected": 0.49280035495758057, "eval_logps/chosen": -272.71258544921875, "eval_logps/rejected": -593.1094360351562, "eval_loss": 0.015542441979050636, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 0.17690156400203705, "eval_rewards/margins": 14.478179931640625, "eval_rewards/rejected": -14.301276206970215, "eval_runtime": 526.2335, "eval_samples_per_second": 18.053, "eval_steps_per_second": 0.564, "step": 3100 }, { "epoch": 1.06, "learning_rate": 3.598136724159637e-07, "logits/chosen": -0.19244477152824402, "logits/rejected": 0.3562130331993103, "logps/chosen": -325.08697509765625, "logps/rejected": -443.707763671875, "loss": 0.0107, "rewards/accuracies": 1.0, "rewards/chosen": 0.3590792715549469, "rewards/margins": 15.826189994812012, "rewards/rejected": -15.467111587524414, "step": 3110 }, { "epoch": 1.06, "learning_rate": 3.5918418733476014e-07, "logits/chosen": -0.1792040318250656, "logits/rejected": 0.376350075006485, "logps/chosen": -344.9609375, "logps/rejected": -583.9660034179688, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 0.8797038197517395, "rewards/margins": 15.89190673828125, "rewards/rejected": -15.01220417022705, "step": 3120 }, { "epoch": 1.06, "learning_rate": 3.5855470225355656e-07, "logits/chosen": -0.14283646643161774, "logits/rejected": 0.31446728110313416, "logps/chosen": -280.244384765625, "logps/rejected": -600.479736328125, "loss": 0.007, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3069140315055847, "rewards/margins": 13.652433395385742, "rewards/rejected": -13.34552001953125, "step": 3130 }, { "epoch": 1.07, "learning_rate": 3.5792521717235304e-07, "logits/chosen": -0.14997351169586182, "logits/rejected": 0.3398076891899109, "logps/chosen": -278.66851806640625, "logps/rejected": -478.003662109375, "loss": 0.0106, "rewards/accuracies": 1.0, "rewards/chosen": 0.585425853729248, "rewards/margins": 13.8798189163208, "rewards/rejected": -13.294395446777344, "step": 3140 }, { "epoch": 1.07, "learning_rate": 3.5729573209114946e-07, "logits/chosen": -0.17879004776477814, "logits/rejected": 0.30932989716529846, "logps/chosen": -213.9352569580078, "logps/rejected": -558.7156372070312, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 0.5194706320762634, "rewards/margins": 12.995248794555664, "rewards/rejected": -12.475778579711914, "step": 3150 }, { "epoch": 1.07, "learning_rate": 3.5666624700994583e-07, "logits/chosen": -0.05548171326518059, "logits/rejected": 0.3703877925872803, "logps/chosen": -225.4084930419922, "logps/rejected": -898.7545166015625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.4377104341983795, "rewards/margins": 14.159419059753418, "rewards/rejected": -13.721707344055176, "step": 3160 }, { "epoch": 1.08, "learning_rate": 3.5603676192874225e-07, "logits/chosen": -0.21437790989875793, "logits/rejected": 0.29642224311828613, "logps/chosen": -203.72225952148438, "logps/rejected": -429.0672912597656, "loss": 0.0319, "rewards/accuracies": 1.0, "rewards/chosen": 0.9248259663581848, "rewards/margins": 13.500173568725586, "rewards/rejected": -12.575345993041992, "step": 3170 }, { "epoch": 1.08, "learning_rate": 3.554072768475387e-07, "logits/chosen": -0.24128814041614532, "logits/rejected": 0.39583396911621094, "logps/chosen": -360.6728515625, "logps/rejected": -541.4347534179688, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.28943270444869995, "rewards/margins": 12.571670532226562, "rewards/rejected": -12.28223705291748, "step": 3180 }, { "epoch": 1.08, "learning_rate": 3.547777917663351e-07, "logits/chosen": -0.18002577126026154, "logits/rejected": 0.41906601190567017, "logps/chosen": -265.5144348144531, "logps/rejected": -622.9805908203125, "loss": 0.0347, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3608430027961731, "rewards/margins": 13.80253791809082, "rewards/rejected": -13.441694259643555, "step": 3190 }, { "epoch": 1.09, "learning_rate": 3.5414830668513157e-07, "logits/chosen": -0.23837295174598694, "logits/rejected": 0.42499786615371704, "logps/chosen": -201.17269897460938, "logps/rejected": -472.4668884277344, "loss": 0.0072, "rewards/accuracies": 1.0, "rewards/chosen": 0.6717270612716675, "rewards/margins": 14.364822387695312, "rewards/rejected": -13.693095207214355, "step": 3200 }, { "epoch": 1.09, "eval_logits/chosen": -0.14791913330554962, "eval_logits/rejected": 0.4753006398677826, "eval_logps/chosen": -272.0074157714844, "eval_logps/rejected": -586.5951538085938, "eval_loss": 0.0146998455747962, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.24741852283477783, "eval_rewards/margins": 13.897268295288086, "eval_rewards/rejected": -13.64985179901123, "eval_runtime": 527.9977, "eval_samples_per_second": 17.993, "eval_steps_per_second": 0.563, "step": 3200 }, { "epoch": 1.09, "learning_rate": 3.53518821603928e-07, "logits/chosen": -0.22014132142066956, "logits/rejected": 0.45485371351242065, "logps/chosen": -208.58895874023438, "logps/rejected": -467.513427734375, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": 0.017479026690125465, "rewards/margins": 13.714553833007812, "rewards/rejected": -13.697074890136719, "step": 3210 }, { "epoch": 1.09, "learning_rate": 3.528893365227244e-07, "logits/chosen": -0.21733424067497253, "logits/rejected": 0.4469735026359558, "logps/chosen": -367.9599304199219, "logps/rejected": -525.0040283203125, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 0.10493244230747223, "rewards/margins": 14.323179244995117, "rewards/rejected": -14.21824836730957, "step": 3220 }, { "epoch": 1.1, "learning_rate": 3.5225985144152084e-07, "logits/chosen": -0.18706978857517242, "logits/rejected": 0.5261206030845642, "logps/chosen": -226.6429443359375, "logps/rejected": -479.5487365722656, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 0.43367934226989746, "rewards/margins": 15.854001998901367, "rewards/rejected": -15.420324325561523, "step": 3230 }, { "epoch": 1.1, "learning_rate": 3.516303663603172e-07, "logits/chosen": -0.31243711709976196, "logits/rejected": 0.4271464943885803, "logps/chosen": -278.55206298828125, "logps/rejected": -523.6666259765625, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 0.12881168723106384, "rewards/margins": 13.570047378540039, "rewards/rejected": -13.44123649597168, "step": 3240 }, { "epoch": 1.1, "learning_rate": 3.5100088127911363e-07, "logits/chosen": -0.20127888023853302, "logits/rejected": 0.6034557223320007, "logps/chosen": -317.86956787109375, "logps/rejected": -473.9720764160156, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 0.4128205180168152, "rewards/margins": 15.974685668945312, "rewards/rejected": -15.561864852905273, "step": 3250 }, { "epoch": 1.11, "learning_rate": 3.503713961979101e-07, "logits/chosen": -0.1956896185874939, "logits/rejected": 0.44514065980911255, "logps/chosen": -270.1253356933594, "logps/rejected": -606.2057495117188, "loss": 0.0434, "rewards/accuracies": 1.0, "rewards/chosen": -0.2530042231082916, "rewards/margins": 14.329150199890137, "rewards/rejected": -14.5821533203125, "step": 3260 }, { "epoch": 1.11, "learning_rate": 3.4974191111670653e-07, "logits/chosen": -0.1570592224597931, "logits/rejected": 0.32768821716308594, "logps/chosen": -291.195556640625, "logps/rejected": -756.2486572265625, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.38313955068588257, "rewards/margins": 14.428857803344727, "rewards/rejected": -14.0457181930542, "step": 3270 }, { "epoch": 1.11, "learning_rate": 3.4911242603550296e-07, "logits/chosen": -0.2582021653652191, "logits/rejected": 0.3288532495498657, "logps/chosen": -250.4126434326172, "logps/rejected": -497.032470703125, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.4659535884857178, "rewards/margins": 11.763138771057129, "rewards/rejected": -12.229090690612793, "step": 3280 }, { "epoch": 1.12, "learning_rate": 3.484829409542994e-07, "logits/chosen": -0.20826244354248047, "logits/rejected": 0.4146566390991211, "logps/chosen": -318.6023864746094, "logps/rejected": -566.1416625976562, "loss": 0.0179, "rewards/accuracies": 1.0, "rewards/chosen": 0.28231120109558105, "rewards/margins": 15.425786972045898, "rewards/rejected": -15.143476486206055, "step": 3290 }, { "epoch": 1.12, "learning_rate": 3.478534558730958e-07, "logits/chosen": -0.1915864646434784, "logits/rejected": 0.408102810382843, "logps/chosen": -278.49542236328125, "logps/rejected": -495.40185546875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -0.08867375552654266, "rewards/margins": 13.412040710449219, "rewards/rejected": -13.500715255737305, "step": 3300 }, { "epoch": 1.12, "eval_logits/chosen": -0.14303095638751984, "eval_logits/rejected": 0.4721684455871582, "eval_logps/chosen": -272.3131408691406, "eval_logps/rejected": -596.2191772460938, "eval_loss": 0.017895570024847984, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 0.21684670448303223, "eval_rewards/margins": 14.829092979431152, "eval_rewards/rejected": -14.612245559692383, "eval_runtime": 526.5542, "eval_samples_per_second": 18.042, "eval_steps_per_second": 0.564, "step": 3300 }, { "epoch": 1.13, "learning_rate": 3.4722397079189217e-07, "logits/chosen": -0.09167354553937912, "logits/rejected": 0.3425484597682953, "logps/chosen": -222.25912475585938, "logps/rejected": -626.326904296875, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": 0.144133061170578, "rewards/margins": 15.5638427734375, "rewards/rejected": -15.419710159301758, "step": 3310 }, { "epoch": 1.13, "learning_rate": 3.4659448571068865e-07, "logits/chosen": -0.15995065867900848, "logits/rejected": 0.4081358015537262, "logps/chosen": -388.7671813964844, "logps/rejected": -588.749267578125, "loss": 0.0172, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.17335918545722961, "rewards/margins": 15.126846313476562, "rewards/rejected": -15.300204277038574, "step": 3320 }, { "epoch": 1.13, "learning_rate": 3.4596500062948507e-07, "logits/chosen": -0.2174321413040161, "logits/rejected": 0.39305639266967773, "logps/chosen": -315.2247619628906, "logps/rejected": -558.0072021484375, "loss": 0.0166, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4984303116798401, "rewards/margins": 16.15171241760254, "rewards/rejected": -15.653280258178711, "step": 3330 }, { "epoch": 1.14, "learning_rate": 3.453355155482815e-07, "logits/chosen": -0.14297278225421906, "logits/rejected": 0.40061822533607483, "logps/chosen": -276.52801513671875, "logps/rejected": -853.6500854492188, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 0.410574734210968, "rewards/margins": 14.862714767456055, "rewards/rejected": -14.452142715454102, "step": 3340 }, { "epoch": 1.14, "learning_rate": 3.447060304670779e-07, "logits/chosen": -0.15079109370708466, "logits/rejected": 0.4143487513065338, "logps/chosen": -227.1095428466797, "logps/rejected": -513.4925537109375, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 0.31720855832099915, "rewards/margins": 15.909500122070312, "rewards/rejected": -15.592290878295898, "step": 3350 }, { "epoch": 1.14, "learning_rate": 3.4407654538587434e-07, "logits/chosen": -0.19802021980285645, "logits/rejected": 0.2084237039089203, "logps/chosen": -296.99609375, "logps/rejected": -667.3692016601562, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": -0.06057851389050484, "rewards/margins": 14.032327651977539, "rewards/rejected": -14.09290599822998, "step": 3360 }, { "epoch": 1.15, "learning_rate": 3.4344706030467076e-07, "logits/chosen": -0.1091545969247818, "logits/rejected": 0.449861615896225, "logps/chosen": -220.0536651611328, "logps/rejected": -766.4942016601562, "loss": 0.0094, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.03174459934234619, "rewards/margins": 14.802316665649414, "rewards/rejected": -14.83405876159668, "step": 3370 }, { "epoch": 1.15, "learning_rate": 3.4281757522346724e-07, "logits/chosen": -0.2393231838941574, "logits/rejected": 0.3732552230358124, "logps/chosen": -343.4911804199219, "logps/rejected": -494.58087158203125, "loss": 0.0147, "rewards/accuracies": 1.0, "rewards/chosen": 0.38980475068092346, "rewards/margins": 14.358392715454102, "rewards/rejected": -13.968586921691895, "step": 3380 }, { "epoch": 1.15, "learning_rate": 3.421880901422636e-07, "logits/chosen": -0.17029327154159546, "logits/rejected": 0.30234837532043457, "logps/chosen": -269.3778076171875, "logps/rejected": -729.6644287109375, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 0.28063321113586426, "rewards/margins": 15.374956130981445, "rewards/rejected": -15.094322204589844, "step": 3390 }, { "epoch": 1.16, "learning_rate": 3.4155860506106003e-07, "logits/chosen": -0.09964896738529205, "logits/rejected": 0.282901406288147, "logps/chosen": -295.78814697265625, "logps/rejected": -619.4361572265625, "loss": 0.0323, "rewards/accuracies": 1.0, "rewards/chosen": -0.12941047549247742, "rewards/margins": 13.717930793762207, "rewards/rejected": -13.84734058380127, "step": 3400 }, { "epoch": 1.16, "eval_logits/chosen": -0.14304298162460327, "eval_logits/rejected": 0.4158560633659363, "eval_logps/chosen": -271.9459228515625, "eval_logps/rejected": -609.576171875, "eval_loss": 0.02182593382894993, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 0.25357261300086975, "eval_rewards/margins": 16.201513290405273, "eval_rewards/rejected": -15.947941780090332, "eval_runtime": 528.0953, "eval_samples_per_second": 17.989, "eval_steps_per_second": 0.562, "step": 3400 }, { "epoch": 1.16, "learning_rate": 3.4092911997985645e-07, "logits/chosen": -0.1826629638671875, "logits/rejected": 0.3074275255203247, "logps/chosen": -268.0657653808594, "logps/rejected": -627.3146362304688, "loss": 0.0327, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.049239225685596466, "rewards/margins": 15.414558410644531, "rewards/rejected": -15.463796615600586, "step": 3410 }, { "epoch": 1.16, "learning_rate": 3.402996348986529e-07, "logits/chosen": -0.09795518219470978, "logits/rejected": 0.3582213521003723, "logps/chosen": -349.23455810546875, "logps/rejected": -664.0587158203125, "loss": 0.0157, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.011978554539382458, "rewards/margins": 13.283285140991211, "rewards/rejected": -13.271306991577148, "step": 3420 }, { "epoch": 1.17, "learning_rate": 3.396701498174493e-07, "logits/chosen": -0.19625170528888702, "logits/rejected": 0.40038400888442993, "logps/chosen": -295.09588623046875, "logps/rejected": -492.6290588378906, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": 0.5425056219100952, "rewards/margins": 13.1342191696167, "rewards/rejected": -12.591713905334473, "step": 3430 }, { "epoch": 1.17, "learning_rate": 3.3904066473624577e-07, "logits/chosen": -0.13993123173713684, "logits/rejected": 0.3320801258087158, "logps/chosen": -201.84878540039062, "logps/rejected": -638.9454956054688, "loss": 0.0087, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.0918659120798111, "rewards/margins": 14.574012756347656, "rewards/rejected": -14.482149124145508, "step": 3440 }, { "epoch": 1.17, "learning_rate": 3.384111796550422e-07, "logits/chosen": -0.15443512797355652, "logits/rejected": 0.304768830537796, "logps/chosen": -220.4514617919922, "logps/rejected": -687.4869995117188, "loss": 0.0068, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.244291752576828, "rewards/margins": 14.97923469543457, "rewards/rejected": -15.223526000976562, "step": 3450 }, { "epoch": 1.18, "learning_rate": 3.377816945738386e-07, "logits/chosen": -0.1619388312101364, "logits/rejected": 0.32695573568344116, "logps/chosen": -192.61392211914062, "logps/rejected": -497.8218688964844, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": 0.18743309378623962, "rewards/margins": 13.209783554077148, "rewards/rejected": -13.022351264953613, "step": 3460 }, { "epoch": 1.18, "learning_rate": 3.37152209492635e-07, "logits/chosen": -0.1554739773273468, "logits/rejected": 0.3489275574684143, "logps/chosen": -265.12109375, "logps/rejected": -595.3714599609375, "loss": 0.0263, "rewards/accuracies": 1.0, "rewards/chosen": 0.09530983120203018, "rewards/margins": 14.287948608398438, "rewards/rejected": -14.19264030456543, "step": 3470 }, { "epoch": 1.18, "learning_rate": 3.365227244114314e-07, "logits/chosen": -0.15851840376853943, "logits/rejected": 0.14447328448295593, "logps/chosen": -205.0215606689453, "logps/rejected": -517.3046264648438, "loss": 0.0047, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.542872428894043, "rewards/margins": 13.926173210144043, "rewards/rejected": -13.38330078125, "step": 3480 }, { "epoch": 1.19, "learning_rate": 3.3589323933022783e-07, "logits/chosen": -0.22381973266601562, "logits/rejected": 0.41218647360801697, "logps/chosen": -295.12872314453125, "logps/rejected": -633.66943359375, "loss": 0.0108, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.09497400373220444, "rewards/margins": 16.886516571044922, "rewards/rejected": -16.79154396057129, "step": 3490 }, { "epoch": 1.19, "learning_rate": 3.3526375424902426e-07, "logits/chosen": -0.27511733770370483, "logits/rejected": 0.3487841486930847, "logps/chosen": -278.3988342285156, "logps/rejected": -552.4881591796875, "loss": 0.0232, "rewards/accuracies": 1.0, "rewards/chosen": 0.17403404414653778, "rewards/margins": 14.08287239074707, "rewards/rejected": -13.908838272094727, "step": 3500 }, { "epoch": 1.19, "eval_logits/chosen": -0.1409633308649063, "eval_logits/rejected": 0.4597848653793335, "eval_logps/chosen": -270.2084045410156, "eval_logps/rejected": -592.4003295898438, "eval_loss": 0.017061512917280197, "eval_rewards/accuracies": 0.9957912564277649, "eval_rewards/chosen": 0.42732003331184387, "eval_rewards/margins": 14.657685279846191, "eval_rewards/rejected": -14.230364799499512, "eval_runtime": 528.2137, "eval_samples_per_second": 17.985, "eval_steps_per_second": 0.562, "step": 3500 }, { "epoch": 1.19, "learning_rate": 3.3463426916782073e-07, "logits/chosen": -0.12065289914608002, "logits/rejected": 0.3313377797603607, "logps/chosen": -258.06561279296875, "logps/rejected": -474.4727478027344, "loss": 0.0057, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.22961728274822235, "rewards/margins": 13.486015319824219, "rewards/rejected": -13.256396293640137, "step": 3510 }, { "epoch": 1.2, "learning_rate": 3.3400478408661716e-07, "logits/chosen": -0.26255255937576294, "logits/rejected": 0.36281904578208923, "logps/chosen": -224.4401397705078, "logps/rejected": -404.94830322265625, "loss": 0.0044, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.9341962933540344, "rewards/margins": 15.84864616394043, "rewards/rejected": -14.914449691772461, "step": 3520 }, { "epoch": 1.2, "learning_rate": 3.333752990054136e-07, "logits/chosen": -0.15393109619617462, "logits/rejected": 0.28830626606941223, "logps/chosen": -266.291748046875, "logps/rejected": -684.2716064453125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 0.8597735166549683, "rewards/margins": 14.795564651489258, "rewards/rejected": -13.935789108276367, "step": 3530 }, { "epoch": 1.2, "learning_rate": 3.3274581392420995e-07, "logits/chosen": -0.19804079830646515, "logits/rejected": 0.31266260147094727, "logps/chosen": -200.33416748046875, "logps/rejected": -468.512451171875, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 0.5385676622390747, "rewards/margins": 15.956387519836426, "rewards/rejected": -15.417819023132324, "step": 3540 }, { "epoch": 1.21, "learning_rate": 3.3211632884300637e-07, "logits/chosen": -0.21335545182228088, "logits/rejected": 0.5758942365646362, "logps/chosen": -314.9797058105469, "logps/rejected": -584.1978759765625, "loss": 0.0057, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.31985849142074585, "rewards/margins": 14.214152336120605, "rewards/rejected": -13.894294738769531, "step": 3550 }, { "epoch": 1.21, "learning_rate": 3.314868437618028e-07, "logits/chosen": -0.21948771178722382, "logits/rejected": 0.429076611995697, "logps/chosen": -349.6432189941406, "logps/rejected": -570.4706420898438, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.16393320262432098, "rewards/margins": 13.862060546875, "rewards/rejected": -13.698125839233398, "step": 3560 }, { "epoch": 1.21, "learning_rate": 3.3085735868059927e-07, "logits/chosen": -0.24136993288993835, "logits/rejected": 0.4326243996620178, "logps/chosen": -410.6064453125, "logps/rejected": -462.531005859375, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 0.24709634482860565, "rewards/margins": 14.8192720413208, "rewards/rejected": -14.572174072265625, "step": 3570 }, { "epoch": 1.22, "learning_rate": 3.302278735993957e-07, "logits/chosen": -0.11237607896327972, "logits/rejected": 0.3745260238647461, "logps/chosen": -324.9195556640625, "logps/rejected": -766.3726196289062, "loss": 0.0111, "rewards/accuracies": 1.0, "rewards/chosen": 0.06317238509654999, "rewards/margins": 16.509471893310547, "rewards/rejected": -16.446300506591797, "step": 3580 }, { "epoch": 1.22, "learning_rate": 3.295983885181921e-07, "logits/chosen": -0.18865501880645752, "logits/rejected": 0.2601153254508972, "logps/chosen": -435.29144287109375, "logps/rejected": -739.7227783203125, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": 0.7317277193069458, "rewards/margins": 14.382043838500977, "rewards/rejected": -13.65031623840332, "step": 3590 }, { "epoch": 1.22, "learning_rate": 3.2896890343698854e-07, "logits/chosen": -0.16328895092010498, "logits/rejected": 0.34126654267311096, "logps/chosen": -205.11355590820312, "logps/rejected": -556.348388671875, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": 0.27502450346946716, "rewards/margins": 14.211759567260742, "rewards/rejected": -13.936738967895508, "step": 3600 }, { "epoch": 1.22, "eval_logits/chosen": -0.15211555361747742, "eval_logits/rejected": 0.4365946054458618, "eval_logps/chosen": -270.8471984863281, "eval_logps/rejected": -601.1661376953125, "eval_loss": 0.016450628638267517, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.36344170570373535, "eval_rewards/margins": 15.470385551452637, "eval_rewards/rejected": -15.106945037841797, "eval_runtime": 528.3936, "eval_samples_per_second": 17.979, "eval_steps_per_second": 0.562, "step": 3600 }, { "epoch": 1.23, "learning_rate": 3.2833941835578496e-07, "logits/chosen": -0.14044170081615448, "logits/rejected": 0.4206857681274414, "logps/chosen": -227.84689331054688, "logps/rejected": -664.2696533203125, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 0.576287567615509, "rewards/margins": 17.07168960571289, "rewards/rejected": -16.49540138244629, "step": 3610 }, { "epoch": 1.23, "learning_rate": 3.2770993327458133e-07, "logits/chosen": -0.18667910993099213, "logits/rejected": 0.4223746359348297, "logps/chosen": -278.6806335449219, "logps/rejected": -473.71002197265625, "loss": 0.0082, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.22075721621513367, "rewards/margins": 16.31894874572754, "rewards/rejected": -16.098190307617188, "step": 3620 }, { "epoch": 1.23, "learning_rate": 3.270804481933778e-07, "logits/chosen": -0.1566249430179596, "logits/rejected": 0.4028855264186859, "logps/chosen": -216.5321807861328, "logps/rejected": -641.686279296875, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -0.014846044592559338, "rewards/margins": 14.834589004516602, "rewards/rejected": -14.849433898925781, "step": 3630 }, { "epoch": 1.24, "learning_rate": 3.2645096311217423e-07, "logits/chosen": -0.2827712297439575, "logits/rejected": 0.38314664363861084, "logps/chosen": -287.4330749511719, "logps/rejected": -406.48968505859375, "loss": 0.0092, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3858519196510315, "rewards/margins": 14.257487297058105, "rewards/rejected": -13.871635437011719, "step": 3640 }, { "epoch": 1.24, "learning_rate": 3.2582147803097065e-07, "logits/chosen": -0.17638123035430908, "logits/rejected": 0.27414146065711975, "logps/chosen": -248.9254913330078, "logps/rejected": -601.4592895507812, "loss": 0.0073, "rewards/accuracies": 1.0, "rewards/chosen": 0.6509407758712769, "rewards/margins": 15.086674690246582, "rewards/rejected": -14.4357328414917, "step": 3650 }, { "epoch": 1.24, "learning_rate": 3.251919929497671e-07, "logits/chosen": -0.2523764967918396, "logits/rejected": 0.37927982211112976, "logps/chosen": -312.2040100097656, "logps/rejected": -504.53045654296875, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 0.07258407771587372, "rewards/margins": 15.12769889831543, "rewards/rejected": -15.05511474609375, "step": 3660 }, { "epoch": 1.25, "learning_rate": 3.245625078685635e-07, "logits/chosen": -0.19265082478523254, "logits/rejected": 0.3669603168964386, "logps/chosen": -401.91241455078125, "logps/rejected": -758.1317749023438, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 0.3311590552330017, "rewards/margins": 15.484827995300293, "rewards/rejected": -15.153668403625488, "step": 3670 }, { "epoch": 1.25, "learning_rate": 3.239330227873599e-07, "logits/chosen": -0.26369500160217285, "logits/rejected": 0.38850894570350647, "logps/chosen": -249.6353302001953, "logps/rejected": -514.5643310546875, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.7192095518112183, "rewards/margins": 16.05099105834961, "rewards/rejected": -15.331782341003418, "step": 3680 }, { "epoch": 1.25, "learning_rate": 3.233035377061564e-07, "logits/chosen": -0.23389701545238495, "logits/rejected": 0.26544034481048584, "logps/chosen": -270.5085144042969, "logps/rejected": -518.8240966796875, "loss": 0.0073, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.04987448453903198, "rewards/margins": 14.862302780151367, "rewards/rejected": -14.912175178527832, "step": 3690 }, { "epoch": 1.26, "learning_rate": 3.2267405262495277e-07, "logits/chosen": -0.28265756368637085, "logits/rejected": 0.3147626519203186, "logps/chosen": -262.10137939453125, "logps/rejected": -526.5657958984375, "loss": 0.0136, "rewards/accuracies": 1.0, "rewards/chosen": -0.3934406638145447, "rewards/margins": 14.2977294921875, "rewards/rejected": -14.691167831420898, "step": 3700 }, { "epoch": 1.26, "eval_logits/chosen": -0.1679392158985138, "eval_logits/rejected": 0.45533519983291626, "eval_logps/chosen": -271.5597229003906, "eval_logps/rejected": -594.880859375, "eval_loss": 0.01755337044596672, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.29218798875808716, "eval_rewards/margins": 14.770598411560059, "eval_rewards/rejected": -14.478410720825195, "eval_runtime": 530.0647, "eval_samples_per_second": 17.922, "eval_steps_per_second": 0.56, "step": 3700 }, { "epoch": 1.26, "learning_rate": 3.220445675437492e-07, "logits/chosen": -0.15885339677333832, "logits/rejected": 0.24473007023334503, "logps/chosen": -233.7559051513672, "logps/rejected": -735.5289306640625, "loss": 0.0124, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3215694725513458, "rewards/margins": 15.537500381469727, "rewards/rejected": -15.85906982421875, "step": 3710 }, { "epoch": 1.26, "learning_rate": 3.214150824625456e-07, "logits/chosen": -0.18055549263954163, "logits/rejected": 0.36639198660850525, "logps/chosen": -216.2382049560547, "logps/rejected": -645.614013671875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.028872346505522728, "rewards/margins": 15.542221069335938, "rewards/rejected": -15.513348579406738, "step": 3720 }, { "epoch": 1.27, "learning_rate": 3.2078559738134203e-07, "logits/chosen": -0.2450266182422638, "logits/rejected": 0.23105528950691223, "logps/chosen": -337.71966552734375, "logps/rejected": -685.8253173828125, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -0.9552300572395325, "rewards/margins": 15.50323486328125, "rewards/rejected": -16.458465576171875, "step": 3730 }, { "epoch": 1.27, "learning_rate": 3.2015611230013846e-07, "logits/chosen": -0.23318926990032196, "logits/rejected": 0.3770993649959564, "logps/chosen": -222.3261260986328, "logps/rejected": -708.5399780273438, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 0.5888663530349731, "rewards/margins": 15.647760391235352, "rewards/rejected": -15.058893203735352, "step": 3740 }, { "epoch": 1.27, "learning_rate": 3.1952662721893493e-07, "logits/chosen": -0.18979564309120178, "logits/rejected": 0.32741016149520874, "logps/chosen": -197.73849487304688, "logps/rejected": -609.9541015625, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": -0.021016694605350494, "rewards/margins": 13.583696365356445, "rewards/rejected": -13.604713439941406, "step": 3750 }, { "epoch": 1.28, "learning_rate": 3.1889714213773135e-07, "logits/chosen": -0.2407466620206833, "logits/rejected": 0.2376377135515213, "logps/chosen": -265.22515869140625, "logps/rejected": -602.7027587890625, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 0.39856502413749695, "rewards/margins": 14.47114086151123, "rewards/rejected": -14.072575569152832, "step": 3760 }, { "epoch": 1.28, "learning_rate": 3.182676570565277e-07, "logits/chosen": -0.17925480008125305, "logits/rejected": 0.4106702208518982, "logps/chosen": -215.75424194335938, "logps/rejected": -590.206787109375, "loss": 0.0088, "rewards/accuracies": 1.0, "rewards/chosen": 0.4610523581504822, "rewards/margins": 16.836069107055664, "rewards/rejected": -16.375019073486328, "step": 3770 }, { "epoch": 1.28, "learning_rate": 3.1763817197532415e-07, "logits/chosen": -0.2198895961046219, "logits/rejected": 0.40587133169174194, "logps/chosen": -269.89508056640625, "logps/rejected": -551.52685546875, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 0.3670289218425751, "rewards/margins": 16.443462371826172, "rewards/rejected": -16.076431274414062, "step": 3780 }, { "epoch": 1.29, "learning_rate": 3.1700868689412057e-07, "logits/chosen": -0.24140086770057678, "logits/rejected": 0.37982696294784546, "logps/chosen": -364.250244140625, "logps/rejected": -685.4907836914062, "loss": 0.0078, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 1.1564490795135498, "rewards/margins": 17.217512130737305, "rewards/rejected": -16.061065673828125, "step": 3790 }, { "epoch": 1.29, "learning_rate": 3.16379201812917e-07, "logits/chosen": -0.14407333731651306, "logits/rejected": 0.21796922385692596, "logps/chosen": -197.7551727294922, "logps/rejected": -776.6976928710938, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.1580687016248703, "rewards/margins": 15.616865158081055, "rewards/rejected": -15.458796501159668, "step": 3800 }, { "epoch": 1.29, "eval_logits/chosen": -0.1720902919769287, "eval_logits/rejected": 0.44467589259147644, "eval_logps/chosen": -272.133056640625, "eval_logps/rejected": -599.06689453125, "eval_loss": 0.015483987517654896, "eval_rewards/accuracies": 0.996632993221283, "eval_rewards/chosen": 0.23485629260540009, "eval_rewards/margins": 15.131868362426758, "eval_rewards/rejected": -14.897013664245605, "eval_runtime": 529.8892, "eval_samples_per_second": 17.928, "eval_steps_per_second": 0.56, "step": 3800 }, { "epoch": 1.3, "learning_rate": 3.1574971673171347e-07, "logits/chosen": -0.23518407344818115, "logits/rejected": 0.48808103799819946, "logps/chosen": -343.52105712890625, "logps/rejected": -470.1806640625, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -0.10256783664226532, "rewards/margins": 16.54703712463379, "rewards/rejected": -16.64960289001465, "step": 3810 }, { "epoch": 1.3, "learning_rate": 3.151202316505099e-07, "logits/chosen": -0.13806667923927307, "logits/rejected": 0.2960960268974304, "logps/chosen": -226.56478881835938, "logps/rejected": -701.4844970703125, "loss": 0.0235, "rewards/accuracies": 1.0, "rewards/chosen": 0.3629760146141052, "rewards/margins": 17.1171817779541, "rewards/rejected": -16.754201889038086, "step": 3820 }, { "epoch": 1.3, "learning_rate": 3.144907465693063e-07, "logits/chosen": -0.18388059735298157, "logits/rejected": 0.20920582115650177, "logps/chosen": -270.31890869140625, "logps/rejected": -610.9346313476562, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -0.39998307824134827, "rewards/margins": 15.421943664550781, "rewards/rejected": -15.821925163269043, "step": 3830 }, { "epoch": 1.31, "learning_rate": 3.1386126148810274e-07, "logits/chosen": -0.16105304658412933, "logits/rejected": 0.434287965297699, "logps/chosen": -372.8267517089844, "logps/rejected": -571.6928100585938, "loss": 0.0129, "rewards/accuracies": 1.0, "rewards/chosen": -0.07323522865772247, "rewards/margins": 18.5178279876709, "rewards/rejected": -18.591060638427734, "step": 3840 }, { "epoch": 1.31, "learning_rate": 3.132317764068991e-07, "logits/chosen": -0.09611834585666656, "logits/rejected": 0.23400087654590607, "logps/chosen": -228.3934326171875, "logps/rejected": -733.98583984375, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -0.19709303975105286, "rewards/margins": 15.741836547851562, "rewards/rejected": -15.938929557800293, "step": 3850 }, { "epoch": 1.31, "learning_rate": 3.1260229132569553e-07, "logits/chosen": -0.3162058889865875, "logits/rejected": 0.35179176926612854, "logps/chosen": -338.6106262207031, "logps/rejected": -465.80035400390625, "loss": 0.0195, "rewards/accuracies": 1.0, "rewards/chosen": -0.43765395879745483, "rewards/margins": 16.71442222595215, "rewards/rejected": -17.15207862854004, "step": 3860 }, { "epoch": 1.32, "learning_rate": 3.11972806244492e-07, "logits/chosen": -0.19480642676353455, "logits/rejected": 0.30231326818466187, "logps/chosen": -284.5684814453125, "logps/rejected": -583.4805908203125, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -0.2183064967393875, "rewards/margins": 13.6595458984375, "rewards/rejected": -13.877851486206055, "step": 3870 }, { "epoch": 1.32, "learning_rate": 3.1134332116328843e-07, "logits/chosen": -0.18951399624347687, "logits/rejected": 0.31676262617111206, "logps/chosen": -261.74859619140625, "logps/rejected": -714.4825439453125, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 0.36718469858169556, "rewards/margins": 17.039648056030273, "rewards/rejected": -16.67246437072754, "step": 3880 }, { "epoch": 1.32, "learning_rate": 3.1071383608208485e-07, "logits/chosen": -0.2937713861465454, "logits/rejected": 0.38697463274002075, "logps/chosen": -318.3480529785156, "logps/rejected": -440.5936584472656, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -0.05105816200375557, "rewards/margins": 17.582950592041016, "rewards/rejected": -17.634008407592773, "step": 3890 }, { "epoch": 1.33, "learning_rate": 3.1008435100088127e-07, "logits/chosen": -0.09262891113758087, "logits/rejected": 0.33533549308776855, "logps/chosen": -226.59945678710938, "logps/rejected": -777.2349853515625, "loss": 0.01, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3744003474712372, "rewards/margins": 16.460899353027344, "rewards/rejected": -16.086498260498047, "step": 3900 }, { "epoch": 1.33, "eval_logits/chosen": -0.18338800966739655, "eval_logits/rejected": 0.41795286536216736, "eval_logps/chosen": -273.17236328125, "eval_logps/rejected": -603.717041015625, "eval_loss": 0.012337159365415573, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.13092456758022308, "eval_rewards/margins": 15.492964744567871, "eval_rewards/rejected": -15.362041473388672, "eval_runtime": 528.7197, "eval_samples_per_second": 17.968, "eval_steps_per_second": 0.562, "step": 3900 }, { "epoch": 1.33, "learning_rate": 3.094548659196777e-07, "logits/chosen": -0.15283605456352234, "logits/rejected": 0.38465410470962524, "logps/chosen": -419.2047424316406, "logps/rejected": -746.9746704101562, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 0.008219528011977673, "rewards/margins": 13.787338256835938, "rewards/rejected": -13.779119491577148, "step": 3910 }, { "epoch": 1.33, "learning_rate": 3.0882538083847407e-07, "logits/chosen": -0.22887253761291504, "logits/rejected": 0.37960878014564514, "logps/chosen": -305.2249755859375, "logps/rejected": -511.80560302734375, "loss": 0.0058, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.25815117359161377, "rewards/margins": 14.81982135772705, "rewards/rejected": -15.077972412109375, "step": 3920 }, { "epoch": 1.34, "learning_rate": 3.0819589575727054e-07, "logits/chosen": -0.32747799158096313, "logits/rejected": 0.41286468505859375, "logps/chosen": -459.1854553222656, "logps/rejected": -435.34368896484375, "loss": 0.0123, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2620457708835602, "rewards/margins": 15.596186637878418, "rewards/rejected": -15.858235359191895, "step": 3930 }, { "epoch": 1.34, "learning_rate": 3.0756641067606696e-07, "logits/chosen": -0.2114226520061493, "logits/rejected": 0.22898319363594055, "logps/chosen": -231.874267578125, "logps/rejected": -564.7235717773438, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.13716581463813782, "rewards/margins": 15.562444686889648, "rewards/rejected": -15.699609756469727, "step": 3940 }, { "epoch": 1.34, "learning_rate": 3.069369255948634e-07, "logits/chosen": -0.18351168930530548, "logits/rejected": 0.26984935998916626, "logps/chosen": -221.30477905273438, "logps/rejected": -666.3414306640625, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": 0.5785341262817383, "rewards/margins": 17.911020278930664, "rewards/rejected": -17.33248519897461, "step": 3950 }, { "epoch": 1.35, "learning_rate": 3.063074405136598e-07, "logits/chosen": -0.24071399867534637, "logits/rejected": 0.22876401245594025, "logps/chosen": -219.90347290039062, "logps/rejected": -596.7186889648438, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 0.024813424795866013, "rewards/margins": 17.983104705810547, "rewards/rejected": -17.95829200744629, "step": 3960 }, { "epoch": 1.35, "learning_rate": 3.0567795543245623e-07, "logits/chosen": -0.20096378028392792, "logits/rejected": 0.197138249874115, "logps/chosen": -285.3905334472656, "logps/rejected": -701.5247802734375, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 0.019631052389740944, "rewards/margins": 17.68076515197754, "rewards/rejected": -17.661134719848633, "step": 3970 }, { "epoch": 1.35, "learning_rate": 3.0504847035125266e-07, "logits/chosen": -0.2959366738796234, "logits/rejected": 0.36888614296913147, "logps/chosen": -317.3089904785156, "logps/rejected": -610.6212158203125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.4381635785102844, "rewards/margins": 16.686092376708984, "rewards/rejected": -16.247928619384766, "step": 3980 }, { "epoch": 1.36, "learning_rate": 3.0441898527004913e-07, "logits/chosen": -0.2693161368370056, "logits/rejected": 0.36736229062080383, "logps/chosen": -209.51611328125, "logps/rejected": -584.1768798828125, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 0.047711730003356934, "rewards/margins": 17.12515640258789, "rewards/rejected": -17.077442169189453, "step": 3990 }, { "epoch": 1.36, "learning_rate": 3.037895001888455e-07, "logits/chosen": -0.2709835171699524, "logits/rejected": 0.25317418575286865, "logps/chosen": -282.4261169433594, "logps/rejected": -571.9276123046875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.16672606766223907, "rewards/margins": 15.3230562210083, "rewards/rejected": -15.156329154968262, "step": 4000 }, { "epoch": 1.36, "eval_logits/chosen": -0.18988220393657684, "eval_logits/rejected": 0.38357996940612793, "eval_logps/chosen": -274.2541809082031, "eval_logps/rejected": -615.1339111328125, "eval_loss": 0.013239211402833462, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.022742442786693573, "eval_rewards/margins": 16.526460647583008, "eval_rewards/rejected": -16.50371742248535, "eval_runtime": 528.8899, "eval_samples_per_second": 17.962, "eval_steps_per_second": 0.562, "step": 4000 }, { "epoch": 1.36, "learning_rate": 3.031600151076419e-07, "logits/chosen": -0.2373082935810089, "logits/rejected": 0.24630069732666016, "logps/chosen": -225.5118865966797, "logps/rejected": -636.379638671875, "loss": 0.007, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.19022385776042938, "rewards/margins": 16.978271484375, "rewards/rejected": -17.16849708557129, "step": 4010 }, { "epoch": 1.37, "learning_rate": 3.0253053002643835e-07, "logits/chosen": -0.22659477591514587, "logits/rejected": 0.32924550771713257, "logps/chosen": -223.6737823486328, "logps/rejected": -638.1231079101562, "loss": 0.0046, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.27358555793762207, "rewards/margins": 18.227428436279297, "rewards/rejected": -17.95384407043457, "step": 4020 }, { "epoch": 1.37, "learning_rate": 3.0190104494523477e-07, "logits/chosen": -0.23890674114227295, "logits/rejected": 0.19467458128929138, "logps/chosen": -234.0302734375, "logps/rejected": -764.7510986328125, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -0.5679093599319458, "rewards/margins": 14.931506156921387, "rewards/rejected": -15.499417304992676, "step": 4030 }, { "epoch": 1.37, "learning_rate": 3.012715598640312e-07, "logits/chosen": -0.3242078423500061, "logits/rejected": 0.353915274143219, "logps/chosen": -326.27215576171875, "logps/rejected": -567.3423461914062, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 0.5260270237922668, "rewards/margins": 18.044193267822266, "rewards/rejected": -17.51816177368164, "step": 4040 }, { "epoch": 1.38, "learning_rate": 3.0064207478282767e-07, "logits/chosen": -0.24511408805847168, "logits/rejected": 0.34329378604888916, "logps/chosen": -347.5810852050781, "logps/rejected": -634.1834716796875, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -0.18419332802295685, "rewards/margins": 16.877849578857422, "rewards/rejected": -17.062042236328125, "step": 4050 }, { "epoch": 1.38, "learning_rate": 3.000125897016241e-07, "logits/chosen": -0.15527072548866272, "logits/rejected": 0.28012755513191223, "logps/chosen": -238.47616577148438, "logps/rejected": -592.252197265625, "loss": 0.0075, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2582128643989563, "rewards/margins": 15.374653816223145, "rewards/rejected": -15.632867813110352, "step": 4060 }, { "epoch": 1.38, "learning_rate": 2.993831046204205e-07, "logits/chosen": -0.2936265170574188, "logits/rejected": 0.3369044065475464, "logps/chosen": -378.78515625, "logps/rejected": -512.6487426757812, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 0.26306888461112976, "rewards/margins": 15.894487380981445, "rewards/rejected": -15.63141918182373, "step": 4070 }, { "epoch": 1.39, "learning_rate": 2.987536195392169e-07, "logits/chosen": -0.25324660539627075, "logits/rejected": 0.3778296113014221, "logps/chosen": -297.2909851074219, "logps/rejected": -523.3548583984375, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.7922178506851196, "rewards/margins": 16.856945037841797, "rewards/rejected": -16.06472396850586, "step": 4080 }, { "epoch": 1.39, "learning_rate": 2.981241344580133e-07, "logits/chosen": -0.2574784755706787, "logits/rejected": 0.28378060460090637, "logps/chosen": -297.7970886230469, "logps/rejected": -546.0797729492188, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.03678516298532486, "rewards/margins": 14.366331100463867, "rewards/rejected": -14.329546928405762, "step": 4090 }, { "epoch": 1.39, "learning_rate": 2.9749464937680973e-07, "logits/chosen": -0.32521432638168335, "logits/rejected": 0.23761284351348877, "logps/chosen": -233.2191619873047, "logps/rejected": -361.3418273925781, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": 0.6759541034698486, "rewards/margins": 15.453967094421387, "rewards/rejected": -14.7780122756958, "step": 4100 }, { "epoch": 1.39, "eval_logits/chosen": -0.18697935342788696, "eval_logits/rejected": 0.4086976945400238, "eval_logps/chosen": -271.8445129394531, "eval_logps/rejected": -602.20751953125, "eval_loss": 0.012223497033119202, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.26371097564697266, "eval_rewards/margins": 15.474800109863281, "eval_rewards/rejected": -15.211089134216309, "eval_runtime": 529.1409, "eval_samples_per_second": 17.954, "eval_steps_per_second": 0.561, "step": 4100 }, { "epoch": 1.4, "learning_rate": 2.968651642956062e-07, "logits/chosen": -0.17507441341876984, "logits/rejected": 0.2461117058992386, "logps/chosen": -212.32357788085938, "logps/rejected": -581.2274169921875, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -0.244314044713974, "rewards/margins": 15.668779373168945, "rewards/rejected": -15.913090705871582, "step": 4110 }, { "epoch": 1.4, "learning_rate": 2.9623567921440263e-07, "logits/chosen": -0.2512482702732086, "logits/rejected": 0.2587471604347229, "logps/chosen": -215.48098754882812, "logps/rejected": -466.307861328125, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 0.10397139936685562, "rewards/margins": 13.659551620483398, "rewards/rejected": -13.555580139160156, "step": 4120 }, { "epoch": 1.4, "learning_rate": 2.9560619413319905e-07, "logits/chosen": -0.22416651248931885, "logits/rejected": 0.2503054738044739, "logps/chosen": -444.462646484375, "logps/rejected": -730.7481689453125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.17638221383094788, "rewards/margins": 16.932003021240234, "rewards/rejected": -17.10838508605957, "step": 4130 }, { "epoch": 1.41, "learning_rate": 2.9497670905199547e-07, "logits/chosen": -0.24734529852867126, "logits/rejected": 0.25456804037094116, "logps/chosen": -433.63739013671875, "logps/rejected": -641.03369140625, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.3881835639476776, "rewards/margins": 16.885908126831055, "rewards/rejected": -16.497724533081055, "step": 4140 }, { "epoch": 1.41, "learning_rate": 2.9434722397079184e-07, "logits/chosen": -0.2244756519794464, "logits/rejected": 0.2791951596736908, "logps/chosen": -311.4923400878906, "logps/rejected": -801.0186767578125, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.032573211938142776, "rewards/margins": 16.866619110107422, "rewards/rejected": -16.834047317504883, "step": 4150 }, { "epoch": 1.41, "learning_rate": 2.9371773888958827e-07, "logits/chosen": -0.20708689093589783, "logits/rejected": 0.3253230154514313, "logps/chosen": -234.5973663330078, "logps/rejected": -599.05810546875, "loss": 0.0077, "rewards/accuracies": 1.0, "rewards/chosen": 0.15446926653385162, "rewards/margins": 18.027141571044922, "rewards/rejected": -17.872671127319336, "step": 4160 }, { "epoch": 1.42, "learning_rate": 2.9308825380838474e-07, "logits/chosen": -0.23652204871177673, "logits/rejected": 0.32536113262176514, "logps/chosen": -301.27740478515625, "logps/rejected": -528.7940673828125, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": 0.20321667194366455, "rewards/margins": 17.8907470703125, "rewards/rejected": -17.687532424926758, "step": 4170 }, { "epoch": 1.42, "learning_rate": 2.9245876872718116e-07, "logits/chosen": -0.2608851194381714, "logits/rejected": 0.2631388306617737, "logps/chosen": -276.1244812011719, "logps/rejected": -583.9542236328125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 0.3372761905193329, "rewards/margins": 16.133317947387695, "rewards/rejected": -15.796038627624512, "step": 4180 }, { "epoch": 1.42, "learning_rate": 2.918292836459776e-07, "logits/chosen": -0.14426776766777039, "logits/rejected": 0.1536015421152115, "logps/chosen": -257.0679626464844, "logps/rejected": -820.3624267578125, "loss": 0.0148, "rewards/accuracies": 1.0, "rewards/chosen": 0.1765262335538864, "rewards/margins": 17.629920959472656, "rewards/rejected": -17.45339584350586, "step": 4190 }, { "epoch": 1.43, "learning_rate": 2.91199798564774e-07, "logits/chosen": -0.15764063596725464, "logits/rejected": 0.2566733956336975, "logps/chosen": -203.2073211669922, "logps/rejected": -575.6087646484375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.1717136800289154, "rewards/margins": 15.351728439331055, "rewards/rejected": -15.523442268371582, "step": 4200 }, { "epoch": 1.43, "eval_logits/chosen": -0.15178489685058594, "eval_logits/rejected": 0.40855103731155396, "eval_logps/chosen": -272.8890686035156, "eval_logps/rejected": -615.5074462890625, "eval_loss": 0.011744743213057518, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.15925489366054535, "eval_rewards/margins": 16.7003231048584, "eval_rewards/rejected": -16.54106903076172, "eval_runtime": 528.984, "eval_samples_per_second": 17.959, "eval_steps_per_second": 0.561, "step": 4200 }, { "epoch": 1.43, "learning_rate": 2.9057031348357043e-07, "logits/chosen": -0.10736264288425446, "logits/rejected": 0.18843689560890198, "logps/chosen": -217.3965301513672, "logps/rejected": -752.2833862304688, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": 0.49614453315734863, "rewards/margins": 17.258535385131836, "rewards/rejected": -16.762392044067383, "step": 4210 }, { "epoch": 1.43, "learning_rate": 2.8994082840236686e-07, "logits/chosen": -0.21540586650371552, "logits/rejected": 0.2235790193080902, "logps/chosen": -344.9656066894531, "logps/rejected": -683.6705322265625, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.06039924547076225, "rewards/margins": 17.048391342163086, "rewards/rejected": -17.10879135131836, "step": 4220 }, { "epoch": 1.44, "learning_rate": 2.893113433211632e-07, "logits/chosen": -0.21119225025177002, "logits/rejected": 0.27178964018821716, "logps/chosen": -238.1674041748047, "logps/rejected": -524.1343994140625, "loss": 0.0116, "rewards/accuracies": 1.0, "rewards/chosen": 0.24927306175231934, "rewards/margins": 16.510616302490234, "rewards/rejected": -16.261341094970703, "step": 4230 }, { "epoch": 1.44, "learning_rate": 2.886818582399597e-07, "logits/chosen": -0.2197844237089157, "logits/rejected": 0.34832561016082764, "logps/chosen": -284.6922302246094, "logps/rejected": -485.92047119140625, "loss": 0.0051, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.5371509194374084, "rewards/margins": 15.7072172164917, "rewards/rejected": -16.244369506835938, "step": 4240 }, { "epoch": 1.44, "learning_rate": 2.880523731587561e-07, "logits/chosen": -0.17311552166938782, "logits/rejected": 0.19032004475593567, "logps/chosen": -398.7367248535156, "logps/rejected": -975.7136840820312, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.19422271847724915, "rewards/margins": 16.49168586730957, "rewards/rejected": -16.297462463378906, "step": 4250 }, { "epoch": 1.45, "learning_rate": 2.8742288807755255e-07, "logits/chosen": -0.16319172084331512, "logits/rejected": 0.13500218093395233, "logps/chosen": -276.0093688964844, "logps/rejected": -756.6503295898438, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 0.36778244376182556, "rewards/margins": 16.420482635498047, "rewards/rejected": -16.05270004272461, "step": 4260 }, { "epoch": 1.45, "learning_rate": 2.8679340299634897e-07, "logits/chosen": -0.03200678154826164, "logits/rejected": 0.2605624794960022, "logps/chosen": -212.0287628173828, "logps/rejected": -947.9996948242188, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 0.49220848083496094, "rewards/margins": 15.738177299499512, "rewards/rejected": -15.24596881866455, "step": 4270 }, { "epoch": 1.45, "learning_rate": 2.861639179151454e-07, "logits/chosen": -0.2447376698255539, "logits/rejected": 0.3794545531272888, "logps/chosen": -232.6941375732422, "logps/rejected": -619.48095703125, "loss": 0.012, "rewards/accuracies": 1.0, "rewards/chosen": -0.2102927714586258, "rewards/margins": 17.37118911743164, "rewards/rejected": -17.581483840942383, "step": 4280 }, { "epoch": 1.46, "learning_rate": 2.855344328339418e-07, "logits/chosen": -0.16824063658714294, "logits/rejected": 0.35942238569259644, "logps/chosen": -215.51803588867188, "logps/rejected": -647.6165771484375, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 0.14137548208236694, "rewards/margins": 17.527124404907227, "rewards/rejected": -17.3857479095459, "step": 4290 }, { "epoch": 1.46, "learning_rate": 2.849049477527383e-07, "logits/chosen": -0.28347688913345337, "logits/rejected": 0.34930890798568726, "logps/chosen": -226.5618438720703, "logps/rejected": -457.2098693847656, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 0.5425971150398254, "rewards/margins": 18.027463912963867, "rewards/rejected": -17.4848690032959, "step": 4300 }, { "epoch": 1.46, "eval_logits/chosen": -0.17819282412528992, "eval_logits/rejected": 0.37222063541412354, "eval_logps/chosen": -275.2962646484375, "eval_logps/rejected": -624.9092407226562, "eval_loss": 0.01125518698245287, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.08146754652261734, "eval_rewards/margins": 17.399789810180664, "eval_rewards/rejected": -17.481258392333984, "eval_runtime": 527.6516, "eval_samples_per_second": 18.004, "eval_steps_per_second": 0.563, "step": 4300 }, { "epoch": 1.46, "learning_rate": 2.8427546267153466e-07, "logits/chosen": -0.13541218638420105, "logits/rejected": 0.25646549463272095, "logps/chosen": -235.5537872314453, "logps/rejected": -722.388427734375, "loss": 0.0163, "rewards/accuracies": 1.0, "rewards/chosen": -0.6693282127380371, "rewards/margins": 16.75033187866211, "rewards/rejected": -17.419658660888672, "step": 4310 }, { "epoch": 1.47, "learning_rate": 2.836459775903311e-07, "logits/chosen": -0.25134673714637756, "logits/rejected": 0.3408077657222748, "logps/chosen": -333.66485595703125, "logps/rejected": -483.9823303222656, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -0.04942737892270088, "rewards/margins": 16.056825637817383, "rewards/rejected": -16.106250762939453, "step": 4320 }, { "epoch": 1.47, "learning_rate": 2.830164925091275e-07, "logits/chosen": -0.19782884418964386, "logits/rejected": 0.32758602499961853, "logps/chosen": -223.3359375, "logps/rejected": -596.4703369140625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.12522730231285095, "rewards/margins": 18.539627075195312, "rewards/rejected": -18.664852142333984, "step": 4330 }, { "epoch": 1.48, "learning_rate": 2.8238700742792393e-07, "logits/chosen": -0.25615859031677246, "logits/rejected": 0.33938831090927124, "logps/chosen": -277.4320373535156, "logps/rejected": -445.38104248046875, "loss": 0.0069, "rewards/accuracies": 1.0, "rewards/chosen": 0.32226598262786865, "rewards/margins": 17.16183853149414, "rewards/rejected": -16.839570999145508, "step": 4340 }, { "epoch": 1.48, "learning_rate": 2.8175752234672035e-07, "logits/chosen": -0.32644081115722656, "logits/rejected": 0.4200917184352875, "logps/chosen": -341.0304260253906, "logps/rejected": -543.3197021484375, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": 0.2821505665779114, "rewards/margins": 15.443603515625, "rewards/rejected": -15.161453247070312, "step": 4350 }, { "epoch": 1.48, "learning_rate": 2.8112803726551683e-07, "logits/chosen": -0.28735944628715515, "logits/rejected": 0.2743311822414398, "logps/chosen": -366.29925537109375, "logps/rejected": -397.9869384765625, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.21798062324523926, "rewards/margins": 14.375032424926758, "rewards/rejected": -14.59301471710205, "step": 4360 }, { "epoch": 1.49, "learning_rate": 2.8049855218431325e-07, "logits/chosen": -0.29170936346054077, "logits/rejected": 0.1908983290195465, "logps/chosen": -225.93948364257812, "logps/rejected": -501.26031494140625, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 0.4945368766784668, "rewards/margins": 14.903242111206055, "rewards/rejected": -14.408706665039062, "step": 4370 }, { "epoch": 1.49, "learning_rate": 2.7986906710310967e-07, "logits/chosen": -0.14978908002376556, "logits/rejected": 0.29089677333831787, "logps/chosen": -294.646728515625, "logps/rejected": -882.6041259765625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": 0.034731097519397736, "rewards/margins": 16.13250160217285, "rewards/rejected": -16.09777069091797, "step": 4380 }, { "epoch": 1.49, "learning_rate": 2.7923958202190604e-07, "logits/chosen": -0.10451877117156982, "logits/rejected": 0.2823556661605835, "logps/chosen": -208.64505004882812, "logps/rejected": -907.1721801757812, "loss": 0.0098, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.37157416343688965, "rewards/margins": 18.227693557739258, "rewards/rejected": -18.599267959594727, "step": 4390 }, { "epoch": 1.5, "learning_rate": 2.7861009694070247e-07, "logits/chosen": -0.29618895053863525, "logits/rejected": 0.35425060987472534, "logps/chosen": -315.30010986328125, "logps/rejected": -472.050537109375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -0.023552346974611282, "rewards/margins": 17.247730255126953, "rewards/rejected": -17.27128028869629, "step": 4400 }, { "epoch": 1.5, "eval_logits/chosen": -0.19750729203224182, "eval_logits/rejected": 0.3445335626602173, "eval_logps/chosen": -272.4309997558594, "eval_logps/rejected": -624.4055786132812, "eval_loss": 0.011451439931988716, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.20506168901920319, "eval_rewards/margins": 17.635944366455078, "eval_rewards/rejected": -17.430885314941406, "eval_runtime": 528.0131, "eval_samples_per_second": 17.992, "eval_steps_per_second": 0.562, "step": 4400 }, { "epoch": 1.5, "learning_rate": 2.779806118594989e-07, "logits/chosen": -0.199708953499794, "logits/rejected": 0.23678787052631378, "logps/chosen": -294.13043212890625, "logps/rejected": -739.6495361328125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.5660265684127808, "rewards/margins": 17.846559524536133, "rewards/rejected": -17.280532836914062, "step": 4410 }, { "epoch": 1.5, "learning_rate": 2.7735112677829536e-07, "logits/chosen": -0.23684322834014893, "logits/rejected": 0.1837673783302307, "logps/chosen": -208.8436279296875, "logps/rejected": -584.4861450195312, "loss": 0.0764, "rewards/accuracies": 1.0, "rewards/chosen": 0.37673476338386536, "rewards/margins": 16.30198097229004, "rewards/rejected": -15.925247192382812, "step": 4420 }, { "epoch": 1.51, "learning_rate": 2.767216416970918e-07, "logits/chosen": -0.24971072375774384, "logits/rejected": 0.3504873216152191, "logps/chosen": -209.44046020507812, "logps/rejected": -460.20135498046875, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 0.19055530428886414, "rewards/margins": 15.769549369812012, "rewards/rejected": -15.578994750976562, "step": 4430 }, { "epoch": 1.51, "learning_rate": 2.760921566158882e-07, "logits/chosen": -0.14951011538505554, "logits/rejected": 0.2989422082901001, "logps/chosen": -270.6300048828125, "logps/rejected": -592.0036010742188, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.06859159469604492, "rewards/margins": 14.246310234069824, "rewards/rejected": -14.177717208862305, "step": 4440 }, { "epoch": 1.51, "learning_rate": 2.7546267153468463e-07, "logits/chosen": -0.2669655978679657, "logits/rejected": 0.34664806723594666, "logps/chosen": -445.8868103027344, "logps/rejected": -556.882568359375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 0.07703065872192383, "rewards/margins": 13.929638862609863, "rewards/rejected": -13.852605819702148, "step": 4450 }, { "epoch": 1.52, "learning_rate": 2.74833186453481e-07, "logits/chosen": -0.2339959591627121, "logits/rejected": 0.34462836384773254, "logps/chosen": -352.59423828125, "logps/rejected": -605.0172729492188, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 0.4007705748081207, "rewards/margins": 15.869169235229492, "rewards/rejected": -15.468399047851562, "step": 4460 }, { "epoch": 1.52, "learning_rate": 2.742037013722774e-07, "logits/chosen": -0.2713382840156555, "logits/rejected": 0.29998236894607544, "logps/chosen": -282.3365173339844, "logps/rejected": -639.5241088867188, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.13092830777168274, "rewards/margins": 16.42253303527832, "rewards/rejected": -16.291606903076172, "step": 4470 }, { "epoch": 1.52, "learning_rate": 2.735742162910739e-07, "logits/chosen": -0.30418092012405396, "logits/rejected": 0.3604821264743805, "logps/chosen": -310.68292236328125, "logps/rejected": -442.4320373535156, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 0.13955359160900116, "rewards/margins": 15.164472579956055, "rewards/rejected": -15.024919509887695, "step": 4480 }, { "epoch": 1.53, "learning_rate": 2.729447312098703e-07, "logits/chosen": -0.17105242609977722, "logits/rejected": 0.3524318337440491, "logps/chosen": -394.6695251464844, "logps/rejected": -670.5609130859375, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": 0.4518977105617523, "rewards/margins": 15.727327346801758, "rewards/rejected": -15.275428771972656, "step": 4490 }, { "epoch": 1.53, "learning_rate": 2.7231524612866675e-07, "logits/chosen": -0.19429433345794678, "logits/rejected": 0.38165944814682007, "logps/chosen": -222.7614288330078, "logps/rejected": -653.3795166015625, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 0.3607620596885681, "rewards/margins": 15.001808166503906, "rewards/rejected": -14.641047477722168, "step": 4500 }, { "epoch": 1.53, "eval_logits/chosen": -0.1979503333568573, "eval_logits/rejected": 0.4021304249763489, "eval_logps/chosen": -271.6719055175781, "eval_logps/rejected": -605.2649536132812, "eval_loss": 0.009791034273803234, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.2809707820415497, "eval_rewards/margins": 15.797797203063965, "eval_rewards/rejected": -15.516826629638672, "eval_runtime": 527.9967, "eval_samples_per_second": 17.993, "eval_steps_per_second": 0.563, "step": 4500 }, { "epoch": 1.53, "learning_rate": 2.7168576104746317e-07, "logits/chosen": -0.21145014464855194, "logits/rejected": 0.4209953248500824, "logps/chosen": -262.1670837402344, "logps/rejected": -677.2559204101562, "loss": 0.0202, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.28308892250061035, "rewards/margins": 17.44225311279297, "rewards/rejected": -17.15916633605957, "step": 4510 }, { "epoch": 1.54, "learning_rate": 2.710562759662596e-07, "logits/chosen": -0.22124645113945007, "logits/rejected": 0.2341385781764984, "logps/chosen": -285.5039978027344, "logps/rejected": -637.0916137695312, "loss": 0.007, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5809570550918579, "rewards/margins": 16.057004928588867, "rewards/rejected": -15.476048469543457, "step": 4520 }, { "epoch": 1.54, "learning_rate": 2.70426790885056e-07, "logits/chosen": -0.17391130328178406, "logits/rejected": 0.2289833277463913, "logps/chosen": -270.3614807128906, "logps/rejected": -798.002197265625, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.5974786877632141, "rewards/margins": 17.885028839111328, "rewards/rejected": -17.28754997253418, "step": 4530 }, { "epoch": 1.54, "learning_rate": 2.6979730580385244e-07, "logits/chosen": -0.3203919529914856, "logits/rejected": 0.28854313492774963, "logps/chosen": -355.32769775390625, "logps/rejected": -655.5968017578125, "loss": 0.0215, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.25840434432029724, "rewards/margins": 14.968040466308594, "rewards/rejected": -15.226445198059082, "step": 4540 }, { "epoch": 1.55, "learning_rate": 2.6916782072264886e-07, "logits/chosen": -0.23398566246032715, "logits/rejected": 0.2993885576725006, "logps/chosen": -272.38702392578125, "logps/rejected": -494.76947021484375, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 0.534290075302124, "rewards/margins": 16.07679557800293, "rewards/rejected": -15.542506217956543, "step": 4550 }, { "epoch": 1.55, "learning_rate": 2.685383356414453e-07, "logits/chosen": -0.25912633538246155, "logits/rejected": 0.3410409390926361, "logps/chosen": -289.5722961425781, "logps/rejected": -560.5997314453125, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": 0.009580835700035095, "rewards/margins": 18.48375129699707, "rewards/rejected": -18.47416877746582, "step": 4560 }, { "epoch": 1.55, "learning_rate": 2.679088505602417e-07, "logits/chosen": -0.19066600501537323, "logits/rejected": 0.2817936837673187, "logps/chosen": -329.889892578125, "logps/rejected": -693.7817993164062, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.0898318812251091, "rewards/margins": 16.643192291259766, "rewards/rejected": -16.55335807800293, "step": 4570 }, { "epoch": 1.56, "learning_rate": 2.6727936547903813e-07, "logits/chosen": -0.2025507241487503, "logits/rejected": 0.07465730607509613, "logps/chosen": -214.1136932373047, "logps/rejected": -643.9960327148438, "loss": 0.0126, "rewards/accuracies": 1.0, "rewards/chosen": 0.20408311486244202, "rewards/margins": 14.158143997192383, "rewards/rejected": -13.954058647155762, "step": 4580 }, { "epoch": 1.56, "learning_rate": 2.6664988039783455e-07, "logits/chosen": -0.2320069968700409, "logits/rejected": 0.3459148705005646, "logps/chosen": -209.6259765625, "logps/rejected": -465.03765869140625, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.03744647651910782, "rewards/margins": 16.02958869934082, "rewards/rejected": -15.992141723632812, "step": 4590 }, { "epoch": 1.56, "learning_rate": 2.66020395316631e-07, "logits/chosen": -0.2566419839859009, "logits/rejected": 0.4115219712257385, "logps/chosen": -211.89599609375, "logps/rejected": -498.4521484375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.15333518385887146, "rewards/margins": 15.164199829101562, "rewards/rejected": -15.317535400390625, "step": 4600 }, { "epoch": 1.56, "eval_logits/chosen": -0.17659763991832733, "eval_logits/rejected": 0.42246758937835693, "eval_logps/chosen": -271.5559997558594, "eval_logps/rejected": -604.8609619140625, "eval_loss": 0.012269796803593636, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.2925608456134796, "eval_rewards/margins": 15.76898193359375, "eval_rewards/rejected": -15.476421356201172, "eval_runtime": 527.034, "eval_samples_per_second": 18.025, "eval_steps_per_second": 0.564, "step": 4600 }, { "epoch": 1.57, "learning_rate": 2.6539091023542745e-07, "logits/chosen": -0.14315541088581085, "logits/rejected": 0.3021068871021271, "logps/chosen": -210.60659790039062, "logps/rejected": -726.88525390625, "loss": 0.0155, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.979668915271759, "rewards/margins": 17.024396896362305, "rewards/rejected": -16.044727325439453, "step": 4610 }, { "epoch": 1.57, "learning_rate": 2.647614251542238e-07, "logits/chosen": -0.15154698491096497, "logits/rejected": 0.3085904121398926, "logps/chosen": -244.5245361328125, "logps/rejected": -746.0638427734375, "loss": 0.0185, "rewards/accuracies": 1.0, "rewards/chosen": -0.25938934087753296, "rewards/margins": 16.27005958557129, "rewards/rejected": -16.52944564819336, "step": 4620 }, { "epoch": 1.57, "learning_rate": 2.6413194007302024e-07, "logits/chosen": -0.22468647360801697, "logits/rejected": 0.18161095678806305, "logps/chosen": -228.0222625732422, "logps/rejected": -618.8980712890625, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -0.024521708488464355, "rewards/margins": 14.942939758300781, "rewards/rejected": -14.967460632324219, "step": 4630 }, { "epoch": 1.58, "learning_rate": 2.6350245499181666e-07, "logits/chosen": -0.2867588400840759, "logits/rejected": 0.3163248896598816, "logps/chosen": -296.0140075683594, "logps/rejected": -612.0528564453125, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": 0.4555296003818512, "rewards/margins": 15.6041259765625, "rewards/rejected": -15.148595809936523, "step": 4640 }, { "epoch": 1.58, "learning_rate": 2.628729699106131e-07, "logits/chosen": -0.24217942357063293, "logits/rejected": 0.400942862033844, "logps/chosen": -214.73239135742188, "logps/rejected": -607.3278198242188, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.448162704706192, "rewards/margins": 16.893836975097656, "rewards/rejected": -16.445674896240234, "step": 4650 }, { "epoch": 1.58, "learning_rate": 2.6224348482940956e-07, "logits/chosen": -0.22408249974250793, "logits/rejected": 0.3466007113456726, "logps/chosen": -383.6099548339844, "logps/rejected": -703.7899780273438, "loss": 0.0203, "rewards/accuracies": 1.0, "rewards/chosen": 0.6677167415618896, "rewards/margins": 15.482014656066895, "rewards/rejected": -14.814297676086426, "step": 4660 }, { "epoch": 1.59, "learning_rate": 2.61613999748206e-07, "logits/chosen": -0.3056062161922455, "logits/rejected": 0.48152846097946167, "logps/chosen": -308.5443420410156, "logps/rejected": -423.1805725097656, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.20133328437805176, "rewards/margins": 14.763463020324707, "rewards/rejected": -14.562129020690918, "step": 4670 }, { "epoch": 1.59, "learning_rate": 2.609845146670024e-07, "logits/chosen": -0.28442245721817017, "logits/rejected": 0.34398096799850464, "logps/chosen": -203.17767333984375, "logps/rejected": -443.27984619140625, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.20597729086875916, "rewards/margins": 14.40227222442627, "rewards/rejected": -14.196294784545898, "step": 4680 }, { "epoch": 1.59, "learning_rate": 2.603550295857988e-07, "logits/chosen": -0.23682594299316406, "logits/rejected": 0.2294616997241974, "logps/chosen": -230.08944702148438, "logps/rejected": -621.9158325195312, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": 0.10989930480718613, "rewards/margins": 14.301159858703613, "rewards/rejected": -14.191258430480957, "step": 4690 }, { "epoch": 1.6, "learning_rate": 2.597255445045952e-07, "logits/chosen": -0.2888338267803192, "logits/rejected": 0.4640123248100281, "logps/chosen": -334.68890380859375, "logps/rejected": -467.4002990722656, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -0.2218518704175949, "rewards/margins": 15.025634765625, "rewards/rejected": -15.247485160827637, "step": 4700 }, { "epoch": 1.6, "eval_logits/chosen": -0.20259489119052887, "eval_logits/rejected": 0.4067513942718506, "eval_logps/chosen": -271.2900085449219, "eval_logps/rejected": -598.4222412109375, "eval_loss": 0.012475229799747467, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.3191603124141693, "eval_rewards/margins": 15.151710510253906, "eval_rewards/rejected": -14.832549095153809, "eval_runtime": 527.9768, "eval_samples_per_second": 17.993, "eval_steps_per_second": 0.563, "step": 4700 }, { "epoch": 1.6, "learning_rate": 2.590960594233916e-07, "logits/chosen": -0.21099357306957245, "logits/rejected": 0.4108428955078125, "logps/chosen": -240.6837921142578, "logps/rejected": -602.33837890625, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.2833942472934723, "rewards/margins": 14.436793327331543, "rewards/rejected": -14.153399467468262, "step": 4710 }, { "epoch": 1.6, "learning_rate": 2.584665743421881e-07, "logits/chosen": -0.2661481499671936, "logits/rejected": 0.2770903706550598, "logps/chosen": -223.4495391845703, "logps/rejected": -450.3095703125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.5875486731529236, "rewards/margins": 16.70699691772461, "rewards/rejected": -16.119447708129883, "step": 4720 }, { "epoch": 1.61, "learning_rate": 2.578370892609845e-07, "logits/chosen": -0.33346670866012573, "logits/rejected": 0.3698267638683319, "logps/chosen": -271.51214599609375, "logps/rejected": -446.98883056640625, "loss": 0.0059, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.07609344273805618, "rewards/margins": 14.60930347442627, "rewards/rejected": -14.685399055480957, "step": 4730 }, { "epoch": 1.61, "learning_rate": 2.5720760417978095e-07, "logits/chosen": -0.2538115382194519, "logits/rejected": 0.3549404740333557, "logps/chosen": -312.93060302734375, "logps/rejected": -567.3137817382812, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.06588401645421982, "rewards/margins": 14.448613166809082, "rewards/rejected": -14.382726669311523, "step": 4740 }, { "epoch": 1.61, "learning_rate": 2.5657811909857737e-07, "logits/chosen": -0.29633599519729614, "logits/rejected": 0.2919926345348358, "logps/chosen": -304.09637451171875, "logps/rejected": -520.0494995117188, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 0.1872343271970749, "rewards/margins": 13.405220031738281, "rewards/rejected": -13.217985153198242, "step": 4750 }, { "epoch": 1.62, "learning_rate": 2.559486340173738e-07, "logits/chosen": -0.2086435854434967, "logits/rejected": 0.3460771441459656, "logps/chosen": -221.3948211669922, "logps/rejected": -483.939208984375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.4714789390563965, "rewards/margins": 15.110015869140625, "rewards/rejected": -14.638537406921387, "step": 4760 }, { "epoch": 1.62, "learning_rate": 2.5531914893617016e-07, "logits/chosen": -0.1618521809577942, "logits/rejected": 0.29943743348121643, "logps/chosen": -224.4468994140625, "logps/rejected": -839.2628173828125, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -0.21037332713603973, "rewards/margins": 15.711776733398438, "rewards/rejected": -15.922147750854492, "step": 4770 }, { "epoch": 1.62, "learning_rate": 2.5468966385496664e-07, "logits/chosen": -0.28031715750694275, "logits/rejected": 0.28640881180763245, "logps/chosen": -220.4100341796875, "logps/rejected": -413.3926696777344, "loss": 0.0056, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21526245772838593, "rewards/margins": 14.668710708618164, "rewards/rejected": -14.453447341918945, "step": 4780 }, { "epoch": 1.63, "learning_rate": 2.5406017877376306e-07, "logits/chosen": -0.30295294523239136, "logits/rejected": 0.28319600224494934, "logps/chosen": -262.92974853515625, "logps/rejected": -542.721923828125, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.37117844820022583, "rewards/margins": 15.564549446105957, "rewards/rejected": -15.935728073120117, "step": 4790 }, { "epoch": 1.63, "learning_rate": 2.534306936925595e-07, "logits/chosen": -0.33107519149780273, "logits/rejected": 0.35467594861984253, "logps/chosen": -286.2915344238281, "logps/rejected": -574.7144775390625, "loss": 0.0079, "rewards/accuracies": 1.0, "rewards/chosen": 0.06421319395303726, "rewards/margins": 16.75643539428711, "rewards/rejected": -16.692218780517578, "step": 4800 }, { "epoch": 1.63, "eval_logits/chosen": -0.2088957279920578, "eval_logits/rejected": 0.39422333240509033, "eval_logps/chosen": -273.0765380859375, "eval_logps/rejected": -601.6903686523438, "eval_loss": 0.012430666014552116, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.14050839841365814, "eval_rewards/margins": 15.299872398376465, "eval_rewards/rejected": -15.159363746643066, "eval_runtime": 527.1317, "eval_samples_per_second": 18.022, "eval_steps_per_second": 0.563, "step": 4800 }, { "epoch": 1.63, "learning_rate": 2.528012086113559e-07, "logits/chosen": -0.22002315521240234, "logits/rejected": 0.26010748744010925, "logps/chosen": -229.3431854248047, "logps/rejected": -763.7672729492188, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.2652167081832886, "rewards/margins": 16.378318786621094, "rewards/rejected": -16.113101959228516, "step": 4810 }, { "epoch": 1.64, "learning_rate": 2.5217172353015233e-07, "logits/chosen": -0.2008160650730133, "logits/rejected": 0.22270803153514862, "logps/chosen": -280.2085876464844, "logps/rejected": -664.7764892578125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.6572667956352234, "rewards/margins": 15.257092475891113, "rewards/rejected": -14.599825859069824, "step": 4820 }, { "epoch": 1.64, "learning_rate": 2.5154223844894875e-07, "logits/chosen": -0.2790294289588928, "logits/rejected": 0.4160275459289551, "logps/chosen": -307.0839538574219, "logps/rejected": -444.87249755859375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.054200269281864166, "rewards/margins": 14.834139823913574, "rewards/rejected": -14.779940605163574, "step": 4830 }, { "epoch": 1.65, "learning_rate": 2.509127533677452e-07, "logits/chosen": -0.2958906292915344, "logits/rejected": 0.2890189588069916, "logps/chosen": -288.7923889160156, "logps/rejected": -419.3224182128906, "loss": 0.0083, "rewards/accuracies": 1.0, "rewards/chosen": 0.2274448126554489, "rewards/margins": 16.487064361572266, "rewards/rejected": -16.25962257385254, "step": 4840 }, { "epoch": 1.65, "learning_rate": 2.502832682865416e-07, "logits/chosen": -0.2723850607872009, "logits/rejected": 0.29555416107177734, "logps/chosen": -295.7272644042969, "logps/rejected": -539.4978637695312, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": 0.23985135555267334, "rewards/margins": 15.059781074523926, "rewards/rejected": -14.819929122924805, "step": 4850 }, { "epoch": 1.65, "learning_rate": 2.49653783205338e-07, "logits/chosen": -0.18608307838439941, "logits/rejected": 0.31418102979660034, "logps/chosen": -216.3018798828125, "logps/rejected": -573.4141235351562, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": 0.14337901771068573, "rewards/margins": 16.980440139770508, "rewards/rejected": -16.837060928344727, "step": 4860 }, { "epoch": 1.66, "learning_rate": 2.4902429812413444e-07, "logits/chosen": -0.2207600325345993, "logits/rejected": 0.38090404868125916, "logps/chosen": -227.4120330810547, "logps/rejected": -588.0453491210938, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.08310683071613312, "rewards/margins": 16.381702423095703, "rewards/rejected": -16.298593521118164, "step": 4870 }, { "epoch": 1.66, "learning_rate": 2.4839481304293086e-07, "logits/chosen": -0.2098553627729416, "logits/rejected": 0.1850660741329193, "logps/chosen": -274.17315673828125, "logps/rejected": -650.1477661132812, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -0.0786561444401741, "rewards/margins": 15.420692443847656, "rewards/rejected": -15.499348640441895, "step": 4880 }, { "epoch": 1.66, "learning_rate": 2.477653279617273e-07, "logits/chosen": -0.1868363618850708, "logits/rejected": 0.21766746044158936, "logps/chosen": -317.53302001953125, "logps/rejected": -833.5399169921875, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -0.3290696144104004, "rewards/margins": 17.530960083007812, "rewards/rejected": -17.860029220581055, "step": 4890 }, { "epoch": 1.67, "learning_rate": 2.471358428805237e-07, "logits/chosen": -0.22821466624736786, "logits/rejected": 0.2361174076795578, "logps/chosen": -302.99383544921875, "logps/rejected": -654.7114868164062, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -0.41509729623794556, "rewards/margins": 16.12540054321289, "rewards/rejected": -16.540496826171875, "step": 4900 }, { "epoch": 1.67, "eval_logits/chosen": -0.21402065455913544, "eval_logits/rejected": 0.32938286662101746, "eval_logps/chosen": -275.8886413574219, "eval_logps/rejected": -624.8115844726562, "eval_loss": 0.010827288031578064, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": -0.14070430397987366, "eval_rewards/margins": 17.330785751342773, "eval_rewards/rejected": -17.47148895263672, "eval_runtime": 527.1189, "eval_samples_per_second": 18.023, "eval_steps_per_second": 0.563, "step": 4900 }, { "epoch": 1.67, "learning_rate": 2.4650635779932013e-07, "logits/chosen": -0.28772053122520447, "logits/rejected": 0.3310549855232239, "logps/chosen": -225.89450073242188, "logps/rejected": -507.30120849609375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.12360192835330963, "rewards/margins": 18.88557243347168, "rewards/rejected": -19.00917625427246, "step": 4910 }, { "epoch": 1.67, "learning_rate": 2.4587687271811656e-07, "logits/chosen": -0.2613827586174011, "logits/rejected": 0.16434982419013977, "logps/chosen": -238.8052520751953, "logps/rejected": -768.7489013671875, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 0.2475823163986206, "rewards/margins": 18.669038772583008, "rewards/rejected": -18.42145347595215, "step": 4920 }, { "epoch": 1.68, "learning_rate": 2.45247387636913e-07, "logits/chosen": -0.24611854553222656, "logits/rejected": 0.15252389013767242, "logps/chosen": -271.3021240234375, "logps/rejected": -576.16259765625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.2396242320537567, "rewards/margins": 16.149471282958984, "rewards/rejected": -15.9098482131958, "step": 4930 }, { "epoch": 1.68, "learning_rate": 2.446179025557094e-07, "logits/chosen": -0.27984005212783813, "logits/rejected": 0.20142945647239685, "logps/chosen": -410.6554260253906, "logps/rejected": -743.0693969726562, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -0.3968011736869812, "rewards/margins": 16.853412628173828, "rewards/rejected": -17.25021743774414, "step": 4940 }, { "epoch": 1.68, "learning_rate": 2.439884174745059e-07, "logits/chosen": -0.21833479404449463, "logits/rejected": 0.365252822637558, "logps/chosen": -289.46136474609375, "logps/rejected": -568.4913330078125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.35943013429641724, "rewards/margins": 20.069005966186523, "rewards/rejected": -20.428436279296875, "step": 4950 }, { "epoch": 1.69, "learning_rate": 2.4335893239330225e-07, "logits/chosen": -0.32104796171188354, "logits/rejected": 0.2392534464597702, "logps/chosen": -336.40704345703125, "logps/rejected": -601.685302734375, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -0.5921145081520081, "rewards/margins": 16.761905670166016, "rewards/rejected": -17.354019165039062, "step": 4960 }, { "epoch": 1.69, "learning_rate": 2.4272944731209867e-07, "logits/chosen": -0.29500317573547363, "logits/rejected": 0.1818784475326538, "logps/chosen": -292.989013671875, "logps/rejected": -573.9219360351562, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": 0.2035837173461914, "rewards/margins": 18.5906925201416, "rewards/rejected": -18.38710594177246, "step": 4970 }, { "epoch": 1.69, "learning_rate": 2.4209996223089514e-07, "logits/chosen": -0.2332979440689087, "logits/rejected": 0.16983795166015625, "logps/chosen": -288.332275390625, "logps/rejected": -596.6060791015625, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.03922147676348686, "rewards/margins": 16.202951431274414, "rewards/rejected": -16.16373062133789, "step": 4980 }, { "epoch": 1.7, "learning_rate": 2.4147047714969157e-07, "logits/chosen": -0.2545982003211975, "logits/rejected": 0.16094762086868286, "logps/chosen": -227.6043701171875, "logps/rejected": -640.76953125, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.2446214258670807, "rewards/margins": 15.290324211120605, "rewards/rejected": -15.045702934265137, "step": 4990 }, { "epoch": 1.7, "learning_rate": 2.4084099206848794e-07, "logits/chosen": -0.3321176767349243, "logits/rejected": 0.163829505443573, "logps/chosen": -219.1016845703125, "logps/rejected": -492.32464599609375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.5506273508071899, "rewards/margins": 17.125093460083008, "rewards/rejected": -16.574466705322266, "step": 5000 }, { "epoch": 1.7, "eval_logits/chosen": -0.2062416821718216, "eval_logits/rejected": 0.33607715368270874, "eval_logps/chosen": -275.0013427734375, "eval_logps/rejected": -629.650390625, "eval_loss": 0.011567746289074421, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.051973309367895126, "eval_rewards/margins": 17.903396606445312, "eval_rewards/rejected": -17.955368041992188, "eval_runtime": 527.1836, "eval_samples_per_second": 18.02, "eval_steps_per_second": 0.563, "step": 5000 }, { "epoch": 1.7, "learning_rate": 2.402115069872844e-07, "logits/chosen": -0.2897815704345703, "logits/rejected": 0.2548253834247589, "logps/chosen": -246.47314453125, "logps/rejected": -499.8934631347656, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -0.2794676721096039, "rewards/margins": 18.873273849487305, "rewards/rejected": -19.152740478515625, "step": 5010 }, { "epoch": 1.71, "learning_rate": 2.3958202190608084e-07, "logits/chosen": -0.22513000667095184, "logits/rejected": 0.3280385732650757, "logps/chosen": -295.3684997558594, "logps/rejected": -520.6778564453125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.4327625334262848, "rewards/margins": 16.545631408691406, "rewards/rejected": -16.11286735534668, "step": 5020 }, { "epoch": 1.71, "learning_rate": 2.3895253682487726e-07, "logits/chosen": -0.22265076637268066, "logits/rejected": 0.21412424743175507, "logps/chosen": -229.51889038085938, "logps/rejected": -613.8779907226562, "loss": 0.0191, "rewards/accuracies": 1.0, "rewards/chosen": 0.2781461775302887, "rewards/margins": 15.082743644714355, "rewards/rejected": -14.804597854614258, "step": 5030 }, { "epoch": 1.71, "learning_rate": 2.3832305174367368e-07, "logits/chosen": -0.25865438580513, "logits/rejected": 0.2960045635700226, "logps/chosen": -281.5153503417969, "logps/rejected": -560.0691528320312, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": 0.22121700644493103, "rewards/margins": 16.197710037231445, "rewards/rejected": -15.97649097442627, "step": 5040 }, { "epoch": 1.72, "learning_rate": 2.3769356666247008e-07, "logits/chosen": -0.24015462398529053, "logits/rejected": 0.2546975016593933, "logps/chosen": -270.96746826171875, "logps/rejected": -580.884033203125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": 0.272571861743927, "rewards/margins": 14.725293159484863, "rewards/rejected": -14.452720642089844, "step": 5050 }, { "epoch": 1.72, "learning_rate": 2.370640815812665e-07, "logits/chosen": -0.2913532853126526, "logits/rejected": 0.33612799644470215, "logps/chosen": -288.2020263671875, "logps/rejected": -623.3416137695312, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 0.3955974280834198, "rewards/margins": 17.634288787841797, "rewards/rejected": -17.238693237304688, "step": 5060 }, { "epoch": 1.72, "learning_rate": 2.3643459650006295e-07, "logits/chosen": -0.34062132239341736, "logits/rejected": 0.27215588092803955, "logps/chosen": -284.84710693359375, "logps/rejected": -400.5345458984375, "loss": 0.0165, "rewards/accuracies": 1.0, "rewards/chosen": 0.6288712024688721, "rewards/margins": 14.63945484161377, "rewards/rejected": -14.010583877563477, "step": 5070 }, { "epoch": 1.73, "learning_rate": 2.3580511141885937e-07, "logits/chosen": -0.2738853096961975, "logits/rejected": 0.3333335518836975, "logps/chosen": -277.1520080566406, "logps/rejected": -450.26806640625, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": 0.6213118433952332, "rewards/margins": 14.29711627960205, "rewards/rejected": -13.675804138183594, "step": 5080 }, { "epoch": 1.73, "learning_rate": 2.3517562633765577e-07, "logits/chosen": -0.27537304162979126, "logits/rejected": 0.28307509422302246, "logps/chosen": -203.46707153320312, "logps/rejected": -445.3966369628906, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 0.45260030031204224, "rewards/margins": 14.669706344604492, "rewards/rejected": -14.2171049118042, "step": 5090 }, { "epoch": 1.73, "learning_rate": 2.3454614125645222e-07, "logits/chosen": -0.20561501383781433, "logits/rejected": 0.26533254981040955, "logps/chosen": -249.07394409179688, "logps/rejected": -647.5260620117188, "loss": 0.0151, "rewards/accuracies": 1.0, "rewards/chosen": 0.4156990945339203, "rewards/margins": 15.301963806152344, "rewards/rejected": -14.886263847351074, "step": 5100 }, { "epoch": 1.73, "eval_logits/chosen": -0.1896672248840332, "eval_logits/rejected": 0.4233788847923279, "eval_logps/chosen": -268.7549743652344, "eval_logps/rejected": -593.5914306640625, "eval_loss": 0.011181685142219067, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.5726622343063354, "eval_rewards/margins": 14.922133445739746, "eval_rewards/rejected": -14.349471092224121, "eval_runtime": 528.0668, "eval_samples_per_second": 17.99, "eval_steps_per_second": 0.562, "step": 5100 }, { "epoch": 1.74, "learning_rate": 2.3391665617524864e-07, "logits/chosen": -0.18073321878910065, "logits/rejected": 0.18017032742500305, "logps/chosen": -341.35650634765625, "logps/rejected": -816.6260375976562, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 0.8515820503234863, "rewards/margins": 15.290278434753418, "rewards/rejected": -14.438695907592773, "step": 5110 }, { "epoch": 1.74, "learning_rate": 2.3328717109404506e-07, "logits/chosen": -0.3268122971057892, "logits/rejected": 0.465710312128067, "logps/chosen": -224.68173217773438, "logps/rejected": -486.5732421875, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": 0.7094311714172363, "rewards/margins": 16.02373695373535, "rewards/rejected": -15.314305305480957, "step": 5120 }, { "epoch": 1.74, "learning_rate": 2.3265768601284149e-07, "logits/chosen": -0.22480475902557373, "logits/rejected": 0.3461781144142151, "logps/chosen": -363.7989196777344, "logps/rejected": -621.1608276367188, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 0.3799824118614197, "rewards/margins": 14.550779342651367, "rewards/rejected": -14.170801162719727, "step": 5130 }, { "epoch": 1.75, "learning_rate": 2.320282009316379e-07, "logits/chosen": -0.21733203530311584, "logits/rejected": 0.24383404850959778, "logps/chosen": -359.83380126953125, "logps/rejected": -763.839599609375, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": 0.7539977431297302, "rewards/margins": 15.40283489227295, "rewards/rejected": -14.648836135864258, "step": 5140 }, { "epoch": 1.75, "learning_rate": 2.3139871585043433e-07, "logits/chosen": -0.2977370321750641, "logits/rejected": 0.3453931510448456, "logps/chosen": -280.06500244140625, "logps/rejected": -482.2930603027344, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.03562828153371811, "rewards/margins": 14.455141067504883, "rewards/rejected": -14.419511795043945, "step": 5150 }, { "epoch": 1.75, "learning_rate": 2.3076923076923078e-07, "logits/chosen": -0.3190616965293884, "logits/rejected": 0.368486225605011, "logps/chosen": -386.70269775390625, "logps/rejected": -563.67529296875, "loss": 0.054, "rewards/accuracies": 1.0, "rewards/chosen": 0.9160925149917603, "rewards/margins": 17.472848892211914, "rewards/rejected": -16.5567569732666, "step": 5160 }, { "epoch": 1.76, "learning_rate": 2.3013974568802718e-07, "logits/chosen": -0.2891232371330261, "logits/rejected": 0.2817436754703522, "logps/chosen": -279.6646423339844, "logps/rejected": -499.3915100097656, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.6977526545524597, "rewards/margins": 15.701227188110352, "rewards/rejected": -15.003474235534668, "step": 5170 }, { "epoch": 1.76, "learning_rate": 2.295102606068236e-07, "logits/chosen": -0.12109546363353729, "logits/rejected": 0.18052329123020172, "logps/chosen": -227.43606567382812, "logps/rejected": -852.9351806640625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.2551245391368866, "rewards/margins": 17.006755828857422, "rewards/rejected": -16.751630783081055, "step": 5180 }, { "epoch": 1.76, "learning_rate": 2.2888077552562005e-07, "logits/chosen": -0.19929683208465576, "logits/rejected": 0.35472631454467773, "logps/chosen": -225.7498321533203, "logps/rejected": -628.779296875, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.9622241854667664, "rewards/margins": 15.9389066696167, "rewards/rejected": -14.97668170928955, "step": 5190 }, { "epoch": 1.77, "learning_rate": 2.2825129044441647e-07, "logits/chosen": -0.26482826471328735, "logits/rejected": 0.3434629738330841, "logps/chosen": -210.18173217773438, "logps/rejected": -541.1041870117188, "loss": 0.0049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.03324339538812637, "rewards/margins": 15.925933837890625, "rewards/rejected": -15.959177017211914, "step": 5200 }, { "epoch": 1.77, "eval_logits/chosen": -0.20541931688785553, "eval_logits/rejected": 0.37884852290153503, "eval_logps/chosen": -271.7729797363281, "eval_logps/rejected": -607.6893310546875, "eval_loss": 0.008295748382806778, "eval_rewards/accuracies": 0.997474730014801, "eval_rewards/chosen": 0.27086204290390015, "eval_rewards/margins": 16.030126571655273, "eval_rewards/rejected": -15.759262084960938, "eval_runtime": 528.9947, "eval_samples_per_second": 17.959, "eval_steps_per_second": 0.561, "step": 5200 }, { "epoch": 1.77, "learning_rate": 2.2762180536321287e-07, "logits/chosen": -0.23312728106975555, "logits/rejected": 0.13429893553256989, "logps/chosen": -216.7541046142578, "logps/rejected": -634.6949462890625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 0.27014797925949097, "rewards/margins": 14.53984546661377, "rewards/rejected": -14.269697189331055, "step": 5210 }, { "epoch": 1.77, "learning_rate": 2.2699232028200932e-07, "logits/chosen": -0.18093737959861755, "logits/rejected": 0.14724013209342957, "logps/chosen": -288.3201599121094, "logps/rejected": -737.4957885742188, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 0.37222304940223694, "rewards/margins": 15.577410697937012, "rewards/rejected": -15.205187797546387, "step": 5220 }, { "epoch": 1.78, "learning_rate": 2.2636283520080574e-07, "logits/chosen": -0.35411718487739563, "logits/rejected": 0.40987199544906616, "logps/chosen": -336.4322509765625, "logps/rejected": -528.9947509765625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.0678262710571289, "rewards/margins": 14.709344863891602, "rewards/rejected": -14.777170181274414, "step": 5230 }, { "epoch": 1.78, "learning_rate": 2.2573335011960216e-07, "logits/chosen": -0.2540600895881653, "logits/rejected": 0.24830476939678192, "logps/chosen": -212.78915405273438, "logps/rejected": -734.7907104492188, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": 0.4834038317203522, "rewards/margins": 16.574302673339844, "rewards/rejected": -16.090900421142578, "step": 5240 }, { "epoch": 1.78, "learning_rate": 2.2510386503839856e-07, "logits/chosen": -0.20603147149085999, "logits/rejected": 0.23837938904762268, "logps/chosen": -228.5655059814453, "logps/rejected": -642.9719848632812, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 0.18447169661521912, "rewards/margins": 17.08462905883789, "rewards/rejected": -16.900157928466797, "step": 5250 }, { "epoch": 1.79, "learning_rate": 2.24474379957195e-07, "logits/chosen": -0.2627946734428406, "logits/rejected": 0.31086212396621704, "logps/chosen": -394.79638671875, "logps/rejected": -746.9334716796875, "loss": 0.0153, "rewards/accuracies": 1.0, "rewards/chosen": 0.11962447315454483, "rewards/margins": 17.41043472290039, "rewards/rejected": -17.290809631347656, "step": 5260 }, { "epoch": 1.79, "learning_rate": 2.2384489487599143e-07, "logits/chosen": -0.16932441294193268, "logits/rejected": 0.33229994773864746, "logps/chosen": -359.9416198730469, "logps/rejected": -749.4761962890625, "loss": 0.0658, "rewards/accuracies": 1.0, "rewards/chosen": 0.6766946315765381, "rewards/margins": 17.496437072753906, "rewards/rejected": -16.81974220275879, "step": 5270 }, { "epoch": 1.79, "learning_rate": 2.2321540979478783e-07, "logits/chosen": -0.18589183688163757, "logits/rejected": 0.22859080135822296, "logps/chosen": -281.2740478515625, "logps/rejected": -676.157470703125, "loss": 0.0101, "rewards/accuracies": 1.0, "rewards/chosen": -0.2821868062019348, "rewards/margins": 17.9400577545166, "rewards/rejected": -18.222244262695312, "step": 5280 }, { "epoch": 1.8, "learning_rate": 2.2258592471358428e-07, "logits/chosen": -0.20236733555793762, "logits/rejected": 0.2464982271194458, "logps/chosen": -223.578857421875, "logps/rejected": -678.2619018554688, "loss": 0.0085, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.17314372956752777, "rewards/margins": 14.922569274902344, "rewards/rejected": -15.095712661743164, "step": 5290 }, { "epoch": 1.8, "learning_rate": 2.219564396323807e-07, "logits/chosen": -0.10903950035572052, "logits/rejected": 0.2953929305076599, "logps/chosen": -218.30789184570312, "logps/rejected": -757.9998779296875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 0.00979046244174242, "rewards/margins": 16.945783615112305, "rewards/rejected": -16.93599510192871, "step": 5300 }, { "epoch": 1.8, "eval_logits/chosen": -0.19612129032611847, "eval_logits/rejected": 0.3798409104347229, "eval_logps/chosen": -272.14556884765625, "eval_logps/rejected": -613.8856201171875, "eval_loss": 0.009273181669414043, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.23360373079776764, "eval_rewards/margins": 16.61249542236328, "eval_rewards/rejected": -16.37889289855957, "eval_runtime": 527.4184, "eval_samples_per_second": 18.012, "eval_steps_per_second": 0.563, "step": 5300 }, { "epoch": 1.8, "learning_rate": 2.2132695455117712e-07, "logits/chosen": -0.3316023349761963, "logits/rejected": 0.19848781824111938, "logps/chosen": -232.0495147705078, "logps/rejected": -482.89862060546875, "loss": 0.007, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.01798144541680813, "rewards/margins": 17.791440963745117, "rewards/rejected": -17.773460388183594, "step": 5310 }, { "epoch": 1.81, "learning_rate": 2.2069746946997355e-07, "logits/chosen": -0.16779540479183197, "logits/rejected": 0.2986212968826294, "logps/chosen": -193.29454040527344, "logps/rejected": -403.9443054199219, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.021046245470643044, "rewards/margins": 16.456911087036133, "rewards/rejected": -16.435863494873047, "step": 5320 }, { "epoch": 1.81, "learning_rate": 2.2006798438876997e-07, "logits/chosen": -0.22558502852916718, "logits/rejected": 0.23283079266548157, "logps/chosen": -362.5176086425781, "logps/rejected": -726.2399291992188, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.40531760454177856, "rewards/margins": 16.2552433013916, "rewards/rejected": -16.660560607910156, "step": 5330 }, { "epoch": 1.82, "learning_rate": 2.194384993075664e-07, "logits/chosen": -0.15307076275348663, "logits/rejected": 0.2188805639743805, "logps/chosen": -203.8430633544922, "logps/rejected": -646.9580078125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.22509615123271942, "rewards/margins": 17.147891998291016, "rewards/rejected": -16.922794342041016, "step": 5340 }, { "epoch": 1.82, "learning_rate": 2.1880901422636284e-07, "logits/chosen": -0.2980901896953583, "logits/rejected": 0.3152994215488434, "logps/chosen": -218.3779754638672, "logps/rejected": -548.4962768554688, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 0.32254430651664734, "rewards/margins": 17.208433151245117, "rewards/rejected": -16.885889053344727, "step": 5350 }, { "epoch": 1.82, "learning_rate": 2.1817952914515924e-07, "logits/chosen": -0.2368878573179245, "logits/rejected": 0.15683916211128235, "logps/chosen": -213.7670135498047, "logps/rejected": -582.83056640625, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.47492972016334534, "rewards/margins": 16.29520606994629, "rewards/rejected": -15.820277214050293, "step": 5360 }, { "epoch": 1.83, "learning_rate": 2.1755004406395566e-07, "logits/chosen": -0.27801913022994995, "logits/rejected": 0.4505406320095062, "logps/chosen": -310.16357421875, "logps/rejected": -508.34881591796875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 0.6883191466331482, "rewards/margins": 20.144969940185547, "rewards/rejected": -19.456653594970703, "step": 5370 }, { "epoch": 1.83, "learning_rate": 2.169205589827521e-07, "logits/chosen": -0.24207353591918945, "logits/rejected": 0.3546573221683502, "logps/chosen": -224.4813690185547, "logps/rejected": -571.1903686523438, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": 0.31762224435806274, "rewards/margins": 19.010717391967773, "rewards/rejected": -18.69309425354004, "step": 5380 }, { "epoch": 1.83, "learning_rate": 2.1629107390154853e-07, "logits/chosen": -0.17498892545700073, "logits/rejected": 0.32275187969207764, "logps/chosen": -217.82754516601562, "logps/rejected": -704.5285034179688, "loss": 0.009, "rewards/accuracies": 1.0, "rewards/chosen": 0.11381125450134277, "rewards/margins": 18.922584533691406, "rewards/rejected": -18.808773040771484, "step": 5390 }, { "epoch": 1.84, "learning_rate": 2.1566158882034493e-07, "logits/chosen": -0.21838775277137756, "logits/rejected": 0.23731546103954315, "logps/chosen": -218.18173217773438, "logps/rejected": -456.1255798339844, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": 0.37379026412963867, "rewards/margins": 16.473407745361328, "rewards/rejected": -16.0996150970459, "step": 5400 }, { "epoch": 1.84, "eval_logits/chosen": -0.20900799334049225, "eval_logits/rejected": 0.3843803107738495, "eval_logps/chosen": -271.8638916015625, "eval_logps/rejected": -611.7510986328125, "eval_loss": 0.008570768870413303, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.26177167892456055, "eval_rewards/margins": 16.42721176147461, "eval_rewards/rejected": -16.16543960571289, "eval_runtime": 527.1124, "eval_samples_per_second": 18.023, "eval_steps_per_second": 0.563, "step": 5400 }, { "epoch": 1.84, "learning_rate": 2.1503210373914138e-07, "logits/chosen": -0.27639999985694885, "logits/rejected": 0.2629597783088684, "logps/chosen": -283.88433837890625, "logps/rejected": -734.3416137695312, "loss": 0.0054, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.3526756167411804, "rewards/margins": 15.018835067749023, "rewards/rejected": -14.666158676147461, "step": 5410 }, { "epoch": 1.84, "learning_rate": 2.144026186579378e-07, "logits/chosen": -0.3069196343421936, "logits/rejected": 0.3490968346595764, "logps/chosen": -198.52545166015625, "logps/rejected": -754.1739501953125, "loss": 0.0131, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.41662734746932983, "rewards/margins": 18.11870002746582, "rewards/rejected": -17.702072143554688, "step": 5420 }, { "epoch": 1.85, "learning_rate": 2.1377313357673422e-07, "logits/chosen": -0.20775966346263885, "logits/rejected": 0.24204035103321075, "logps/chosen": -275.33209228515625, "logps/rejected": -639.8811645507812, "loss": 0.0082, "rewards/accuracies": 1.0, "rewards/chosen": 0.06298469007015228, "rewards/margins": 15.272375106811523, "rewards/rejected": -15.209393501281738, "step": 5430 }, { "epoch": 1.85, "learning_rate": 2.1314364849553065e-07, "logits/chosen": -0.1870991289615631, "logits/rejected": 0.4029426574707031, "logps/chosen": -266.2757873535156, "logps/rejected": -536.8067626953125, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -0.19343501329421997, "rewards/margins": 15.832219123840332, "rewards/rejected": -16.025653839111328, "step": 5440 }, { "epoch": 1.85, "learning_rate": 2.1251416341432707e-07, "logits/chosen": -0.2428824007511139, "logits/rejected": 0.36825767159461975, "logps/chosen": -294.28173828125, "logps/rejected": -704.3658447265625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.3539329171180725, "rewards/margins": 17.246095657348633, "rewards/rejected": -16.892162322998047, "step": 5450 }, { "epoch": 1.86, "learning_rate": 2.118846783331235e-07, "logits/chosen": -0.2324047088623047, "logits/rejected": 0.43397340178489685, "logps/chosen": -321.74591064453125, "logps/rejected": -644.1819458007812, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 0.26601654291152954, "rewards/margins": 17.912141799926758, "rewards/rejected": -17.646121978759766, "step": 5460 }, { "epoch": 1.86, "learning_rate": 2.1125519325191994e-07, "logits/chosen": -0.1619143784046173, "logits/rejected": 0.4060164988040924, "logps/chosen": -229.3539581298828, "logps/rejected": -725.5740356445312, "loss": 0.0076, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5579464435577393, "rewards/margins": 18.396854400634766, "rewards/rejected": -17.838909149169922, "step": 5470 }, { "epoch": 1.86, "learning_rate": 2.1062570817071634e-07, "logits/chosen": -0.3166027367115021, "logits/rejected": 0.3034141957759857, "logps/chosen": -344.9189147949219, "logps/rejected": -619.9742431640625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.7150207757949829, "rewards/margins": 14.304791450500488, "rewards/rejected": -13.589770317077637, "step": 5480 }, { "epoch": 1.87, "learning_rate": 2.0999622308951276e-07, "logits/chosen": -0.2615721523761749, "logits/rejected": 0.38696208596229553, "logps/chosen": -270.4552001953125, "logps/rejected": -690.7411499023438, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.6382284164428711, "rewards/margins": 16.92736053466797, "rewards/rejected": -16.289133071899414, "step": 5490 }, { "epoch": 1.87, "learning_rate": 2.093667380083092e-07, "logits/chosen": -0.19689472019672394, "logits/rejected": 0.36273878812789917, "logps/chosen": -221.4030303955078, "logps/rejected": -549.1292724609375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.37151676416397095, "rewards/margins": 16.04450225830078, "rewards/rejected": -15.672983169555664, "step": 5500 }, { "epoch": 1.87, "eval_logits/chosen": -0.2134546935558319, "eval_logits/rejected": 0.38739654421806335, "eval_logps/chosen": -271.0378112792969, "eval_logps/rejected": -607.9134521484375, "eval_loss": 0.007940355688333511, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.3443792164325714, "eval_rewards/margins": 16.126060485839844, "eval_rewards/rejected": -15.781681060791016, "eval_runtime": 525.6532, "eval_samples_per_second": 18.073, "eval_steps_per_second": 0.565, "step": 5500 }, { "epoch": 1.87, "learning_rate": 2.087372529271056e-07, "logits/chosen": -0.2595334053039551, "logits/rejected": 0.3243879973888397, "logps/chosen": -366.8973388671875, "logps/rejected": -755.8107299804688, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": 0.7508545517921448, "rewards/margins": 17.171112060546875, "rewards/rejected": -16.420255661010742, "step": 5510 }, { "epoch": 1.88, "learning_rate": 2.0810776784590203e-07, "logits/chosen": -0.2848939299583435, "logits/rejected": 0.49537643790245056, "logps/chosen": -223.2776641845703, "logps/rejected": -588.4720458984375, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 0.2536093294620514, "rewards/margins": 16.945606231689453, "rewards/rejected": -16.69199562072754, "step": 5520 }, { "epoch": 1.88, "learning_rate": 2.0747828276469848e-07, "logits/chosen": -0.19446370005607605, "logits/rejected": 0.22513294219970703, "logps/chosen": -333.64947509765625, "logps/rejected": -752.1638793945312, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 0.3184429705142975, "rewards/margins": 14.908169746398926, "rewards/rejected": -14.589727401733398, "step": 5530 }, { "epoch": 1.88, "learning_rate": 2.068487976834949e-07, "logits/chosen": -0.23224219679832458, "logits/rejected": 0.33672937750816345, "logps/chosen": -227.2715301513672, "logps/rejected": -599.836181640625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 0.40623635053634644, "rewards/margins": 16.359764099121094, "rewards/rejected": -15.953527450561523, "step": 5540 }, { "epoch": 1.89, "learning_rate": 2.062193126022913e-07, "logits/chosen": -0.21212884783744812, "logits/rejected": 0.3291279673576355, "logps/chosen": -211.81649780273438, "logps/rejected": -712.8986206054688, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 0.019483065232634544, "rewards/margins": 16.863235473632812, "rewards/rejected": -16.84375, "step": 5550 }, { "epoch": 1.89, "learning_rate": 2.0558982752108775e-07, "logits/chosen": -0.23001182079315186, "logits/rejected": 0.39228659868240356, "logps/chosen": -238.03939819335938, "logps/rejected": -709.7716064453125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.1797041893005371, "rewards/margins": 17.386634826660156, "rewards/rejected": -17.206932067871094, "step": 5560 }, { "epoch": 1.89, "learning_rate": 2.0496034243988417e-07, "logits/chosen": -0.15934190154075623, "logits/rejected": 0.28871503472328186, "logps/chosen": -281.2515869140625, "logps/rejected": -700.445556640625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.40522265434265137, "rewards/margins": 15.102128982543945, "rewards/rejected": -15.507352828979492, "step": 5570 }, { "epoch": 1.9, "learning_rate": 2.043308573586806e-07, "logits/chosen": -0.26556357741355896, "logits/rejected": 0.29871588945388794, "logps/chosen": -349.8337097167969, "logps/rejected": -760.4382934570312, "loss": 0.0159, "rewards/accuracies": 1.0, "rewards/chosen": 0.21544304490089417, "rewards/margins": 16.09063720703125, "rewards/rejected": -15.87519359588623, "step": 5580 }, { "epoch": 1.9, "learning_rate": 2.0370137227747701e-07, "logits/chosen": -0.267770379781723, "logits/rejected": 0.28027230501174927, "logps/chosen": -293.6412353515625, "logps/rejected": -693.4312133789062, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.05495818331837654, "rewards/margins": 16.68754768371582, "rewards/rejected": -16.63258934020996, "step": 5590 }, { "epoch": 1.9, "learning_rate": 2.0307188719627344e-07, "logits/chosen": -0.3187578320503235, "logits/rejected": 0.32822805643081665, "logps/chosen": -352.2935485839844, "logps/rejected": -520.6339111328125, "loss": 0.0102, "rewards/accuracies": 1.0, "rewards/chosen": 0.15941059589385986, "rewards/margins": 14.930543899536133, "rewards/rejected": -14.771133422851562, "step": 5600 }, { "epoch": 1.9, "eval_logits/chosen": -0.2243366241455078, "eval_logits/rejected": 0.37126895785331726, "eval_logps/chosen": -271.2060852050781, "eval_logps/rejected": -607.3621826171875, "eval_loss": 0.008112690411508083, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.32755622267723083, "eval_rewards/margins": 16.05410385131836, "eval_rewards/rejected": -15.726546287536621, "eval_runtime": 527.7702, "eval_samples_per_second": 18.0, "eval_steps_per_second": 0.563, "step": 5600 }, { "epoch": 1.91, "learning_rate": 2.0244240211506986e-07, "logits/chosen": -0.3094936013221741, "logits/rejected": 0.24474425613880157, "logps/chosen": -250.4047088623047, "logps/rejected": -444.4305725097656, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.08906960487365723, "rewards/margins": 16.0728759765625, "rewards/rejected": -15.983807563781738, "step": 5610 }, { "epoch": 1.91, "learning_rate": 2.018129170338663e-07, "logits/chosen": -0.2810131907463074, "logits/rejected": 0.44150057435035706, "logps/chosen": -424.03369140625, "logps/rejected": -544.9591064453125, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 0.4597306251525879, "rewards/margins": 16.7926025390625, "rewards/rejected": -16.33287239074707, "step": 5620 }, { "epoch": 1.91, "learning_rate": 2.011834319526627e-07, "logits/chosen": -0.28891420364379883, "logits/rejected": 0.2027096301317215, "logps/chosen": -287.26898193359375, "logps/rejected": -589.1583862304688, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 0.558029294013977, "rewards/margins": 16.288555145263672, "rewards/rejected": -15.730525016784668, "step": 5630 }, { "epoch": 1.92, "learning_rate": 2.0055394687145913e-07, "logits/chosen": -0.13017983734607697, "logits/rejected": 0.13880492746829987, "logps/chosen": -285.3046569824219, "logps/rejected": -872.0685424804688, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.07373488694429398, "rewards/margins": 16.499713897705078, "rewards/rejected": -16.573450088500977, "step": 5640 }, { "epoch": 1.92, "learning_rate": 1.9992446179025558e-07, "logits/chosen": -0.17533858120441437, "logits/rejected": 0.1345931589603424, "logps/chosen": -214.3559112548828, "logps/rejected": -662.1519775390625, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -0.06298886239528656, "rewards/margins": 14.306245803833008, "rewards/rejected": -14.369234085083008, "step": 5650 }, { "epoch": 1.92, "learning_rate": 1.99294976709052e-07, "logits/chosen": -0.2773711383342743, "logits/rejected": 0.29405921697616577, "logps/chosen": -224.879638671875, "logps/rejected": -620.8670654296875, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.31945663690567017, "rewards/margins": 16.46816635131836, "rewards/rejected": -16.14870834350586, "step": 5660 }, { "epoch": 1.93, "learning_rate": 1.986654916278484e-07, "logits/chosen": -0.3291993737220764, "logits/rejected": 0.1797594428062439, "logps/chosen": -216.73208618164062, "logps/rejected": -618.1077880859375, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.24222438037395477, "rewards/margins": 16.04678726196289, "rewards/rejected": -15.804563522338867, "step": 5670 }, { "epoch": 1.93, "learning_rate": 1.9803600654664484e-07, "logits/chosen": -0.18082396686077118, "logits/rejected": 0.18466249108314514, "logps/chosen": -220.67111206054688, "logps/rejected": -849.0403442382812, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.5298973917961121, "rewards/margins": 15.39549446105957, "rewards/rejected": -14.865594863891602, "step": 5680 }, { "epoch": 1.93, "learning_rate": 1.9740652146544127e-07, "logits/chosen": -0.2862517535686493, "logits/rejected": 0.31639164686203003, "logps/chosen": -296.8106384277344, "logps/rejected": -610.2625732421875, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": 0.42867621779441833, "rewards/margins": 16.1112060546875, "rewards/rejected": -15.682531356811523, "step": 5690 }, { "epoch": 1.94, "learning_rate": 1.9677703638423766e-07, "logits/chosen": -0.3020087480545044, "logits/rejected": 0.3095568120479584, "logps/chosen": -223.4989776611328, "logps/rejected": -550.5674438476562, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": 0.1525387316942215, "rewards/margins": 18.096965789794922, "rewards/rejected": -17.944425582885742, "step": 5700 }, { "epoch": 1.94, "eval_logits/chosen": -0.24294044077396393, "eval_logits/rejected": 0.35615256428718567, "eval_logps/chosen": -270.5030212402344, "eval_logps/rejected": -608.7669677734375, "eval_loss": 0.007202023174613714, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.3978580832481384, "eval_rewards/margins": 16.264881134033203, "eval_rewards/rejected": -15.867026329040527, "eval_runtime": 527.6207, "eval_samples_per_second": 18.005, "eval_steps_per_second": 0.563, "step": 5700 }, { "epoch": 1.94, "learning_rate": 1.961475513030341e-07, "logits/chosen": -0.33480992913246155, "logits/rejected": 0.2547816336154938, "logps/chosen": -342.53411865234375, "logps/rejected": -672.2218017578125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 0.12401384115219116, "rewards/margins": 16.24891471862793, "rewards/rejected": -16.12489891052246, "step": 5710 }, { "epoch": 1.94, "learning_rate": 1.9551806622183054e-07, "logits/chosen": -0.23172907531261444, "logits/rejected": 0.21992447972297668, "logps/chosen": -213.4493865966797, "logps/rejected": -678.9910888671875, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.36775389313697815, "rewards/margins": 16.887378692626953, "rewards/rejected": -16.519624710083008, "step": 5720 }, { "epoch": 1.95, "learning_rate": 1.9488858114062696e-07, "logits/chosen": -0.35242146253585815, "logits/rejected": 0.32691460847854614, "logps/chosen": -218.6578826904297, "logps/rejected": -480.55877685546875, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": 0.8630514144897461, "rewards/margins": 16.954036712646484, "rewards/rejected": -16.090984344482422, "step": 5730 }, { "epoch": 1.95, "learning_rate": 1.9425909605942338e-07, "logits/chosen": -0.23032359778881073, "logits/rejected": 0.36701005697250366, "logps/chosen": -429.4517517089844, "logps/rejected": -620.8499145507812, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": 0.04655808210372925, "rewards/margins": 14.94971752166748, "rewards/rejected": -14.903158187866211, "step": 5740 }, { "epoch": 1.95, "learning_rate": 1.936296109782198e-07, "logits/chosen": -0.22126004099845886, "logits/rejected": 0.2644481062889099, "logps/chosen": -263.5820617675781, "logps/rejected": -866.2135009765625, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": 0.38394907116889954, "rewards/margins": 19.288494110107422, "rewards/rejected": -18.9045467376709, "step": 5750 }, { "epoch": 1.96, "learning_rate": 1.9300012589701623e-07, "logits/chosen": -0.2722070515155792, "logits/rejected": 0.11727213859558105, "logps/chosen": -302.3346862792969, "logps/rejected": -749.9793701171875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.2403169572353363, "rewards/margins": 16.197751998901367, "rewards/rejected": -15.957438468933105, "step": 5760 }, { "epoch": 1.96, "learning_rate": 1.9237064081581268e-07, "logits/chosen": -0.2854838967323303, "logits/rejected": 0.30364522337913513, "logps/chosen": -235.62368774414062, "logps/rejected": -698.3501586914062, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": 1.2268247604370117, "rewards/margins": 18.424163818359375, "rewards/rejected": -17.197338104248047, "step": 5770 }, { "epoch": 1.96, "learning_rate": 1.9174115573460907e-07, "logits/chosen": -0.26903393864631653, "logits/rejected": 0.46403464674949646, "logps/chosen": -293.4249572753906, "logps/rejected": -673.2144775390625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 0.23060627281665802, "rewards/margins": 16.756771087646484, "rewards/rejected": -16.526165008544922, "step": 5780 }, { "epoch": 1.97, "learning_rate": 1.911116706534055e-07, "logits/chosen": -0.29838284850120544, "logits/rejected": 0.43118754029273987, "logps/chosen": -376.1150207519531, "logps/rejected": -607.468505859375, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.0776040107011795, "rewards/margins": 15.346731185913086, "rewards/rejected": -15.424336433410645, "step": 5790 }, { "epoch": 1.97, "learning_rate": 1.9048218557220194e-07, "logits/chosen": -0.28796666860580444, "logits/rejected": 0.40464717149734497, "logps/chosen": -285.26611328125, "logps/rejected": -452.96453857421875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 0.04571080207824707, "rewards/margins": 16.458232879638672, "rewards/rejected": -16.412521362304688, "step": 5800 }, { "epoch": 1.97, "eval_logits/chosen": -0.2424379140138626, "eval_logits/rejected": 0.3329405188560486, "eval_logps/chosen": -271.66473388671875, "eval_logps/rejected": -617.9911499023438, "eval_loss": 0.006936150137335062, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.2816869020462036, "eval_rewards/margins": 17.071128845214844, "eval_rewards/rejected": -16.789443969726562, "eval_runtime": 528.5559, "eval_samples_per_second": 17.974, "eval_steps_per_second": 0.562, "step": 5800 }, { "epoch": 1.97, "learning_rate": 1.8985270049099837e-07, "logits/chosen": -0.16651441156864166, "logits/rejected": 0.07496734708547592, "logps/chosen": -224.93896484375, "logps/rejected": -709.8622436523438, "loss": 0.0099, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.08881906419992447, "rewards/margins": 17.243427276611328, "rewards/rejected": -17.332242965698242, "step": 5810 }, { "epoch": 1.98, "learning_rate": 1.8922321540979476e-07, "logits/chosen": -0.35579246282577515, "logits/rejected": 0.3091728091239929, "logps/chosen": -364.92608642578125, "logps/rejected": -472.3392028808594, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": 0.15512046217918396, "rewards/margins": 16.682064056396484, "rewards/rejected": -16.52694320678711, "step": 5820 }, { "epoch": 1.98, "learning_rate": 1.885937303285912e-07, "logits/chosen": -0.1523863971233368, "logits/rejected": 0.12501832842826843, "logps/chosen": -207.5217742919922, "logps/rejected": -664.1605224609375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.32198864221572876, "rewards/margins": 17.415260314941406, "rewards/rejected": -17.093273162841797, "step": 5830 }, { "epoch": 1.99, "learning_rate": 1.8796424524738764e-07, "logits/chosen": -0.2685990333557129, "logits/rejected": 0.18150705099105835, "logps/chosen": -299.9930725097656, "logps/rejected": -567.1378784179688, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.26093798875808716, "rewards/margins": 17.061798095703125, "rewards/rejected": -16.800859451293945, "step": 5840 }, { "epoch": 1.99, "learning_rate": 1.8733476016618406e-07, "logits/chosen": -0.2469438761472702, "logits/rejected": 0.2513132095336914, "logps/chosen": -272.7760925292969, "logps/rejected": -634.8369140625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.315464586019516, "rewards/margins": 17.699281692504883, "rewards/rejected": -17.38381576538086, "step": 5850 }, { "epoch": 1.99, "learning_rate": 1.8670527508498048e-07, "logits/chosen": -0.24319203197956085, "logits/rejected": 0.2661622166633606, "logps/chosen": -213.7449188232422, "logps/rejected": -829.9744873046875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": 0.1760626584291458, "rewards/margins": 19.192506790161133, "rewards/rejected": -19.016443252563477, "step": 5860 }, { "epoch": 2.0, "learning_rate": 1.860757900037769e-07, "logits/chosen": -0.2502209544181824, "logits/rejected": 0.1530526578426361, "logps/chosen": -216.0438232421875, "logps/rejected": -531.3895874023438, "loss": 0.0067, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.4344845712184906, "rewards/margins": 18.487579345703125, "rewards/rejected": -18.053096771240234, "step": 5870 }, { "epoch": 2.0, "learning_rate": 1.8544630492257333e-07, "logits/chosen": -0.21497955918312073, "logits/rejected": 0.19132864475250244, "logps/chosen": -211.25729370117188, "logps/rejected": -689.0565185546875, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": 0.145445317029953, "rewards/margins": 17.28690528869629, "rewards/rejected": -17.141456604003906, "step": 5880 }, { "epoch": 2.0, "learning_rate": 1.8481681984136978e-07, "logits/chosen": -0.31305691599845886, "logits/rejected": 0.17143578827381134, "logps/chosen": -237.39932250976562, "logps/rejected": -480.2059631347656, "loss": 0.0042, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.05644633248448372, "rewards/margins": 16.20521354675293, "rewards/rejected": -16.14876937866211, "step": 5890 }, { "epoch": 2.01, "learning_rate": 1.8418733476016617e-07, "logits/chosen": -0.32942646741867065, "logits/rejected": 0.128877654671669, "logps/chosen": -282.2909240722656, "logps/rejected": -563.0151977539062, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.2096305638551712, "rewards/margins": 16.51173973083496, "rewards/rejected": -16.30211067199707, "step": 5900 }, { "epoch": 2.01, "eval_logits/chosen": -0.24791781604290009, "eval_logits/rejected": 0.314196914434433, "eval_logps/chosen": -273.7563781738281, "eval_logps/rejected": -623.6633911132812, "eval_loss": 0.006145686376839876, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.0725221335887909, "eval_rewards/margins": 17.42918586730957, "eval_rewards/rejected": -17.356666564941406, "eval_runtime": 523.5415, "eval_samples_per_second": 18.146, "eval_steps_per_second": 0.567, "step": 5900 }, { "epoch": 2.01, "learning_rate": 1.835578496789626e-07, "logits/chosen": -0.29892697930336, "logits/rejected": 0.124337337911129, "logps/chosen": -289.169189453125, "logps/rejected": -641.1806030273438, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.13880005478858948, "rewards/margins": 15.922407150268555, "rewards/rejected": -15.783609390258789, "step": 5910 }, { "epoch": 2.01, "learning_rate": 1.8292836459775904e-07, "logits/chosen": -0.26730072498321533, "logits/rejected": 0.09826089441776276, "logps/chosen": -350.3676452636719, "logps/rejected": -805.8424072265625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.016233813017606735, "rewards/margins": 18.4133243560791, "rewards/rejected": -18.429555892944336, "step": 5920 }, { "epoch": 2.02, "learning_rate": 1.8229887951655544e-07, "logits/chosen": -0.29472970962524414, "logits/rejected": 0.18553289771080017, "logps/chosen": -203.06497192382812, "logps/rejected": -632.9088134765625, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 0.07927639782428741, "rewards/margins": 17.170421600341797, "rewards/rejected": -17.091142654418945, "step": 5930 }, { "epoch": 2.02, "learning_rate": 1.8166939443535186e-07, "logits/chosen": -0.38160768151283264, "logits/rejected": 0.18650658428668976, "logps/chosen": -303.5093078613281, "logps/rejected": -445.9251403808594, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.15304484963417053, "rewards/margins": 17.608108520507812, "rewards/rejected": -17.455066680908203, "step": 5940 }, { "epoch": 2.02, "learning_rate": 1.8103990935414829e-07, "logits/chosen": -0.2598913311958313, "logits/rejected": 0.24410715699195862, "logps/chosen": -218.79238891601562, "logps/rejected": -631.7150268554688, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.0487818717956543, "rewards/margins": 15.921966552734375, "rewards/rejected": -15.970746994018555, "step": 5950 }, { "epoch": 2.03, "learning_rate": 1.8041042427294474e-07, "logits/chosen": -0.4094265401363373, "logits/rejected": 0.15787403285503387, "logps/chosen": -283.5187683105469, "logps/rejected": -384.2884216308594, "loss": 0.0047, "rewards/accuracies": 1.0, "rewards/chosen": -0.31633272767066956, "rewards/margins": 17.737594604492188, "rewards/rejected": -18.05392837524414, "step": 5960 }, { "epoch": 2.03, "learning_rate": 1.7978093919174113e-07, "logits/chosen": -0.2012213170528412, "logits/rejected": 0.20214995741844177, "logps/chosen": -278.2190856933594, "logps/rejected": -743.1666259765625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.3700023293495178, "rewards/margins": 16.54741668701172, "rewards/rejected": -16.91741943359375, "step": 5970 }, { "epoch": 2.03, "learning_rate": 1.7915145411053755e-07, "logits/chosen": -0.2155729979276657, "logits/rejected": 0.20178446173667908, "logps/chosen": -285.4269714355469, "logps/rejected": -695.5745239257812, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.11450423300266266, "rewards/margins": 15.882418632507324, "rewards/rejected": -15.996920585632324, "step": 5980 }, { "epoch": 2.04, "learning_rate": 1.78521969029334e-07, "logits/chosen": -0.16392046213150024, "logits/rejected": 0.30007871985435486, "logps/chosen": -288.28961181640625, "logps/rejected": -782.7386474609375, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.22988276183605194, "rewards/margins": 18.197681427001953, "rewards/rejected": -18.427562713623047, "step": 5990 }, { "epoch": 2.04, "learning_rate": 1.7789248394813043e-07, "logits/chosen": -0.34768110513687134, "logits/rejected": 0.16700276732444763, "logps/chosen": -344.2459411621094, "logps/rejected": -516.8478393554688, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.1089736595749855, "rewards/margins": 19.375873565673828, "rewards/rejected": -19.266897201538086, "step": 6000 }, { "epoch": 2.04, "eval_logits/chosen": -0.2445012927055359, "eval_logits/rejected": 0.2947598695755005, "eval_logps/chosen": -274.66058349609375, "eval_logps/rejected": -632.0012817382812, "eval_loss": 0.006602023728191853, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.01789783127605915, "eval_rewards/margins": 18.17255973815918, "eval_rewards/rejected": -18.190458297729492, "eval_runtime": 524.438, "eval_samples_per_second": 18.115, "eval_steps_per_second": 0.566, "step": 6000 }, { "epoch": 2.04, "learning_rate": 1.7726299886692682e-07, "logits/chosen": -0.23072710633277893, "logits/rejected": 0.1372895985841751, "logps/chosen": -244.45614624023438, "logps/rejected": -545.9716796875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -0.23154664039611816, "rewards/margins": 16.016613006591797, "rewards/rejected": -16.248159408569336, "step": 6010 }, { "epoch": 2.05, "learning_rate": 1.7663351378572327e-07, "logits/chosen": -0.17434139549732208, "logits/rejected": 0.11416490375995636, "logps/chosen": -213.87002563476562, "logps/rejected": -847.3606567382812, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.22937551140785217, "rewards/margins": 18.93256950378418, "rewards/rejected": -19.161945343017578, "step": 6020 }, { "epoch": 2.05, "learning_rate": 1.760040287045197e-07, "logits/chosen": -0.2952490448951721, "logits/rejected": 0.20077571272850037, "logps/chosen": -426.46124267578125, "logps/rejected": -586.065185546875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.1649525910615921, "rewards/margins": 16.20620346069336, "rewards/rejected": -16.37115478515625, "step": 6030 }, { "epoch": 2.05, "learning_rate": 1.7537454362331612e-07, "logits/chosen": -0.30631059408187866, "logits/rejected": 0.16821905970573425, "logps/chosen": -275.1513671875, "logps/rejected": -577.6903076171875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.5488194823265076, "rewards/margins": 17.64291000366211, "rewards/rejected": -18.191730499267578, "step": 6040 }, { "epoch": 2.06, "learning_rate": 1.7474505854211254e-07, "logits/chosen": -0.3238984942436218, "logits/rejected": 0.35157641768455505, "logps/chosen": -415.6222229003906, "logps/rejected": -603.0684814453125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.19886530935764313, "rewards/margins": 18.552001953125, "rewards/rejected": -18.750864028930664, "step": 6050 }, { "epoch": 2.06, "learning_rate": 1.7411557346090896e-07, "logits/chosen": -0.37774115800857544, "logits/rejected": 0.27368661761283875, "logps/chosen": -269.29656982421875, "logps/rejected": -520.0762939453125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.3215358257293701, "rewards/margins": 17.221912384033203, "rewards/rejected": -17.543447494506836, "step": 6060 }, { "epoch": 2.06, "learning_rate": 1.7348608837970539e-07, "logits/chosen": -0.25069373846054077, "logits/rejected": 0.18899503350257874, "logps/chosen": -278.92498779296875, "logps/rejected": -690.9259033203125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.17887410521507263, "rewards/margins": 18.217323303222656, "rewards/rejected": -18.0384521484375, "step": 6070 }, { "epoch": 2.07, "learning_rate": 1.7285660329850184e-07, "logits/chosen": -0.2981416583061218, "logits/rejected": 0.2948748469352722, "logps/chosen": -216.1656951904297, "logps/rejected": -604.5318603515625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.3024456799030304, "rewards/margins": 20.219736099243164, "rewards/rejected": -19.91728973388672, "step": 6080 }, { "epoch": 2.07, "learning_rate": 1.7222711821729823e-07, "logits/chosen": -0.22098591923713684, "logits/rejected": 0.22349879145622253, "logps/chosen": -206.4998016357422, "logps/rejected": -739.8681030273438, "loss": 0.0049, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.00014909208402968943, "rewards/margins": 16.985790252685547, "rewards/rejected": -16.98594093322754, "step": 6090 }, { "epoch": 2.07, "learning_rate": 1.7159763313609465e-07, "logits/chosen": -0.36355119943618774, "logits/rejected": 0.24322807788848877, "logps/chosen": -294.1744689941406, "logps/rejected": -555.7070922851562, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.05815547704696655, "rewards/margins": 18.38794708251953, "rewards/rejected": -18.446104049682617, "step": 6100 }, { "epoch": 2.07, "eval_logits/chosen": -0.24244312942028046, "eval_logits/rejected": 0.30429720878601074, "eval_logps/chosen": -274.34136962890625, "eval_logps/rejected": -628.537353515625, "eval_loss": 0.006368852686136961, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.014024504460394382, "eval_rewards/margins": 17.85808753967285, "eval_rewards/rejected": -17.844064712524414, "eval_runtime": 524.8872, "eval_samples_per_second": 18.099, "eval_steps_per_second": 0.566, "step": 6100 }, { "epoch": 2.08, "learning_rate": 1.709681480548911e-07, "logits/chosen": -0.36126190423965454, "logits/rejected": 0.20376403629779816, "logps/chosen": -203.55181884765625, "logps/rejected": -507.58154296875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": 0.12046430259943008, "rewards/margins": 17.964651107788086, "rewards/rejected": -17.844188690185547, "step": 6110 }, { "epoch": 2.08, "learning_rate": 1.7033866297368753e-07, "logits/chosen": -0.2814735770225525, "logits/rejected": 0.19016481935977936, "logps/chosen": -298.61956787109375, "logps/rejected": -676.4288330078125, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.23248633742332458, "rewards/margins": 18.993560791015625, "rewards/rejected": -18.761075973510742, "step": 6120 }, { "epoch": 2.08, "learning_rate": 1.6970917789248392e-07, "logits/chosen": -0.2658081650733948, "logits/rejected": 0.1751350611448288, "logps/chosen": -254.8261260986328, "logps/rejected": -791.1603393554688, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.6687658429145813, "rewards/margins": 19.54302978515625, "rewards/rejected": -20.211795806884766, "step": 6130 }, { "epoch": 2.09, "learning_rate": 1.6907969281128037e-07, "logits/chosen": -0.3647603988647461, "logits/rejected": 0.3770098090171814, "logps/chosen": -362.87103271484375, "logps/rejected": -567.1387939453125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.7428981065750122, "rewards/margins": 17.529659271240234, "rewards/rejected": -18.27255630493164, "step": 6140 }, { "epoch": 2.09, "learning_rate": 1.684502077300768e-07, "logits/chosen": -0.2362566441297531, "logits/rejected": 0.1749507486820221, "logps/chosen": -307.07257080078125, "logps/rejected": -610.452880859375, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.18092085421085358, "rewards/margins": 18.39754295349121, "rewards/rejected": -18.57846450805664, "step": 6150 }, { "epoch": 2.09, "learning_rate": 1.678207226488732e-07, "logits/chosen": -0.314441442489624, "logits/rejected": 0.18106064200401306, "logps/chosen": -280.343994140625, "logps/rejected": -550.4921264648438, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.1464841663837433, "rewards/margins": 17.868839263916016, "rewards/rejected": -17.722354888916016, "step": 6160 }, { "epoch": 2.1, "learning_rate": 1.6719123756766964e-07, "logits/chosen": -0.2472033053636551, "logits/rejected": 0.24510344862937927, "logps/chosen": -292.866943359375, "logps/rejected": -786.6128540039062, "loss": 0.0026, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.5560672283172607, "rewards/margins": 21.542831420898438, "rewards/rejected": -20.986764907836914, "step": 6170 }, { "epoch": 2.1, "learning_rate": 1.6656175248646606e-07, "logits/chosen": -0.2800043225288391, "logits/rejected": 0.2617780566215515, "logps/chosen": -261.5422058105469, "logps/rejected": -612.5068359375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.26690882444381714, "rewards/margins": 18.93963050842285, "rewards/rejected": -19.206539154052734, "step": 6180 }, { "epoch": 2.1, "learning_rate": 1.6593226740526249e-07, "logits/chosen": -0.2366696298122406, "logits/rejected": 0.26710277795791626, "logps/chosen": -196.0789794921875, "logps/rejected": -486.6404724121094, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.18462862074375153, "rewards/margins": 19.6533145904541, "rewards/rejected": -19.468685150146484, "step": 6190 }, { "epoch": 2.11, "learning_rate": 1.653027823240589e-07, "logits/chosen": -0.22490207850933075, "logits/rejected": 0.22433826327323914, "logps/chosen": -219.13461303710938, "logps/rejected": -665.281005859375, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.669370710849762, "rewards/margins": 18.104248046875, "rewards/rejected": -18.77362060546875, "step": 6200 }, { "epoch": 2.11, "eval_logits/chosen": -0.24159939587116241, "eval_logits/rejected": 0.2862565815448761, "eval_logps/chosen": -274.26531982421875, "eval_logps/rejected": -635.9210205078125, "eval_loss": 0.006564146373420954, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.02163289487361908, "eval_rewards/margins": 18.604068756103516, "eval_rewards/rejected": -18.582435607910156, "eval_runtime": 525.9038, "eval_samples_per_second": 18.064, "eval_steps_per_second": 0.565, "step": 6200 }, { "epoch": 2.11, "learning_rate": 1.6467329724285533e-07, "logits/chosen": -0.23332974314689636, "logits/rejected": 0.204209566116333, "logps/chosen": -229.6195068359375, "logps/rejected": -735.859619140625, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": -0.47944408655166626, "rewards/margins": 16.410812377929688, "rewards/rejected": -16.890256881713867, "step": 6210 }, { "epoch": 2.11, "learning_rate": 1.6404381216165175e-07, "logits/chosen": -0.28962624073028564, "logits/rejected": 0.19206061959266663, "logps/chosen": -214.2273712158203, "logps/rejected": -564.3878173828125, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 0.4044765532016754, "rewards/margins": 18.02292823791504, "rewards/rejected": -17.61844825744629, "step": 6220 }, { "epoch": 2.12, "learning_rate": 1.634143270804482e-07, "logits/chosen": -0.2503669261932373, "logits/rejected": 0.1635022610425949, "logps/chosen": -255.6868438720703, "logps/rejected": -568.8594360351562, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.19802293181419373, "rewards/margins": 16.677457809448242, "rewards/rejected": -16.479434967041016, "step": 6230 }, { "epoch": 2.12, "learning_rate": 1.627848419992446e-07, "logits/chosen": -0.25492826104164124, "logits/rejected": 0.2414228618144989, "logps/chosen": -348.48419189453125, "logps/rejected": -862.1163940429688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.00822954811155796, "rewards/margins": 20.911083221435547, "rewards/rejected": -20.90285301208496, "step": 6240 }, { "epoch": 2.12, "learning_rate": 1.6215535691804102e-07, "logits/chosen": -0.33898821473121643, "logits/rejected": 0.21025148034095764, "logps/chosen": -270.09014892578125, "logps/rejected": -546.6973876953125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.021437710151076317, "rewards/margins": 16.47176742553711, "rewards/rejected": -16.493206024169922, "step": 6250 }, { "epoch": 2.13, "learning_rate": 1.6152587183683747e-07, "logits/chosen": -0.1625215858221054, "logits/rejected": 0.2376302182674408, "logps/chosen": -215.5438995361328, "logps/rejected": -747.602294921875, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.17036493122577667, "rewards/margins": 21.094579696655273, "rewards/rejected": -21.26494789123535, "step": 6260 }, { "epoch": 2.13, "learning_rate": 1.608963867556339e-07, "logits/chosen": -0.3158889710903168, "logits/rejected": 0.2518649697303772, "logps/chosen": -219.6953582763672, "logps/rejected": -509.873291015625, "loss": 0.0074, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2510015368461609, "rewards/margins": 17.275257110595703, "rewards/rejected": -17.526260375976562, "step": 6270 }, { "epoch": 2.13, "learning_rate": 1.602669016744303e-07, "logits/chosen": -0.2834361493587494, "logits/rejected": 0.1608031690120697, "logps/chosen": -358.74908447265625, "logps/rejected": -866.6531982421875, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": 0.23839053511619568, "rewards/margins": 17.280546188354492, "rewards/rejected": -17.042156219482422, "step": 6280 }, { "epoch": 2.14, "learning_rate": 1.5963741659322674e-07, "logits/chosen": -0.31994783878326416, "logits/rejected": 0.16533759236335754, "logps/chosen": -233.2906951904297, "logps/rejected": -727.2020263671875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.14350751042366028, "rewards/margins": 19.70914649963379, "rewards/rejected": -19.565637588500977, "step": 6290 }, { "epoch": 2.14, "learning_rate": 1.5900793151202316e-07, "logits/chosen": -0.28093427419662476, "logits/rejected": 0.22180216014385223, "logps/chosen": -216.1814727783203, "logps/rejected": -659.6163330078125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.23562172055244446, "rewards/margins": 16.059825897216797, "rewards/rejected": -16.29545021057129, "step": 6300 }, { "epoch": 2.14, "eval_logits/chosen": -0.26910948753356934, "eval_logits/rejected": 0.27831971645355225, "eval_logps/chosen": -274.126953125, "eval_logps/rejected": -633.3418579101562, "eval_loss": 0.0070245829410851, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.03546842932701111, "eval_rewards/margins": 18.359983444213867, "eval_rewards/rejected": -18.324514389038086, "eval_runtime": 526.9797, "eval_samples_per_second": 18.027, "eval_steps_per_second": 0.564, "step": 6300 }, { "epoch": 2.14, "learning_rate": 1.5837844643081959e-07, "logits/chosen": -0.19959309697151184, "logits/rejected": 0.1755446493625641, "logps/chosen": -197.4145965576172, "logps/rejected": -751.698486328125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 0.38930022716522217, "rewards/margins": 18.272533416748047, "rewards/rejected": -17.88323211669922, "step": 6310 }, { "epoch": 2.15, "learning_rate": 1.57748961349616e-07, "logits/chosen": -0.25960877537727356, "logits/rejected": 0.23351207375526428, "logps/chosen": -289.0184020996094, "logps/rejected": -841.3214721679688, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.030473792925477028, "rewards/margins": 18.618886947631836, "rewards/rejected": -18.58841323852539, "step": 6320 }, { "epoch": 2.15, "learning_rate": 1.5711947626841243e-07, "logits/chosen": -0.344145268201828, "logits/rejected": 0.20020468533039093, "logps/chosen": -394.69677734375, "logps/rejected": -480.083251953125, "loss": 0.0051, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.1097058430314064, "rewards/margins": 16.93448257446289, "rewards/rejected": -17.044185638427734, "step": 6330 }, { "epoch": 2.15, "learning_rate": 1.5648999118720885e-07, "logits/chosen": -0.3571568429470062, "logits/rejected": 0.16910606622695923, "logps/chosen": -296.998046875, "logps/rejected": -525.7539672851562, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 0.316161185503006, "rewards/margins": 19.883943557739258, "rewards/rejected": -19.567781448364258, "step": 6340 }, { "epoch": 2.16, "learning_rate": 1.558605061060053e-07, "logits/chosen": -0.269703209400177, "logits/rejected": 0.23532319068908691, "logps/chosen": -234.08511352539062, "logps/rejected": -642.4534912109375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": 0.07753195613622665, "rewards/margins": 19.141117095947266, "rewards/rejected": -19.063583374023438, "step": 6350 }, { "epoch": 2.16, "learning_rate": 1.552310210248017e-07, "logits/chosen": -0.32278892397880554, "logits/rejected": 0.26047182083129883, "logps/chosen": -348.65179443359375, "logps/rejected": -795.9913330078125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": 0.16471293568611145, "rewards/margins": 19.69805335998535, "rewards/rejected": -19.533342361450195, "step": 6360 }, { "epoch": 2.17, "learning_rate": 1.5460153594359812e-07, "logits/chosen": -0.3468092978000641, "logits/rejected": 0.25072795152664185, "logps/chosen": -353.29937744140625, "logps/rejected": -668.8068237304688, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.17953363060951233, "rewards/margins": 19.120363235473633, "rewards/rejected": -19.29990005493164, "step": 6370 }, { "epoch": 2.17, "learning_rate": 1.5397205086239457e-07, "logits/chosen": -0.2564821243286133, "logits/rejected": 0.12518107891082764, "logps/chosen": -283.51531982421875, "logps/rejected": -742.7449951171875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 0.0033891082275658846, "rewards/margins": 20.041833877563477, "rewards/rejected": -20.03844451904297, "step": 6380 }, { "epoch": 2.17, "learning_rate": 1.5334256578119097e-07, "logits/chosen": -0.3658653199672699, "logits/rejected": 0.14439979195594788, "logps/chosen": -292.1717224121094, "logps/rejected": -632.4939575195312, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.5611403584480286, "rewards/margins": 18.587505340576172, "rewards/rejected": -18.0263671875, "step": 6390 }, { "epoch": 2.18, "learning_rate": 1.527130806999874e-07, "logits/chosen": -0.3083403706550598, "logits/rejected": 0.2851422429084778, "logps/chosen": -362.3641052246094, "logps/rejected": -498.9744567871094, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.11234583705663681, "rewards/margins": 16.924144744873047, "rewards/rejected": -17.036489486694336, "step": 6400 }, { "epoch": 2.18, "eval_logits/chosen": -0.274431437253952, "eval_logits/rejected": 0.27716779708862305, "eval_logps/chosen": -273.0082702636719, "eval_logps/rejected": -632.3882446289062, "eval_loss": 0.006464376579970121, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.14733552932739258, "eval_rewards/margins": 18.376487731933594, "eval_rewards/rejected": -18.22915267944336, "eval_runtime": 524.0101, "eval_samples_per_second": 18.129, "eval_steps_per_second": 0.567, "step": 6400 }, { "epoch": 2.18, "learning_rate": 1.5208359561878384e-07, "logits/chosen": -0.36429715156555176, "logits/rejected": 0.2089766561985016, "logps/chosen": -282.7515563964844, "logps/rejected": -587.1927490234375, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.3326440751552582, "rewards/margins": 18.147247314453125, "rewards/rejected": -18.47989273071289, "step": 6410 }, { "epoch": 2.18, "learning_rate": 1.5145411053758026e-07, "logits/chosen": -0.3960524797439575, "logits/rejected": 0.21260377764701843, "logps/chosen": -281.7369079589844, "logps/rejected": -606.4387817382812, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": 0.3391602039337158, "rewards/margins": 20.824857711791992, "rewards/rejected": -20.48569679260254, "step": 6420 }, { "epoch": 2.19, "learning_rate": 1.5082462545637666e-07, "logits/chosen": -0.27986031770706177, "logits/rejected": 0.2526266276836395, "logps/chosen": -223.9673614501953, "logps/rejected": -479.51495361328125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -0.49703145027160645, "rewards/margins": 19.599702835083008, "rewards/rejected": -20.096736907958984, "step": 6430 }, { "epoch": 2.19, "learning_rate": 1.501951403751731e-07, "logits/chosen": -0.30177170038223267, "logits/rejected": 0.2082267701625824, "logps/chosen": -224.7807159423828, "logps/rejected": -600.1585083007812, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -0.38912221789360046, "rewards/margins": 19.065900802612305, "rewards/rejected": -19.45502281188965, "step": 6440 }, { "epoch": 2.19, "learning_rate": 1.4956565529396953e-07, "logits/chosen": -0.3368867337703705, "logits/rejected": 0.3179774284362793, "logps/chosen": -230.76309204101562, "logps/rejected": -624.25, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.003590142820030451, "rewards/margins": 21.26156997680664, "rewards/rejected": -21.265161514282227, "step": 6450 }, { "epoch": 2.2, "learning_rate": 1.4893617021276595e-07, "logits/chosen": -0.2632468044757843, "logits/rejected": 0.1666715294122696, "logps/chosen": -225.4149169921875, "logps/rejected": -697.9102172851562, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.28756603598594666, "rewards/margins": 19.429786682128906, "rewards/rejected": -19.142223358154297, "step": 6460 }, { "epoch": 2.2, "learning_rate": 1.4830668513156238e-07, "logits/chosen": -0.25526729226112366, "logits/rejected": 0.33205950260162354, "logps/chosen": -198.2593536376953, "logps/rejected": -423.2491760253906, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.36298051476478577, "rewards/margins": 19.73740577697754, "rewards/rejected": -20.100383758544922, "step": 6470 }, { "epoch": 2.2, "learning_rate": 1.476772000503588e-07, "logits/chosen": -0.3059542775154114, "logits/rejected": 0.16166433691978455, "logps/chosen": -218.1188507080078, "logps/rejected": -562.7731323242188, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.20851261913776398, "rewards/margins": 19.380380630493164, "rewards/rejected": -19.17186737060547, "step": 6480 }, { "epoch": 2.21, "learning_rate": 1.4704771496915522e-07, "logits/chosen": -0.3601822555065155, "logits/rejected": 0.2488260269165039, "logps/chosen": -231.4885711669922, "logps/rejected": -575.127685546875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": 0.06502005457878113, "rewards/margins": 20.86343002319336, "rewards/rejected": -20.79840850830078, "step": 6490 }, { "epoch": 2.21, "learning_rate": 1.4641822988795167e-07, "logits/chosen": -0.30996280908584595, "logits/rejected": 0.190445676445961, "logps/chosen": -219.4307403564453, "logps/rejected": -642.9260864257812, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.05231257528066635, "rewards/margins": 17.057174682617188, "rewards/rejected": -17.109487533569336, "step": 6500 }, { "epoch": 2.21, "eval_logits/chosen": -0.2755988836288452, "eval_logits/rejected": 0.27500149607658386, "eval_logps/chosen": -274.0310974121094, "eval_logps/rejected": -629.1483154296875, "eval_loss": 0.006406448315829039, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.04504971206188202, "eval_rewards/margins": 17.950206756591797, "eval_rewards/rejected": -17.905155181884766, "eval_runtime": 527.8368, "eval_samples_per_second": 17.998, "eval_steps_per_second": 0.563, "step": 6500 }, { "epoch": 2.21, "learning_rate": 1.4578874480674807e-07, "logits/chosen": -0.2850262224674225, "logits/rejected": 0.30328232049942017, "logps/chosen": -330.7077941894531, "logps/rejected": -592.9375610351562, "loss": 0.0048, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.14787635207176208, "rewards/margins": 18.595319747924805, "rewards/rejected": -18.74319839477539, "step": 6510 }, { "epoch": 2.22, "learning_rate": 1.451592597255445e-07, "logits/chosen": -0.27345672249794006, "logits/rejected": 0.24951598048210144, "logps/chosen": -290.5386657714844, "logps/rejected": -570.4874877929688, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.20620405673980713, "rewards/margins": 17.949241638183594, "rewards/rejected": -18.155445098876953, "step": 6520 }, { "epoch": 2.22, "learning_rate": 1.4452977464434094e-07, "logits/chosen": -0.3413148820400238, "logits/rejected": 0.17275585234165192, "logps/chosen": -268.54986572265625, "logps/rejected": -597.8937377929688, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.13056015968322754, "rewards/margins": 17.25094223022461, "rewards/rejected": -17.381500244140625, "step": 6530 }, { "epoch": 2.22, "learning_rate": 1.4390028956313736e-07, "logits/chosen": -0.24656908214092255, "logits/rejected": 0.27930551767349243, "logps/chosen": -221.09317016601562, "logps/rejected": -633.1236572265625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": 0.3559962809085846, "rewards/margins": 19.315505981445312, "rewards/rejected": -18.95950698852539, "step": 6540 }, { "epoch": 2.23, "learning_rate": 1.4327080448193376e-07, "logits/chosen": -0.35809722542762756, "logits/rejected": 0.34384089708328247, "logps/chosen": -411.90496826171875, "logps/rejected": -537.7197265625, "loss": 0.0105, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.15119090676307678, "rewards/margins": 17.972585678100586, "rewards/rejected": -18.123775482177734, "step": 6550 }, { "epoch": 2.23, "learning_rate": 1.426413194007302e-07, "logits/chosen": -0.2958097457885742, "logits/rejected": 0.17242743074893951, "logps/chosen": -231.0353240966797, "logps/rejected": -594.7828369140625, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.3599727153778076, "rewards/margins": 17.902753829956055, "rewards/rejected": -17.54277992248535, "step": 6560 }, { "epoch": 2.23, "learning_rate": 1.4201183431952663e-07, "logits/chosen": -0.20133109390735626, "logits/rejected": 0.1813780963420868, "logps/chosen": -284.58526611328125, "logps/rejected": -828.3527221679688, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": 0.05978628993034363, "rewards/margins": 16.500408172607422, "rewards/rejected": -16.440624237060547, "step": 6570 }, { "epoch": 2.24, "learning_rate": 1.4138234923832303e-07, "logits/chosen": -0.3828926682472229, "logits/rejected": 0.14561806619167328, "logps/chosen": -286.7456970214844, "logps/rejected": -648.7333374023438, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.7256938815116882, "rewards/margins": 18.010339736938477, "rewards/rejected": -17.28464698791504, "step": 6580 }, { "epoch": 2.24, "learning_rate": 1.4075286415711948e-07, "logits/chosen": -0.3705894649028778, "logits/rejected": 0.31055203080177307, "logps/chosen": -300.64007568359375, "logps/rejected": -506.8006286621094, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.4711211621761322, "rewards/margins": 17.814119338989258, "rewards/rejected": -18.28523826599121, "step": 6590 }, { "epoch": 2.24, "learning_rate": 1.401233790759159e-07, "logits/chosen": -0.31257864832878113, "logits/rejected": 0.1258104145526886, "logps/chosen": -331.1398620605469, "logps/rejected": -589.6734008789062, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.3982500731945038, "rewards/margins": 15.513778686523438, "rewards/rejected": -15.912028312683105, "step": 6600 }, { "epoch": 2.24, "eval_logits/chosen": -0.2881312370300293, "eval_logits/rejected": 0.2773011028766632, "eval_logps/chosen": -275.1236572265625, "eval_logps/rejected": -624.3618774414062, "eval_loss": 0.006066096480935812, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.0642067939043045, "eval_rewards/margins": 17.362306594848633, "eval_rewards/rejected": -17.426513671875, "eval_runtime": 526.6244, "eval_samples_per_second": 18.039, "eval_steps_per_second": 0.564, "step": 6600 }, { "epoch": 2.25, "learning_rate": 1.3949389399471232e-07, "logits/chosen": -0.2873706519603729, "logits/rejected": 0.18461988866329193, "logps/chosen": -217.5690460205078, "logps/rejected": -642.1131591796875, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.27603667974472046, "rewards/margins": 19.614681243896484, "rewards/rejected": -19.890716552734375, "step": 6610 }, { "epoch": 2.25, "learning_rate": 1.3886440891350874e-07, "logits/chosen": -0.35169124603271484, "logits/rejected": 0.40907782316207886, "logps/chosen": -299.08282470703125, "logps/rejected": -487.9291076660156, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.28849440813064575, "rewards/margins": 18.588369369506836, "rewards/rejected": -18.876863479614258, "step": 6620 }, { "epoch": 2.25, "learning_rate": 1.3823492383230517e-07, "logits/chosen": -0.3078821301460266, "logits/rejected": 0.31803709268569946, "logps/chosen": -225.2115020751953, "logps/rejected": -709.38330078125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.25974780321121216, "rewards/margins": 18.907503128051758, "rewards/rejected": -18.647754669189453, "step": 6630 }, { "epoch": 2.26, "learning_rate": 1.376054387511016e-07, "logits/chosen": -0.31211522221565247, "logits/rejected": 0.17506560683250427, "logps/chosen": -240.1725616455078, "logps/rejected": -613.14453125, "loss": 0.0027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.35547250509262085, "rewards/margins": 17.4250545501709, "rewards/rejected": -17.780527114868164, "step": 6640 }, { "epoch": 2.26, "learning_rate": 1.36975953669898e-07, "logits/chosen": -0.37276896834373474, "logits/rejected": 0.17359210550785065, "logps/chosen": -271.49462890625, "logps/rejected": -539.6759033203125, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -0.1788751631975174, "rewards/margins": 18.04171371459961, "rewards/rejected": -18.22058868408203, "step": 6650 }, { "epoch": 2.26, "learning_rate": 1.3634646858869444e-07, "logits/chosen": -0.35923847556114197, "logits/rejected": 0.2674804627895355, "logps/chosen": -234.902587890625, "logps/rejected": -460.99932861328125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.11959018558263779, "rewards/margins": 17.098094940185547, "rewards/rejected": -17.21768569946289, "step": 6660 }, { "epoch": 2.27, "learning_rate": 1.3571698350749086e-07, "logits/chosen": -0.36603420972824097, "logits/rejected": 0.21085193753242493, "logps/chosen": -280.54608154296875, "logps/rejected": -600.9298706054688, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.17692211270332336, "rewards/margins": 17.090572357177734, "rewards/rejected": -16.913652420043945, "step": 6670 }, { "epoch": 2.27, "learning_rate": 1.3508749842628728e-07, "logits/chosen": -0.3573821783065796, "logits/rejected": 0.12679870426654816, "logps/chosen": -275.7672119140625, "logps/rejected": -631.8856201171875, "loss": 0.0064, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.09983641654253006, "rewards/margins": 17.009384155273438, "rewards/rejected": -17.10921859741211, "step": 6680 }, { "epoch": 2.27, "learning_rate": 1.3445801334508373e-07, "logits/chosen": -0.3703244924545288, "logits/rejected": 0.3044824004173279, "logps/chosen": -336.94586181640625, "logps/rejected": -406.2928771972656, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.1251940131187439, "rewards/margins": 16.859691619873047, "rewards/rejected": -16.984882354736328, "step": 6690 }, { "epoch": 2.28, "learning_rate": 1.3382852826388013e-07, "logits/chosen": -0.33444303274154663, "logits/rejected": 0.20392084121704102, "logps/chosen": -229.10458374023438, "logps/rejected": -512.11767578125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -0.0058318376541137695, "rewards/margins": 16.53385353088379, "rewards/rejected": -16.539682388305664, "step": 6700 }, { "epoch": 2.28, "eval_logits/chosen": -0.28909429907798767, "eval_logits/rejected": 0.2702862322330475, "eval_logps/chosen": -273.5308837890625, "eval_logps/rejected": -623.80712890625, "eval_loss": 0.005824754945933819, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.09507199376821518, "eval_rewards/margins": 17.466110229492188, "eval_rewards/rejected": -17.37103843688965, "eval_runtime": 526.4659, "eval_samples_per_second": 18.045, "eval_steps_per_second": 0.564, "step": 6700 }, { "epoch": 2.28, "learning_rate": 1.3319904318267655e-07, "logits/chosen": -0.26189690828323364, "logits/rejected": 0.08143340051174164, "logps/chosen": -277.47283935546875, "logps/rejected": -670.2061767578125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.10988521575927734, "rewards/margins": 17.04896354675293, "rewards/rejected": -17.158849716186523, "step": 6710 }, { "epoch": 2.28, "learning_rate": 1.32569558101473e-07, "logits/chosen": -0.371031254529953, "logits/rejected": 0.26280128955841064, "logps/chosen": -235.5426025390625, "logps/rejected": -604.6168823242188, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": 0.013932263478636742, "rewards/margins": 16.79815101623535, "rewards/rejected": -16.78421974182129, "step": 6720 }, { "epoch": 2.29, "learning_rate": 1.3194007302026942e-07, "logits/chosen": -0.3834632933139801, "logits/rejected": 0.2301315814256668, "logps/chosen": -251.02792358398438, "logps/rejected": -511.966064453125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.17480282485485077, "rewards/margins": 18.175342559814453, "rewards/rejected": -18.000537872314453, "step": 6730 }, { "epoch": 2.29, "learning_rate": 1.3131058793906582e-07, "logits/chosen": -0.3819087743759155, "logits/rejected": 0.28472423553466797, "logps/chosen": -344.573974609375, "logps/rejected": -584.6652221679688, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.29203689098358154, "rewards/margins": 16.51443099975586, "rewards/rejected": -16.806468963623047, "step": 6740 }, { "epoch": 2.29, "learning_rate": 1.3068110285786227e-07, "logits/chosen": -0.3306361734867096, "logits/rejected": 0.12100537866353989, "logps/chosen": -276.29583740234375, "logps/rejected": -521.8596801757812, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 0.05718214437365532, "rewards/margins": 17.102375030517578, "rewards/rejected": -17.04519271850586, "step": 6750 }, { "epoch": 2.3, "learning_rate": 1.300516177766587e-07, "logits/chosen": -0.35892319679260254, "logits/rejected": 0.2749837040901184, "logps/chosen": -268.84576416015625, "logps/rejected": -529.2630615234375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 0.09981068223714828, "rewards/margins": 17.4359188079834, "rewards/rejected": -17.336109161376953, "step": 6760 }, { "epoch": 2.3, "learning_rate": 1.294221326954551e-07, "logits/chosen": -0.27516138553619385, "logits/rejected": 0.14558982849121094, "logps/chosen": -291.1524963378906, "logps/rejected": -811.1837158203125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.35289672017097473, "rewards/margins": 20.788103103637695, "rewards/rejected": -21.14099884033203, "step": 6770 }, { "epoch": 2.3, "learning_rate": 1.2879264761425154e-07, "logits/chosen": -0.3847086429595947, "logits/rejected": 0.26943549513816833, "logps/chosen": -235.5531768798828, "logps/rejected": -491.88470458984375, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.002425330923870206, "rewards/margins": 19.156465530395508, "rewards/rejected": -19.154037475585938, "step": 6780 }, { "epoch": 2.31, "learning_rate": 1.2816316253304796e-07, "logits/chosen": -0.35768094658851624, "logits/rejected": 0.15673236548900604, "logps/chosen": -192.24472045898438, "logps/rejected": -559.1871337890625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": 0.06703317165374756, "rewards/margins": 18.494285583496094, "rewards/rejected": -18.427249908447266, "step": 6790 }, { "epoch": 2.31, "learning_rate": 1.2753367745184438e-07, "logits/chosen": -0.3695257902145386, "logits/rejected": 0.2545199692249298, "logps/chosen": -286.92901611328125, "logps/rejected": -451.7738342285156, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.20022472739219666, "rewards/margins": 17.79176902770996, "rewards/rejected": -17.5915470123291, "step": 6800 }, { "epoch": 2.31, "eval_logits/chosen": -0.28229886293411255, "eval_logits/rejected": 0.2544463872909546, "eval_logps/chosen": -275.01416015625, "eval_logps/rejected": -632.8410034179688, "eval_loss": 0.005682698916643858, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.05325419828295708, "eval_rewards/margins": 18.22117805480957, "eval_rewards/rejected": -18.274431228637695, "eval_runtime": 525.8057, "eval_samples_per_second": 18.068, "eval_steps_per_second": 0.565, "step": 6800 }, { "epoch": 2.31, "learning_rate": 1.2690419237064083e-07, "logits/chosen": -0.3517678380012512, "logits/rejected": 0.13300937414169312, "logps/chosen": -408.74468994140625, "logps/rejected": -623.3833618164062, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 0.23189333081245422, "rewards/margins": 16.918251037597656, "rewards/rejected": -16.686357498168945, "step": 6810 }, { "epoch": 2.32, "learning_rate": 1.2627470728943723e-07, "logits/chosen": -0.3187394142150879, "logits/rejected": 0.14183321595191956, "logps/chosen": -393.2695007324219, "logps/rejected": -729.14111328125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.5453308820724487, "rewards/margins": 19.44426727294922, "rewards/rejected": -18.898937225341797, "step": 6820 }, { "epoch": 2.32, "learning_rate": 1.2564522220823365e-07, "logits/chosen": -0.33181434869766235, "logits/rejected": 0.15471655130386353, "logps/chosen": -289.40576171875, "logps/rejected": -694.7301635742188, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.03701719641685486, "rewards/margins": 18.47975730895996, "rewards/rejected": -18.516773223876953, "step": 6830 }, { "epoch": 2.32, "learning_rate": 1.250157371270301e-07, "logits/chosen": -0.36202389001846313, "logits/rejected": 0.172877699136734, "logps/chosen": -296.92181396484375, "logps/rejected": -552.6640625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.05221182107925415, "rewards/margins": 20.838550567626953, "rewards/rejected": -20.786338806152344, "step": 6840 }, { "epoch": 2.33, "learning_rate": 1.243862520458265e-07, "logits/chosen": -0.2942168116569519, "logits/rejected": 0.07513265311717987, "logps/chosen": -279.4120178222656, "logps/rejected": -522.6356201171875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.1194159984588623, "rewards/margins": 18.97164535522461, "rewards/rejected": -18.852231979370117, "step": 6850 }, { "epoch": 2.33, "learning_rate": 1.2375676696462294e-07, "logits/chosen": -0.3219735026359558, "logits/rejected": 0.13254138827323914, "logps/chosen": -305.2547912597656, "logps/rejected": -745.8877563476562, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.7385798692703247, "rewards/margins": 17.299104690551758, "rewards/rejected": -18.037683486938477, "step": 6860 }, { "epoch": 2.34, "learning_rate": 1.2312728188341934e-07, "logits/chosen": -0.3473687171936035, "logits/rejected": 0.12425991147756577, "logps/chosen": -244.7881317138672, "logps/rejected": -691.1434936523438, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -0.08783552795648575, "rewards/margins": 21.83948516845703, "rewards/rejected": -21.927322387695312, "step": 6870 }, { "epoch": 2.34, "learning_rate": 1.224977968022158e-07, "logits/chosen": -0.3955625593662262, "logits/rejected": 0.19323265552520752, "logps/chosen": -347.40277099609375, "logps/rejected": -565.8941650390625, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -0.47357600927352905, "rewards/margins": 17.6691837310791, "rewards/rejected": -18.14276123046875, "step": 6880 }, { "epoch": 2.34, "learning_rate": 1.218683117210122e-07, "logits/chosen": -0.2569199204444885, "logits/rejected": 0.23157675564289093, "logps/chosen": -212.14480590820312, "logps/rejected": -724.9212036132812, "loss": 0.0127, "rewards/accuracies": 1.0, "rewards/chosen": 0.6120128631591797, "rewards/margins": 19.983755111694336, "rewards/rejected": -19.371746063232422, "step": 6890 }, { "epoch": 2.35, "learning_rate": 1.2123882663980863e-07, "logits/chosen": -0.35264772176742554, "logits/rejected": 0.18894609808921814, "logps/chosen": -302.9128112792969, "logps/rejected": -722.63525390625, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": 0.29235535860061646, "rewards/margins": 15.606472969055176, "rewards/rejected": -15.314115524291992, "step": 6900 }, { "epoch": 2.35, "eval_logits/chosen": -0.292820543050766, "eval_logits/rejected": 0.23645417392253876, "eval_logps/chosen": -274.330078125, "eval_logps/rejected": -636.3722534179688, "eval_loss": 0.0073310802690684795, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": 0.015153343789279461, "eval_rewards/margins": 18.642711639404297, "eval_rewards/rejected": -18.6275577545166, "eval_runtime": 525.0458, "eval_samples_per_second": 18.094, "eval_steps_per_second": 0.566, "step": 6900 }, { "epoch": 2.35, "learning_rate": 1.2060934155860506e-07, "logits/chosen": -0.3606736361980438, "logits/rejected": 0.31031718850135803, "logps/chosen": -233.1505889892578, "logps/rejected": -520.3114013671875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.037613458931446075, "rewards/margins": 17.34442901611328, "rewards/rejected": -17.30681610107422, "step": 6910 }, { "epoch": 2.35, "learning_rate": 1.1997985647740148e-07, "logits/chosen": -0.36891770362854004, "logits/rejected": 0.11135606467723846, "logps/chosen": -231.2191925048828, "logps/rejected": -630.4271240234375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.36508819460868835, "rewards/margins": 18.550813674926758, "rewards/rejected": -18.18572425842285, "step": 6920 }, { "epoch": 2.36, "learning_rate": 1.193503713961979e-07, "logits/chosen": -0.20481212437152863, "logits/rejected": 0.25965866446495056, "logps/chosen": -399.758544921875, "logps/rejected": -891.5769653320312, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.23881623148918152, "rewards/margins": 19.03354263305664, "rewards/rejected": -18.79472541809082, "step": 6930 }, { "epoch": 2.36, "learning_rate": 1.1872088631499433e-07, "logits/chosen": -0.3304951786994934, "logits/rejected": 0.059341687709093094, "logps/chosen": -254.83908081054688, "logps/rejected": -666.64599609375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.5450111031532288, "rewards/margins": 18.13606071472168, "rewards/rejected": -18.681072235107422, "step": 6940 }, { "epoch": 2.36, "learning_rate": 1.1809140123379076e-07, "logits/chosen": -0.31505587697029114, "logits/rejected": 0.056240878999233246, "logps/chosen": -250.3033447265625, "logps/rejected": -715.9754638671875, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.7836655974388123, "rewards/margins": 19.46747398376465, "rewards/rejected": -18.68381118774414, "step": 6950 }, { "epoch": 2.37, "learning_rate": 1.1746191615258717e-07, "logits/chosen": -0.3939222991466522, "logits/rejected": 0.22182238101959229, "logps/chosen": -228.7406463623047, "logps/rejected": -681.3053588867188, "loss": 0.003, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.20938244462013245, "rewards/margins": 20.197490692138672, "rewards/rejected": -19.988107681274414, "step": 6960 }, { "epoch": 2.37, "learning_rate": 1.1683243107138361e-07, "logits/chosen": -0.4135221540927887, "logits/rejected": 0.22105951607227325, "logps/chosen": -306.249755859375, "logps/rejected": -579.1951904296875, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.02935473993420601, "rewards/margins": 20.69234275817871, "rewards/rejected": -20.66299057006836, "step": 6970 }, { "epoch": 2.37, "learning_rate": 1.1620294599018003e-07, "logits/chosen": -0.3536306321620941, "logits/rejected": 0.051975317299366, "logps/chosen": -233.8282928466797, "logps/rejected": -532.3546752929688, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -0.5861068964004517, "rewards/margins": 17.699121475219727, "rewards/rejected": -18.285228729248047, "step": 6980 }, { "epoch": 2.38, "learning_rate": 1.1557346090897645e-07, "logits/chosen": -0.28494125604629517, "logits/rejected": 0.05003322288393974, "logps/chosen": -217.6209259033203, "logps/rejected": -817.3129272460938, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": 0.1412900686264038, "rewards/margins": 19.69931411743164, "rewards/rejected": -19.558025360107422, "step": 6990 }, { "epoch": 2.38, "learning_rate": 1.1494397582777288e-07, "logits/chosen": -0.23703083395957947, "logits/rejected": 0.02210414409637451, "logps/chosen": -204.94918823242188, "logps/rejected": -670.94091796875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": 0.12995386123657227, "rewards/margins": 17.80183219909668, "rewards/rejected": -17.671878814697266, "step": 7000 }, { "epoch": 2.38, "eval_logits/chosen": -0.2889222502708435, "eval_logits/rejected": 0.23657073080539703, "eval_logps/chosen": -274.16058349609375, "eval_logps/rejected": -634.69677734375, "eval_loss": 0.00555398827418685, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": 0.032101649791002274, "eval_rewards/margins": 18.492111206054688, "eval_rewards/rejected": -18.46000862121582, "eval_runtime": 526.2509, "eval_samples_per_second": 18.052, "eval_steps_per_second": 0.564, "step": 7000 }, { "epoch": 2.38, "learning_rate": 1.1431449074656931e-07, "logits/chosen": -0.3933561444282532, "logits/rejected": 0.19053684175014496, "logps/chosen": -418.323486328125, "logps/rejected": -495.36224365234375, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.2113698422908783, "rewards/margins": 18.088050842285156, "rewards/rejected": -17.87668228149414, "step": 7010 }, { "epoch": 2.39, "learning_rate": 1.1368500566536572e-07, "logits/chosen": -0.29092225432395935, "logits/rejected": 0.15280678868293762, "logps/chosen": -362.4805603027344, "logps/rejected": -692.3865966796875, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": 0.24914464354515076, "rewards/margins": 16.258434295654297, "rewards/rejected": -16.009288787841797, "step": 7020 }, { "epoch": 2.39, "learning_rate": 1.1305552058416214e-07, "logits/chosen": -0.2299823760986328, "logits/rejected": 0.11801712214946747, "logps/chosen": -234.7991180419922, "logps/rejected": -660.0935668945312, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.09448641538619995, "rewards/margins": 20.36806297302246, "rewards/rejected": -20.27358055114746, "step": 7030 }, { "epoch": 2.39, "learning_rate": 1.1242603550295858e-07, "logits/chosen": -0.22739700973033905, "logits/rejected": 0.09789099544286728, "logps/chosen": -208.9736785888672, "logps/rejected": -740.1498413085938, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.4068352282047272, "rewards/margins": 21.5587158203125, "rewards/rejected": -21.15188217163086, "step": 7040 }, { "epoch": 2.4, "learning_rate": 1.1179655042175499e-07, "logits/chosen": -0.3735640347003937, "logits/rejected": 0.2467024028301239, "logps/chosen": -237.59909057617188, "logps/rejected": -489.48687744140625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.8851602673530579, "rewards/margins": 16.748958587646484, "rewards/rejected": -17.634117126464844, "step": 7050 }, { "epoch": 2.4, "learning_rate": 1.1116706534055143e-07, "logits/chosen": -0.369795024394989, "logits/rejected": 0.22245745360851288, "logps/chosen": -281.7445068359375, "logps/rejected": -606.8353271484375, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": 0.13057518005371094, "rewards/margins": 22.0218563079834, "rewards/rejected": -21.89128303527832, "step": 7060 }, { "epoch": 2.4, "learning_rate": 1.1053758025934785e-07, "logits/chosen": -0.1929634064435959, "logits/rejected": 0.05626733973622322, "logps/chosen": -236.2475128173828, "logps/rejected": -834.8046875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.03109445609152317, "rewards/margins": 19.80615234375, "rewards/rejected": -19.837247848510742, "step": 7070 }, { "epoch": 2.41, "learning_rate": 1.0990809517814427e-07, "logits/chosen": -0.3562391698360443, "logits/rejected": 0.20181787014007568, "logps/chosen": -281.6477966308594, "logps/rejected": -502.891357421875, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -0.31066933274269104, "rewards/margins": 17.39512825012207, "rewards/rejected": -17.705799102783203, "step": 7080 }, { "epoch": 2.41, "learning_rate": 1.092786100969407e-07, "logits/chosen": -0.3409980237483978, "logits/rejected": 0.03216409310698509, "logps/chosen": -234.3061981201172, "logps/rejected": -625.5872192382812, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.3176437020301819, "rewards/margins": 17.5345401763916, "rewards/rejected": -17.21689796447754, "step": 7090 }, { "epoch": 2.41, "learning_rate": 1.0864912501573713e-07, "logits/chosen": -0.32030707597732544, "logits/rejected": 0.04825950041413307, "logps/chosen": -219.4656982421875, "logps/rejected": -550.5779418945312, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.331972599029541, "rewards/margins": 16.18992042541504, "rewards/rejected": -16.521894454956055, "step": 7100 }, { "epoch": 2.41, "eval_logits/chosen": -0.2898252606391907, "eval_logits/rejected": 0.23055203258991241, "eval_logps/chosen": -275.7859191894531, "eval_logps/rejected": -641.836669921875, "eval_loss": 0.005764603149145842, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.13043056428432465, "eval_rewards/margins": 19.043567657470703, "eval_rewards/rejected": -19.17399787902832, "eval_runtime": 522.8632, "eval_samples_per_second": 18.169, "eval_steps_per_second": 0.568, "step": 7100 }, { "epoch": 2.42, "learning_rate": 1.0801963993453354e-07, "logits/chosen": -0.31572142243385315, "logits/rejected": 0.11929450929164886, "logps/chosen": -229.9832763671875, "logps/rejected": -464.2555236816406, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.35817646980285645, "rewards/margins": 19.070262908935547, "rewards/rejected": -19.42844009399414, "step": 7110 }, { "epoch": 2.42, "learning_rate": 1.0739015485332998e-07, "logits/chosen": -0.3372807502746582, "logits/rejected": 0.0840967521071434, "logps/chosen": -324.3774719238281, "logps/rejected": -688.3312377929688, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.16585907340049744, "rewards/margins": 18.10820198059082, "rewards/rejected": -17.942340850830078, "step": 7120 }, { "epoch": 2.42, "learning_rate": 1.067606697721264e-07, "logits/chosen": -0.2933925688266754, "logits/rejected": 0.07642112672328949, "logps/chosen": -303.24835205078125, "logps/rejected": -556.6817626953125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.674970269203186, "rewards/margins": 17.59970474243164, "rewards/rejected": -18.274673461914062, "step": 7130 }, { "epoch": 2.43, "learning_rate": 1.0613118469092282e-07, "logits/chosen": -0.24628238379955292, "logits/rejected": 0.0924672931432724, "logps/chosen": -285.8111267089844, "logps/rejected": -1006.5325927734375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.13285118341445923, "rewards/margins": 18.37169075012207, "rewards/rejected": -18.238840103149414, "step": 7140 }, { "epoch": 2.43, "learning_rate": 1.0550169960971924e-07, "logits/chosen": -0.33263540267944336, "logits/rejected": 0.12547115981578827, "logps/chosen": -340.56719970703125, "logps/rejected": -779.1002197265625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.06299266964197159, "rewards/margins": 19.00230598449707, "rewards/rejected": -19.06529998779297, "step": 7150 }, { "epoch": 2.43, "learning_rate": 1.0487221452851568e-07, "logits/chosen": -0.27566084265708923, "logits/rejected": 0.03873121365904808, "logps/chosen": -270.87322998046875, "logps/rejected": -779.4202880859375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.5755264163017273, "rewards/margins": 17.204593658447266, "rewards/rejected": -17.780118942260742, "step": 7160 }, { "epoch": 2.44, "learning_rate": 1.0424272944731209e-07, "logits/chosen": -0.28330662846565247, "logits/rejected": 0.138466015458107, "logps/chosen": -301.295654296875, "logps/rejected": -766.9324340820312, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.05453551933169365, "rewards/margins": 18.872732162475586, "rewards/rejected": -18.927265167236328, "step": 7170 }, { "epoch": 2.44, "learning_rate": 1.0361324436610853e-07, "logits/chosen": -0.40624675154685974, "logits/rejected": 0.27502506971359253, "logps/chosen": -287.0796203613281, "logps/rejected": -576.8126220703125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.028964543715119362, "rewards/margins": 22.391847610473633, "rewards/rejected": -22.362884521484375, "step": 7180 }, { "epoch": 2.44, "learning_rate": 1.0298375928490494e-07, "logits/chosen": -0.26948338747024536, "logits/rejected": 0.2611820101737976, "logps/chosen": -213.81082153320312, "logps/rejected": -680.5365600585938, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.2979668080806732, "rewards/margins": 21.0222225189209, "rewards/rejected": -21.320188522338867, "step": 7190 }, { "epoch": 2.45, "learning_rate": 1.0235427420370137e-07, "logits/chosen": -0.3449554741382599, "logits/rejected": 0.05601067095994949, "logps/chosen": -239.67233276367188, "logps/rejected": -606.7269287109375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.3694484829902649, "rewards/margins": 19.13559341430664, "rewards/rejected": -19.50503921508789, "step": 7200 }, { "epoch": 2.45, "eval_logits/chosen": -0.29157039523124695, "eval_logits/rejected": 0.222737118601799, "eval_logps/chosen": -275.4589538574219, "eval_logps/rejected": -642.7957763671875, "eval_loss": 0.005937620997428894, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.09773629903793335, "eval_rewards/margins": 19.172164916992188, "eval_rewards/rejected": -19.2698974609375, "eval_runtime": 519.2935, "eval_samples_per_second": 18.294, "eval_steps_per_second": 0.572, "step": 7200 }, { "epoch": 2.45, "learning_rate": 1.017247891224978e-07, "logits/chosen": -0.3412637710571289, "logits/rejected": 0.22422143816947937, "logps/chosen": -217.7747802734375, "logps/rejected": -429.983642578125, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.08856790512800217, "rewards/margins": 18.66463279724121, "rewards/rejected": -18.75320053100586, "step": 7210 }, { "epoch": 2.45, "learning_rate": 1.0109530404129422e-07, "logits/chosen": -0.35995417833328247, "logits/rejected": 0.191864475607872, "logps/chosen": -296.9629211425781, "logps/rejected": -550.5357666015625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.3980129361152649, "rewards/margins": 20.258106231689453, "rewards/rejected": -20.656118392944336, "step": 7220 }, { "epoch": 2.46, "learning_rate": 1.0046581896009064e-07, "logits/chosen": -0.28712373971939087, "logits/rejected": 0.18069307506084442, "logps/chosen": -274.2567138671875, "logps/rejected": -625.5621948242188, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.3911634087562561, "rewards/margins": 20.88827133178711, "rewards/rejected": -21.279434204101562, "step": 7230 }, { "epoch": 2.46, "learning_rate": 9.983633387888708e-08, "logits/chosen": -0.1525527834892273, "logits/rejected": 0.12688273191452026, "logps/chosen": -276.7810363769531, "logps/rejected": -833.2777099609375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.32876792550086975, "rewards/margins": 19.52300453186035, "rewards/rejected": -19.851770401000977, "step": 7240 }, { "epoch": 2.46, "learning_rate": 9.920684879768348e-08, "logits/chosen": -0.358464777469635, "logits/rejected": 0.22583091259002686, "logps/chosen": -330.94927978515625, "logps/rejected": -682.8681030273438, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.10059633105993271, "rewards/margins": 17.086048126220703, "rewards/rejected": -17.186643600463867, "step": 7250 }, { "epoch": 2.47, "learning_rate": 9.857736371647991e-08, "logits/chosen": -0.3261156976222992, "logits/rejected": 0.17372381687164307, "logps/chosen": -368.6526184082031, "logps/rejected": -702.1719360351562, "loss": 0.0063, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21570368111133575, "rewards/margins": 19.6741886138916, "rewards/rejected": -19.45848274230957, "step": 7260 }, { "epoch": 2.47, "learning_rate": 9.794787863527634e-08, "logits/chosen": -0.35899755358695984, "logits/rejected": 0.18659745156764984, "logps/chosen": -366.28057861328125, "logps/rejected": -447.0079650878906, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -0.08983780443668365, "rewards/margins": 20.534582138061523, "rewards/rejected": -20.624420166015625, "step": 7270 }, { "epoch": 2.47, "learning_rate": 9.731839355407275e-08, "logits/chosen": -0.3836769461631775, "logits/rejected": 0.04558302089571953, "logps/chosen": -276.10345458984375, "logps/rejected": -452.349365234375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.17190805077552795, "rewards/margins": 15.682443618774414, "rewards/rejected": -15.854351997375488, "step": 7280 }, { "epoch": 2.48, "learning_rate": 9.668890847286919e-08, "logits/chosen": -0.40941596031188965, "logits/rejected": 0.20928414165973663, "logps/chosen": -354.63592529296875, "logps/rejected": -499.66986083984375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.0776977390050888, "rewards/margins": 19.20831871032715, "rewards/rejected": -19.28601837158203, "step": 7290 }, { "epoch": 2.48, "learning_rate": 9.605942339166561e-08, "logits/chosen": -0.24428148567676544, "logits/rejected": 0.05454058572649956, "logps/chosen": -273.736572265625, "logps/rejected": -890.8504638671875, "loss": 0.0058, "rewards/accuracies": 1.0, "rewards/chosen": 0.1931009292602539, "rewards/margins": 18.039865493774414, "rewards/rejected": -17.846765518188477, "step": 7300 }, { "epoch": 2.48, "eval_logits/chosen": -0.29446113109588623, "eval_logits/rejected": 0.2314145565032959, "eval_logps/chosen": -275.41949462890625, "eval_logps/rejected": -638.7215576171875, "eval_loss": 0.005874544847756624, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.09378984570503235, "eval_rewards/margins": 18.768693923950195, "eval_rewards/rejected": -18.862485885620117, "eval_runtime": 519.5172, "eval_samples_per_second": 18.286, "eval_steps_per_second": 0.572, "step": 7300 }, { "epoch": 2.48, "learning_rate": 9.542993831046203e-08, "logits/chosen": -0.43665337562561035, "logits/rejected": 0.11728908121585846, "logps/chosen": -282.4403076171875, "logps/rejected": -531.848876953125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.12617236375808716, "rewards/margins": 16.726076126098633, "rewards/rejected": -16.852249145507812, "step": 7310 }, { "epoch": 2.49, "learning_rate": 9.480045322925846e-08, "logits/chosen": -0.29463595151901245, "logits/rejected": 0.2189660519361496, "logps/chosen": -282.9891662597656, "logps/rejected": -660.753173828125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.029831266030669212, "rewards/margins": 20.19232749938965, "rewards/rejected": -20.222158432006836, "step": 7320 }, { "epoch": 2.49, "learning_rate": 9.41709681480549e-08, "logits/chosen": -0.34039074182510376, "logits/rejected": 0.25505515933036804, "logps/chosen": -214.7294158935547, "logps/rejected": -571.3095703125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.5543395280838013, "rewards/margins": 19.58926773071289, "rewards/rejected": -19.034929275512695, "step": 7330 }, { "epoch": 2.49, "learning_rate": 9.35414830668513e-08, "logits/chosen": -0.18800821900367737, "logits/rejected": 0.07977934181690216, "logps/chosen": -193.48013305664062, "logps/rejected": -647.4368286132812, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.13180044293403625, "rewards/margins": 18.31092071533203, "rewards/rejected": -18.442720413208008, "step": 7340 }, { "epoch": 2.5, "learning_rate": 9.291199798564774e-08, "logits/chosen": -0.3254787027835846, "logits/rejected": 0.11154206842184067, "logps/chosen": -233.76004028320312, "logps/rejected": -636.6951904296875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.3010993003845215, "rewards/margins": 18.32367515563965, "rewards/rejected": -18.022579193115234, "step": 7350 }, { "epoch": 2.5, "learning_rate": 9.228251290444416e-08, "logits/chosen": -0.36636167764663696, "logits/rejected": 0.05550817772746086, "logps/chosen": -284.4520568847656, "logps/rejected": -639.8171997070312, "loss": 0.006, "rewards/accuracies": 1.0, "rewards/chosen": -0.06709767878055573, "rewards/margins": 16.66944122314453, "rewards/rejected": -16.73653793334961, "step": 7360 }, { "epoch": 2.51, "learning_rate": 9.165302782324058e-08, "logits/chosen": -0.38318100571632385, "logits/rejected": 0.20088307559490204, "logps/chosen": -233.8351593017578, "logps/rejected": -598.1431884765625, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.1670762598514557, "rewards/margins": 18.38641357421875, "rewards/rejected": -18.553491592407227, "step": 7370 }, { "epoch": 2.51, "learning_rate": 9.102354274203701e-08, "logits/chosen": -0.3628920018672943, "logits/rejected": 0.08943851292133331, "logps/chosen": -261.6184387207031, "logps/rejected": -641.4315185546875, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.34964656829833984, "rewards/margins": 20.583759307861328, "rewards/rejected": -20.234113693237305, "step": 7380 }, { "epoch": 2.51, "learning_rate": 9.039405766083344e-08, "logits/chosen": -0.2848908305168152, "logits/rejected": 0.14409299194812775, "logps/chosen": -228.44113159179688, "logps/rejected": -784.9698486328125, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.6778356432914734, "rewards/margins": 16.24924659729004, "rewards/rejected": -16.92708396911621, "step": 7390 }, { "epoch": 2.52, "learning_rate": 8.976457257962985e-08, "logits/chosen": -0.3242306113243103, "logits/rejected": -0.03971681743860245, "logps/chosen": -229.00735473632812, "logps/rejected": -681.0172729492188, "loss": 0.004, "rewards/accuracies": 1.0, "rewards/chosen": -0.22452864050865173, "rewards/margins": 18.557270050048828, "rewards/rejected": -18.781797409057617, "step": 7400 }, { "epoch": 2.52, "eval_logits/chosen": -0.29589325189590454, "eval_logits/rejected": 0.21939413249492645, "eval_logps/chosen": -275.9388732910156, "eval_logps/rejected": -642.3609619140625, "eval_loss": 0.006675275973975658, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.1457235962152481, "eval_rewards/margins": 19.080707550048828, "eval_rewards/rejected": -19.226430892944336, "eval_runtime": 518.0889, "eval_samples_per_second": 18.337, "eval_steps_per_second": 0.573, "step": 7400 }, { "epoch": 2.52, "learning_rate": 8.913508749842629e-08, "logits/chosen": -0.3154430389404297, "logits/rejected": 0.20277602970600128, "logps/chosen": -222.65713500976562, "logps/rejected": -617.011962890625, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.161592036485672, "rewards/margins": 18.85121726989746, "rewards/rejected": -19.01280975341797, "step": 7410 }, { "epoch": 2.52, "learning_rate": 8.850560241722271e-08, "logits/chosen": -0.3136681914329529, "logits/rejected": 0.07410818338394165, "logps/chosen": -295.84539794921875, "logps/rejected": -652.9530029296875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.6224571466445923, "rewards/margins": 18.904088973999023, "rewards/rejected": -19.526546478271484, "step": 7420 }, { "epoch": 2.53, "learning_rate": 8.787611733601913e-08, "logits/chosen": -0.45588821172714233, "logits/rejected": 0.28531765937805176, "logps/chosen": -414.3863220214844, "logps/rejected": -461.6678771972656, "loss": 0.0055, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.2732238173484802, "rewards/margins": 20.246780395507812, "rewards/rejected": -20.520004272460938, "step": 7430 }, { "epoch": 2.53, "learning_rate": 8.724663225481556e-08, "logits/chosen": -0.3600725531578064, "logits/rejected": 0.24953369796276093, "logps/chosen": -277.8388977050781, "logps/rejected": -779.6924438476562, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.4963870942592621, "rewards/margins": 20.214771270751953, "rewards/rejected": -19.7183837890625, "step": 7440 }, { "epoch": 2.53, "learning_rate": 8.6617147173612e-08, "logits/chosen": -0.3309049606323242, "logits/rejected": 0.2283121645450592, "logps/chosen": -227.34176635742188, "logps/rejected": -754.4185791015625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.29490751028060913, "rewards/margins": 18.109058380126953, "rewards/rejected": -18.403966903686523, "step": 7450 }, { "epoch": 2.54, "learning_rate": 8.59876620924084e-08, "logits/chosen": -0.38397377729415894, "logits/rejected": 0.13029304146766663, "logps/chosen": -424.2220764160156, "logps/rejected": -688.549560546875, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.22794663906097412, "rewards/margins": 19.047555923461914, "rewards/rejected": -18.81960678100586, "step": 7460 }, { "epoch": 2.54, "learning_rate": 8.535817701120483e-08, "logits/chosen": -0.2966124415397644, "logits/rejected": 0.09680347889661789, "logps/chosen": -238.99606323242188, "logps/rejected": -828.2506103515625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": 0.26800960302352905, "rewards/margins": 18.93511962890625, "rewards/rejected": -18.667110443115234, "step": 7470 }, { "epoch": 2.54, "learning_rate": 8.472869193000126e-08, "logits/chosen": -0.45557960867881775, "logits/rejected": 0.24650350213050842, "logps/chosen": -445.41668701171875, "logps/rejected": -653.29931640625, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": 0.06937725841999054, "rewards/margins": 19.05282974243164, "rewards/rejected": -18.98345375061035, "step": 7480 }, { "epoch": 2.55, "learning_rate": 8.409920684879767e-08, "logits/chosen": -0.3966844975948334, "logits/rejected": 0.11668910831212997, "logps/chosen": -284.5379943847656, "logps/rejected": -637.3410034179688, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.08849454671144485, "rewards/margins": 18.085834503173828, "rewards/rejected": -17.997339248657227, "step": 7490 }, { "epoch": 2.55, "learning_rate": 8.346972176759411e-08, "logits/chosen": -0.28426432609558105, "logits/rejected": 0.15019795298576355, "logps/chosen": -397.32476806640625, "logps/rejected": -691.5030517578125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.01955687440931797, "rewards/margins": 19.193527221679688, "rewards/rejected": -19.213085174560547, "step": 7500 }, { "epoch": 2.55, "eval_logits/chosen": -0.3002593219280243, "eval_logits/rejected": 0.2398287057876587, "eval_logps/chosen": -274.5811767578125, "eval_logps/rejected": -636.1057739257812, "eval_loss": 0.005634963512420654, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.009955662302672863, "eval_rewards/margins": 18.5909481048584, "eval_rewards/rejected": -18.600902557373047, "eval_runtime": 519.0159, "eval_samples_per_second": 18.304, "eval_steps_per_second": 0.572, "step": 7500 }, { "epoch": 2.55, "learning_rate": 8.284023668639053e-08, "logits/chosen": -0.3423105776309967, "logits/rejected": 0.09113410115242004, "logps/chosen": -392.9249572753906, "logps/rejected": -522.718017578125, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.12955355644226074, "rewards/margins": 15.672220230102539, "rewards/rejected": -15.801773071289062, "step": 7510 }, { "epoch": 2.56, "learning_rate": 8.221075160518695e-08, "logits/chosen": -0.33970439434051514, "logits/rejected": 0.006451125256717205, "logps/chosen": -327.6536865234375, "logps/rejected": -587.7950439453125, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": 0.1017824187874794, "rewards/margins": 15.696223258972168, "rewards/rejected": -15.594439506530762, "step": 7520 }, { "epoch": 2.56, "learning_rate": 8.158126652398338e-08, "logits/chosen": -0.4362737238407135, "logits/rejected": 0.21203747391700745, "logps/chosen": -343.24505615234375, "logps/rejected": -465.859619140625, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": 0.31137630343437195, "rewards/margins": 18.133377075195312, "rewards/rejected": -17.822002410888672, "step": 7530 }, { "epoch": 2.56, "learning_rate": 8.09517814427798e-08, "logits/chosen": -0.37612515687942505, "logits/rejected": 0.14953847229480743, "logps/chosen": -212.30075073242188, "logps/rejected": -621.3903198242188, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.20880427956581116, "rewards/margins": 20.724872589111328, "rewards/rejected": -20.516071319580078, "step": 7540 }, { "epoch": 2.57, "learning_rate": 8.032229636157622e-08, "logits/chosen": -0.2706584334373474, "logits/rejected": 0.14884339272975922, "logps/chosen": -205.06982421875, "logps/rejected": -732.9454345703125, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.5632843971252441, "rewards/margins": 17.477998733520508, "rewards/rejected": -18.041284561157227, "step": 7550 }, { "epoch": 2.57, "learning_rate": 7.969281128037266e-08, "logits/chosen": -0.34731918573379517, "logits/rejected": 0.14705519378185272, "logps/chosen": -227.9785614013672, "logps/rejected": -575.3389892578125, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.22708649933338165, "rewards/margins": 19.281150817871094, "rewards/rejected": -19.508237838745117, "step": 7560 }, { "epoch": 2.57, "learning_rate": 7.906332619916907e-08, "logits/chosen": -0.27502918243408203, "logits/rejected": 0.12634186446666718, "logps/chosen": -302.40643310546875, "logps/rejected": -657.5842895507812, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -0.13008829951286316, "rewards/margins": 18.881595611572266, "rewards/rejected": -19.011686325073242, "step": 7570 }, { "epoch": 2.58, "learning_rate": 7.84338411179655e-08, "logits/chosen": -0.33784133195877075, "logits/rejected": 0.28256258368492126, "logps/chosen": -318.814453125, "logps/rejected": -618.578125, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.085172139108181, "rewards/margins": 19.453466415405273, "rewards/rejected": -19.538637161254883, "step": 7580 }, { "epoch": 2.58, "learning_rate": 7.780435603676193e-08, "logits/chosen": -0.31481438875198364, "logits/rejected": 0.207257941365242, "logps/chosen": -213.3035430908203, "logps/rejected": -643.3805541992188, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -0.14178331196308136, "rewards/margins": 21.01847267150879, "rewards/rejected": -21.160259246826172, "step": 7590 }, { "epoch": 2.58, "learning_rate": 7.717487095555835e-08, "logits/chosen": -0.31889837980270386, "logits/rejected": 0.18951822817325592, "logps/chosen": -235.14743041992188, "logps/rejected": -700.6536254882812, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.4250165820121765, "rewards/margins": 19.89285659790039, "rewards/rejected": -20.317873001098633, "step": 7600 }, { "epoch": 2.58, "eval_logits/chosen": -0.29652124643325806, "eval_logits/rejected": 0.22154194116592407, "eval_logps/chosen": -275.9655456542969, "eval_logps/rejected": -643.21044921875, "eval_loss": 0.005692795384675264, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.14839214086532593, "eval_rewards/margins": 19.162981033325195, "eval_rewards/rejected": -19.31137466430664, "eval_runtime": 520.5468, "eval_samples_per_second": 18.25, "eval_steps_per_second": 0.571, "step": 7600 }, { "epoch": 2.59, "learning_rate": 7.654538587435477e-08, "logits/chosen": -0.3244675397872925, "logits/rejected": 0.12065118551254272, "logps/chosen": -242.30868530273438, "logps/rejected": -608.1856689453125, "loss": 0.0025, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.12762486934661865, "rewards/margins": 18.923259735107422, "rewards/rejected": -18.795637130737305, "step": 7610 }, { "epoch": 2.59, "learning_rate": 7.591590079315121e-08, "logits/chosen": -0.3614066243171692, "logits/rejected": 0.11683876812458038, "logps/chosen": -276.7565002441406, "logps/rejected": -626.7915649414062, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.33749982714653015, "rewards/margins": 18.07803726196289, "rewards/rejected": -18.415538787841797, "step": 7620 }, { "epoch": 2.59, "learning_rate": 7.528641571194762e-08, "logits/chosen": -0.312629371881485, "logits/rejected": -0.0049002161249518394, "logps/chosen": -212.4114227294922, "logps/rejected": -772.835205078125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.08289633691310883, "rewards/margins": 18.966411590576172, "rewards/rejected": -19.0493106842041, "step": 7630 }, { "epoch": 2.6, "learning_rate": 7.465693063074405e-08, "logits/chosen": -0.3716451823711395, "logits/rejected": 0.2384670227766037, "logps/chosen": -304.5723876953125, "logps/rejected": -641.0076904296875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.7929674386978149, "rewards/margins": 20.201087951660156, "rewards/rejected": -20.99405860900879, "step": 7640 }, { "epoch": 2.6, "learning_rate": 7.402744554954048e-08, "logits/chosen": -0.37695926427841187, "logits/rejected": 0.13788816332817078, "logps/chosen": -290.6534729003906, "logps/rejected": -555.81591796875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.1566406488418579, "rewards/margins": 19.207292556762695, "rewards/rejected": -19.050649642944336, "step": 7650 }, { "epoch": 2.6, "learning_rate": 7.33979604683369e-08, "logits/chosen": -0.3401496112346649, "logits/rejected": 0.10450832545757294, "logps/chosen": -295.5526123046875, "logps/rejected": -546.8719482421875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.14801934361457825, "rewards/margins": 16.521982192993164, "rewards/rejected": -16.670001983642578, "step": 7660 }, { "epoch": 2.61, "learning_rate": 7.276847538713332e-08, "logits/chosen": -0.37646132707595825, "logits/rejected": 0.16392526030540466, "logps/chosen": -314.00189208984375, "logps/rejected": -570.8324584960938, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.13823597133159637, "rewards/margins": 19.1723690032959, "rewards/rejected": -19.310604095458984, "step": 7670 }, { "epoch": 2.61, "learning_rate": 7.213899030592976e-08, "logits/chosen": -0.39041608572006226, "logits/rejected": 0.2614973187446594, "logps/chosen": -307.13848876953125, "logps/rejected": -496.443603515625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.17635855078697205, "rewards/margins": 15.533548355102539, "rewards/rejected": -15.709907531738281, "step": 7680 }, { "epoch": 2.61, "learning_rate": 7.150950522472617e-08, "logits/chosen": -0.3218209147453308, "logits/rejected": 0.11713643372058868, "logps/chosen": -292.4608154296875, "logps/rejected": -535.8592529296875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": 0.3582201600074768, "rewards/margins": 20.062313079833984, "rewards/rejected": -19.704092025756836, "step": 7690 }, { "epoch": 2.62, "learning_rate": 7.088002014352259e-08, "logits/chosen": -0.3947954773902893, "logits/rejected": 0.20336881279945374, "logps/chosen": -367.61224365234375, "logps/rejected": -592.3577880859375, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": 0.16297319531440735, "rewards/margins": 18.450090408325195, "rewards/rejected": -18.287113189697266, "step": 7700 }, { "epoch": 2.62, "eval_logits/chosen": -0.30521494150161743, "eval_logits/rejected": 0.22098080813884735, "eval_logps/chosen": -276.05517578125, "eval_logps/rejected": -642.49462890625, "eval_loss": 0.005143594928085804, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.157354936003685, "eval_rewards/margins": 19.082447052001953, "eval_rewards/rejected": -19.23979949951172, "eval_runtime": 520.8189, "eval_samples_per_second": 18.241, "eval_steps_per_second": 0.57, "step": 7700 }, { "epoch": 2.62, "learning_rate": 7.025053506231903e-08, "logits/chosen": -0.3793858289718628, "logits/rejected": 0.21426954865455627, "logps/chosen": -264.5860900878906, "logps/rejected": -478.255615234375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.04093002527952194, "rewards/margins": 20.84441375732422, "rewards/rejected": -20.885345458984375, "step": 7710 }, { "epoch": 2.62, "learning_rate": 6.962104998111543e-08, "logits/chosen": -0.42135709524154663, "logits/rejected": 0.13730263710021973, "logps/chosen": -249.4564971923828, "logps/rejected": -578.6167602539062, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": 0.08852307498455048, "rewards/margins": 19.58481216430664, "rewards/rejected": -19.496288299560547, "step": 7720 }, { "epoch": 2.63, "learning_rate": 6.899156489991187e-08, "logits/chosen": -0.40930598974227905, "logits/rejected": 0.1835886687040329, "logps/chosen": -226.67013549804688, "logps/rejected": -628.182373046875, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.008288180455565453, "rewards/margins": 21.340389251708984, "rewards/rejected": -21.348678588867188, "step": 7730 }, { "epoch": 2.63, "learning_rate": 6.83620798187083e-08, "logits/chosen": -0.41553354263305664, "logits/rejected": 0.23179876804351807, "logps/chosen": -353.7486877441406, "logps/rejected": -522.2650146484375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -0.13907566666603088, "rewards/margins": 18.291526794433594, "rewards/rejected": -18.430601119995117, "step": 7740 }, { "epoch": 2.63, "learning_rate": 6.773259473750472e-08, "logits/chosen": -0.3328774571418762, "logits/rejected": 0.021827470511198044, "logps/chosen": -209.7210693359375, "logps/rejected": -583.0328979492188, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.07647516578435898, "rewards/margins": 19.715879440307617, "rewards/rejected": -19.792356491088867, "step": 7750 }, { "epoch": 2.64, "learning_rate": 6.710310965630114e-08, "logits/chosen": -0.3538828492164612, "logits/rejected": 0.13174602389335632, "logps/chosen": -467.94757080078125, "logps/rejected": -667.637451171875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.12718422710895538, "rewards/margins": 18.189077377319336, "rewards/rejected": -18.316265106201172, "step": 7760 }, { "epoch": 2.64, "learning_rate": 6.647362457509758e-08, "logits/chosen": -0.35735079646110535, "logits/rejected": 0.08711853623390198, "logps/chosen": -275.4328308105469, "logps/rejected": -513.7911376953125, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.1406056135892868, "rewards/margins": 18.97892189025879, "rewards/rejected": -18.838314056396484, "step": 7770 }, { "epoch": 2.64, "learning_rate": 6.584413949389398e-08, "logits/chosen": -0.32199621200561523, "logits/rejected": 0.10628409683704376, "logps/chosen": -352.6533203125, "logps/rejected": -692.7146606445312, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.3309893012046814, "rewards/margins": 17.752674102783203, "rewards/rejected": -18.083662033081055, "step": 7780 }, { "epoch": 2.65, "learning_rate": 6.521465441269042e-08, "logits/chosen": -0.3177502751350403, "logits/rejected": 0.059540171176195145, "logps/chosen": -339.9471435546875, "logps/rejected": -737.2791137695312, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.11908113956451416, "rewards/margins": 18.673648834228516, "rewards/rejected": -18.554569244384766, "step": 7790 }, { "epoch": 2.65, "learning_rate": 6.458516933148684e-08, "logits/chosen": -0.33347851037979126, "logits/rejected": 0.09273570030927658, "logps/chosen": -271.3387451171875, "logps/rejected": -718.3186645507812, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.056274689733982086, "rewards/margins": 19.58603286743164, "rewards/rejected": -19.52975845336914, "step": 7800 }, { "epoch": 2.65, "eval_logits/chosen": -0.3164680302143097, "eval_logits/rejected": 0.21019113063812256, "eval_logps/chosen": -275.70111083984375, "eval_logps/rejected": -645.320068359375, "eval_loss": 0.0052458783611655235, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.12195272743701935, "eval_rewards/margins": 19.4003849029541, "eval_rewards/rejected": -19.5223388671875, "eval_runtime": 518.2001, "eval_samples_per_second": 18.333, "eval_steps_per_second": 0.573, "step": 7800 }, { "epoch": 2.65, "learning_rate": 6.395568425028327e-08, "logits/chosen": -0.3239831328392029, "logits/rejected": 0.02083042822778225, "logps/chosen": -209.90469360351562, "logps/rejected": -843.744140625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.4550672471523285, "rewards/margins": 21.35092544555664, "rewards/rejected": -21.80599594116211, "step": 7810 }, { "epoch": 2.66, "learning_rate": 6.332619916907969e-08, "logits/chosen": -0.32594823837280273, "logits/rejected": 0.21867069602012634, "logps/chosen": -212.94235229492188, "logps/rejected": -647.8493041992188, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": 0.20225724577903748, "rewards/margins": 19.33119010925293, "rewards/rejected": -19.12893295288086, "step": 7820 }, { "epoch": 2.66, "learning_rate": 6.269671408787612e-08, "logits/chosen": -0.293303906917572, "logits/rejected": 0.11694443225860596, "logps/chosen": -239.568603515625, "logps/rejected": -794.8744506835938, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -0.2732515335083008, "rewards/margins": 20.00142478942871, "rewards/rejected": -20.274677276611328, "step": 7830 }, { "epoch": 2.66, "learning_rate": 6.206722900667253e-08, "logits/chosen": -0.2241629660129547, "logits/rejected": 0.05276932567358017, "logps/chosen": -298.73760986328125, "logps/rejected": -874.6536254882812, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.4223242700099945, "rewards/margins": 18.83368492126465, "rewards/rejected": -19.256010055541992, "step": 7840 }, { "epoch": 2.67, "learning_rate": 6.143774392546897e-08, "logits/chosen": -0.41373294591903687, "logits/rejected": 0.09901878982782364, "logps/chosen": -374.4974670410156, "logps/rejected": -534.8238525390625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": 0.47389525175094604, "rewards/margins": 19.81747055053711, "rewards/rejected": -19.343576431274414, "step": 7850 }, { "epoch": 2.67, "learning_rate": 6.080825884426539e-08, "logits/chosen": -0.38597095012664795, "logits/rejected": 0.41752657294273376, "logps/chosen": -290.7638854980469, "logps/rejected": -553.0989990234375, "loss": 0.004, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.46915459632873535, "rewards/margins": 19.278221130371094, "rewards/rejected": -19.74737548828125, "step": 7860 }, { "epoch": 2.68, "learning_rate": 6.017877376306182e-08, "logits/chosen": -0.3414413630962372, "logits/rejected": 0.32725396752357483, "logps/chosen": -292.5951843261719, "logps/rejected": -616.8834838867188, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.41513949632644653, "rewards/margins": 22.58523178100586, "rewards/rejected": -23.000370025634766, "step": 7870 }, { "epoch": 2.68, "learning_rate": 5.954928868185824e-08, "logits/chosen": -0.38075941801071167, "logits/rejected": 0.09170965850353241, "logps/chosen": -368.21240234375, "logps/rejected": -872.2438354492188, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.1773572862148285, "rewards/margins": 19.174217224121094, "rewards/rejected": -19.351573944091797, "step": 7880 }, { "epoch": 2.68, "learning_rate": 5.891980360065466e-08, "logits/chosen": -0.3815912902355194, "logits/rejected": 0.22565467655658722, "logps/chosen": -348.38629150390625, "logps/rejected": -455.5565490722656, "loss": 0.0055, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.7422603964805603, "rewards/margins": 18.222904205322266, "rewards/rejected": -18.965164184570312, "step": 7890 }, { "epoch": 2.69, "learning_rate": 5.8290318519451084e-08, "logits/chosen": -0.3542100787162781, "logits/rejected": 0.12144671380519867, "logps/chosen": -212.35116577148438, "logps/rejected": -505.96661376953125, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -0.16225430369377136, "rewards/margins": 17.09099578857422, "rewards/rejected": -17.253250122070312, "step": 7900 }, { "epoch": 2.69, "eval_logits/chosen": -0.30737820267677307, "eval_logits/rejected": 0.2105799913406372, "eval_logps/chosen": -275.54443359375, "eval_logps/rejected": -646.985595703125, "eval_loss": 0.005060057621449232, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.10628031194210052, "eval_rewards/margins": 19.58260726928711, "eval_rewards/rejected": -19.688886642456055, "eval_runtime": 520.6011, "eval_samples_per_second": 18.248, "eval_steps_per_second": 0.57, "step": 7900 }, { "epoch": 2.69, "learning_rate": 5.7660833438247514e-08, "logits/chosen": -0.32857850193977356, "logits/rejected": 0.2258477658033371, "logps/chosen": -294.4192810058594, "logps/rejected": -647.4607543945312, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.2987922132015228, "rewards/margins": 21.095500946044922, "rewards/rejected": -21.3942928314209, "step": 7910 }, { "epoch": 2.69, "learning_rate": 5.7031348357043937e-08, "logits/chosen": -0.27173879742622375, "logits/rejected": 0.18361282348632812, "logps/chosen": -242.94943237304688, "logps/rejected": -905.1119384765625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.16408918797969818, "rewards/margins": 21.264175415039062, "rewards/rejected": -21.428264617919922, "step": 7920 }, { "epoch": 2.7, "learning_rate": 5.640186327584036e-08, "logits/chosen": -0.3400248885154724, "logits/rejected": 0.18368971347808838, "logps/chosen": -269.14569091796875, "logps/rejected": -573.4324951171875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.3765038847923279, "rewards/margins": 20.634143829345703, "rewards/rejected": -21.01064682006836, "step": 7930 }, { "epoch": 2.7, "learning_rate": 5.577237819463679e-08, "logits/chosen": -0.3912068009376526, "logits/rejected": 0.2054048329591751, "logps/chosen": -282.5838317871094, "logps/rejected": -456.25177001953125, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.04357980564236641, "rewards/margins": 21.565093994140625, "rewards/rejected": -21.608671188354492, "step": 7940 }, { "epoch": 2.7, "learning_rate": 5.514289311343321e-08, "logits/chosen": -0.3552244305610657, "logits/rejected": 0.217352032661438, "logps/chosen": -342.662841796875, "logps/rejected": -719.7527465820312, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.06638326495885849, "rewards/margins": 21.711416244506836, "rewards/rejected": -21.77779769897461, "step": 7950 }, { "epoch": 2.71, "learning_rate": 5.4513408032229634e-08, "logits/chosen": -0.20588716864585876, "logits/rejected": 0.09880268573760986, "logps/chosen": -345.1643981933594, "logps/rejected": -815.1571044921875, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.388627827167511, "rewards/margins": 20.09332275390625, "rewards/rejected": -20.481950759887695, "step": 7960 }, { "epoch": 2.71, "learning_rate": 5.388392295102606e-08, "logits/chosen": -0.4095965027809143, "logits/rejected": 0.11131223291158676, "logps/chosen": -273.6915588378906, "logps/rejected": -469.62353515625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.03245977684855461, "rewards/margins": 18.96076011657715, "rewards/rejected": -18.993221282958984, "step": 7970 }, { "epoch": 2.71, "learning_rate": 5.3254437869822486e-08, "logits/chosen": -0.4530588686466217, "logits/rejected": 0.12834444642066956, "logps/chosen": -219.63577270507812, "logps/rejected": -557.2230224609375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.3285600244998932, "rewards/margins": 20.50228500366211, "rewards/rejected": -20.17372703552246, "step": 7980 }, { "epoch": 2.72, "learning_rate": 5.262495278861891e-08, "logits/chosen": -0.43547454476356506, "logits/rejected": 0.04858526587486267, "logps/chosen": -205.76663208007812, "logps/rejected": -451.46044921875, "loss": 0.0034, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.3461289405822754, "rewards/margins": 18.844764709472656, "rewards/rejected": -19.19089126586914, "step": 7990 }, { "epoch": 2.72, "learning_rate": 5.199546770741533e-08, "logits/chosen": -0.3200010657310486, "logits/rejected": 0.14257127046585083, "logps/chosen": -218.82876586914062, "logps/rejected": -637.85595703125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.41659897565841675, "rewards/margins": 20.195987701416016, "rewards/rejected": -20.61258316040039, "step": 8000 }, { "epoch": 2.72, "eval_logits/chosen": -0.3129103481769562, "eval_logits/rejected": 0.20285719633102417, "eval_logps/chosen": -275.8385925292969, "eval_logps/rejected": -648.4356079101562, "eval_loss": 0.005619946867227554, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.1356971114873886, "eval_rewards/margins": 19.698196411132812, "eval_rewards/rejected": -19.833892822265625, "eval_runtime": 521.3455, "eval_samples_per_second": 18.222, "eval_steps_per_second": 0.57, "step": 8000 }, { "epoch": 2.72, "learning_rate": 5.136598262621176e-08, "logits/chosen": -0.31962037086486816, "logits/rejected": 0.12100497633218765, "logps/chosen": -203.76904296875, "logps/rejected": -552.5272216796875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": 0.18714800477027893, "rewards/margins": 21.676250457763672, "rewards/rejected": -21.48910140991211, "step": 8010 }, { "epoch": 2.73, "learning_rate": 5.073649754500818e-08, "logits/chosen": -0.32648998498916626, "logits/rejected": 0.09895582497119904, "logps/chosen": -228.5251922607422, "logps/rejected": -512.4845581054688, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": 0.3383648097515106, "rewards/margins": 20.962844848632812, "rewards/rejected": -20.624479293823242, "step": 8020 }, { "epoch": 2.73, "learning_rate": 5.01070124638046e-08, "logits/chosen": -0.3331596255302429, "logits/rejected": 0.18246908485889435, "logps/chosen": -317.1623840332031, "logps/rejected": -671.98779296875, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.37407973408699036, "rewards/margins": 21.940258026123047, "rewards/rejected": -22.3143367767334, "step": 8030 }, { "epoch": 2.73, "learning_rate": 4.947752738260103e-08, "logits/chosen": -0.29692673683166504, "logits/rejected": 0.11210252344608307, "logps/chosen": -312.4285888671875, "logps/rejected": -728.0985107421875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -0.48993802070617676, "rewards/margins": 19.80002784729004, "rewards/rejected": -20.28996467590332, "step": 8040 }, { "epoch": 2.74, "learning_rate": 4.884804230139745e-08, "logits/chosen": -0.25822713971138, "logits/rejected": -0.031956903636455536, "logps/chosen": -212.9768524169922, "logps/rejected": -885.8753662109375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": 0.28321021795272827, "rewards/margins": 19.403011322021484, "rewards/rejected": -19.119800567626953, "step": 8050 }, { "epoch": 2.74, "learning_rate": 4.8218557220193875e-08, "logits/chosen": -0.3647927939891815, "logits/rejected": 0.24535784125328064, "logps/chosen": -505.6732482910156, "logps/rejected": -658.9633178710938, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.3358546793460846, "rewards/margins": 17.331623077392578, "rewards/rejected": -17.667476654052734, "step": 8060 }, { "epoch": 2.74, "learning_rate": 4.7589072138990305e-08, "logits/chosen": -0.38610634207725525, "logits/rejected": 0.15628768503665924, "logps/chosen": -270.6438903808594, "logps/rejected": -542.3176879882812, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.31366318464279175, "rewards/margins": 19.942045211791992, "rewards/rejected": -20.25571060180664, "step": 8070 }, { "epoch": 2.75, "learning_rate": 4.695958705778673e-08, "logits/chosen": -0.3756251037120819, "logits/rejected": 0.13033072650432587, "logps/chosen": -417.3048400878906, "logps/rejected": -399.1468505859375, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.6009563207626343, "rewards/margins": 18.38619041442871, "rewards/rejected": -18.987146377563477, "step": 8080 }, { "epoch": 2.75, "learning_rate": 4.633010197658315e-08, "logits/chosen": -0.3574756979942322, "logits/rejected": -0.017300555482506752, "logps/chosen": -298.0933837890625, "logps/rejected": -525.4617309570312, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -0.06970946490764618, "rewards/margins": 16.755199432373047, "rewards/rejected": -16.824907302856445, "step": 8090 }, { "epoch": 2.75, "learning_rate": 4.570061689537958e-08, "logits/chosen": -0.42838984727859497, "logits/rejected": 0.08460330218076706, "logps/chosen": -444.3690490722656, "logps/rejected": -619.1876831054688, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.11683769524097443, "rewards/margins": 18.930652618408203, "rewards/rejected": -19.047489166259766, "step": 8100 }, { "epoch": 2.75, "eval_logits/chosen": -0.31680235266685486, "eval_logits/rejected": 0.1993274986743927, "eval_logps/chosen": -275.505615234375, "eval_logps/rejected": -646.5726318359375, "eval_loss": 0.004957031924277544, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.10240332782268524, "eval_rewards/margins": 19.545196533203125, "eval_rewards/rejected": -19.647598266601562, "eval_runtime": 520.6383, "eval_samples_per_second": 18.247, "eval_steps_per_second": 0.57, "step": 8100 }, { "epoch": 2.76, "learning_rate": 4.5071131814176e-08, "logits/chosen": -0.3674335181713104, "logits/rejected": -0.02518102526664734, "logps/chosen": -356.11041259765625, "logps/rejected": -645.454833984375, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.24242766201496124, "rewards/margins": 17.444467544555664, "rewards/rejected": -17.6868953704834, "step": 8110 }, { "epoch": 2.76, "learning_rate": 4.4441646732972425e-08, "logits/chosen": -0.29900816082954407, "logits/rejected": 0.2449352741241455, "logps/chosen": -224.79736328125, "logps/rejected": -846.6522216796875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.13536189496517181, "rewards/margins": 20.748428344726562, "rewards/rejected": -20.883792877197266, "step": 8120 }, { "epoch": 2.76, "learning_rate": 4.3812161651768855e-08, "logits/chosen": -0.35351595282554626, "logits/rejected": 0.2057342231273651, "logps/chosen": -226.8794708251953, "logps/rejected": -623.8216552734375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.2995452284812927, "rewards/margins": 22.641799926757812, "rewards/rejected": -22.34225845336914, "step": 8130 }, { "epoch": 2.77, "learning_rate": 4.318267657056528e-08, "logits/chosen": -0.4926691949367523, "logits/rejected": -0.04400800168514252, "logps/chosen": -308.2256164550781, "logps/rejected": -526.2093505859375, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.45002955198287964, "rewards/margins": 16.534055709838867, "rewards/rejected": -16.984086990356445, "step": 8140 }, { "epoch": 2.77, "learning_rate": 4.25531914893617e-08, "logits/chosen": -0.35863009095191956, "logits/rejected": 0.01549004577100277, "logps/chosen": -229.8787384033203, "logps/rejected": -733.0128173828125, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.05192599818110466, "rewards/margins": 21.5626163482666, "rewards/rejected": -21.61454200744629, "step": 8150 }, { "epoch": 2.77, "learning_rate": 4.192370640815812e-08, "logits/chosen": -0.455331951379776, "logits/rejected": 0.1577502191066742, "logps/chosen": -260.69903564453125, "logps/rejected": -531.1951904296875, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -0.26856669783592224, "rewards/margins": 18.737468719482422, "rewards/rejected": -19.00603675842285, "step": 8160 }, { "epoch": 2.78, "learning_rate": 4.129422132695455e-08, "logits/chosen": -0.3866944909095764, "logits/rejected": 0.15424861013889313, "logps/chosen": -277.83734130859375, "logps/rejected": -505.60723876953125, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -0.06852970272302628, "rewards/margins": 16.509437561035156, "rewards/rejected": -16.577966690063477, "step": 8170 }, { "epoch": 2.78, "learning_rate": 4.0664736245750975e-08, "logits/chosen": -0.3169029653072357, "logits/rejected": 0.1601024717092514, "logps/chosen": -223.6556854248047, "logps/rejected": -519.9677734375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 0.16406314074993134, "rewards/margins": 20.55518913269043, "rewards/rejected": -20.391128540039062, "step": 8180 }, { "epoch": 2.78, "learning_rate": 4.00352511645474e-08, "logits/chosen": -0.43145766854286194, "logits/rejected": 0.08270768821239471, "logps/chosen": -300.02239990234375, "logps/rejected": -624.8916015625, "loss": 0.0052, "rewards/accuracies": 1.0, "rewards/chosen": -0.2566896975040436, "rewards/margins": 19.668794631958008, "rewards/rejected": -19.925485610961914, "step": 8190 }, { "epoch": 2.79, "learning_rate": 3.940576608334383e-08, "logits/chosen": -0.3226594924926758, "logits/rejected": 0.11796705424785614, "logps/chosen": -304.64495849609375, "logps/rejected": -616.9491577148438, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.09559761732816696, "rewards/margins": 19.691829681396484, "rewards/rejected": -19.596233367919922, "step": 8200 }, { "epoch": 2.79, "eval_logits/chosen": -0.3120296597480774, "eval_logits/rejected": 0.21520785987377167, "eval_logps/chosen": -274.8596496582031, "eval_logps/rejected": -644.1543579101562, "eval_loss": 0.005188153125345707, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.037803880870342255, "eval_rewards/margins": 19.367956161499023, "eval_rewards/rejected": -19.405759811401367, "eval_runtime": 522.0213, "eval_samples_per_second": 18.198, "eval_steps_per_second": 0.569, "step": 8200 }, { "epoch": 2.79, "learning_rate": 3.877628100214025e-08, "logits/chosen": -0.3560418486595154, "logits/rejected": 0.06806419789791107, "logps/chosen": -282.5479736328125, "logps/rejected": -645.466796875, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.26784008741378784, "rewards/margins": 20.056163787841797, "rewards/rejected": -19.788326263427734, "step": 8210 }, { "epoch": 2.79, "learning_rate": 3.814679592093667e-08, "logits/chosen": -0.3281356394290924, "logits/rejected": 0.20315146446228027, "logps/chosen": -342.67926025390625, "logps/rejected": -869.8928833007812, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.1843252032995224, "rewards/margins": 22.211393356323242, "rewards/rejected": -22.027070999145508, "step": 8220 }, { "epoch": 2.8, "learning_rate": 3.75173108397331e-08, "logits/chosen": -0.29701095819473267, "logits/rejected": 0.15621411800384521, "logps/chosen": -263.4089050292969, "logps/rejected": -756.075927734375, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": 0.017467346042394638, "rewards/margins": 20.204809188842773, "rewards/rejected": -20.187341690063477, "step": 8230 }, { "epoch": 2.8, "learning_rate": 3.688782575852952e-08, "logits/chosen": -0.4283333718776703, "logits/rejected": -0.054701946675777435, "logps/chosen": -202.96438598632812, "logps/rejected": -468.15936279296875, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": 0.26011061668395996, "rewards/margins": 17.522605895996094, "rewards/rejected": -17.262493133544922, "step": 8240 }, { "epoch": 2.8, "learning_rate": 3.625834067732594e-08, "logits/chosen": -0.37352633476257324, "logits/rejected": 0.1422167271375656, "logps/chosen": -348.8667907714844, "logps/rejected": -660.7919921875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.1935591995716095, "rewards/margins": 20.00636863708496, "rewards/rejected": -20.199926376342773, "step": 8250 }, { "epoch": 2.81, "learning_rate": 3.562885559612237e-08, "logits/chosen": -0.34707531332969666, "logits/rejected": 0.034444861114025116, "logps/chosen": -318.2336120605469, "logps/rejected": -729.9231567382812, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": 0.0894518494606018, "rewards/margins": 20.152185440063477, "rewards/rejected": -20.062734603881836, "step": 8260 }, { "epoch": 2.81, "learning_rate": 3.499937051491879e-08, "logits/chosen": -0.316379189491272, "logits/rejected": 0.19939155876636505, "logps/chosen": -213.73635864257812, "logps/rejected": -590.4732055664062, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.15837104618549347, "rewards/margins": 21.57001304626465, "rewards/rejected": -21.7283878326416, "step": 8270 }, { "epoch": 2.81, "learning_rate": 3.4369885433715216e-08, "logits/chosen": -0.38096386194229126, "logits/rejected": 0.14628683030605316, "logps/chosen": -200.47122192382812, "logps/rejected": -450.35430908203125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.18745212256908417, "rewards/margins": 16.808597564697266, "rewards/rejected": -16.621143341064453, "step": 8280 }, { "epoch": 2.82, "learning_rate": 3.3740400352511645e-08, "logits/chosen": -0.23089143633842468, "logits/rejected": 0.09957318007946014, "logps/chosen": -198.8109130859375, "logps/rejected": -884.43994140625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.0862189382314682, "rewards/margins": 19.718753814697266, "rewards/rejected": -19.632532119750977, "step": 8290 }, { "epoch": 2.82, "learning_rate": 3.311091527130807e-08, "logits/chosen": -0.38890451192855835, "logits/rejected": 0.09934793412685394, "logps/chosen": -235.9651641845703, "logps/rejected": -462.19537353515625, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.18240725994110107, "rewards/margins": 18.536117553710938, "rewards/rejected": -18.353710174560547, "step": 8300 }, { "epoch": 2.82, "eval_logits/chosen": -0.31677719950675964, "eval_logits/rejected": 0.2093045860528946, "eval_logps/chosen": -275.031005859375, "eval_logps/rejected": -642.6404418945312, "eval_loss": 0.004932201001793146, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.05494037643074989, "eval_rewards/margins": 19.19943618774414, "eval_rewards/rejected": -19.254375457763672, "eval_runtime": 520.3933, "eval_samples_per_second": 18.255, "eval_steps_per_second": 0.571, "step": 8300 }, { "epoch": 2.82, "learning_rate": 3.248143019010449e-08, "logits/chosen": -0.37529391050338745, "logits/rejected": 0.09888962656259537, "logps/chosen": -201.05174255371094, "logps/rejected": -693.7867431640625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -0.15759918093681335, "rewards/margins": 18.00632095336914, "rewards/rejected": -18.16391944885254, "step": 8310 }, { "epoch": 2.83, "learning_rate": 3.1851945108900914e-08, "logits/chosen": -0.3092917799949646, "logits/rejected": 0.03142751008272171, "logps/chosen": -254.9937286376953, "logps/rejected": -721.5255126953125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.13655634224414825, "rewards/margins": 19.118785858154297, "rewards/rejected": -19.25534439086914, "step": 8320 }, { "epoch": 2.83, "learning_rate": 3.122246002769734e-08, "logits/chosen": -0.3670196533203125, "logits/rejected": 0.10763716697692871, "logps/chosen": -198.99435424804688, "logps/rejected": -566.072021484375, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": 0.18733423948287964, "rewards/margins": 20.038074493408203, "rewards/rejected": -19.850740432739258, "step": 8330 }, { "epoch": 2.83, "learning_rate": 3.0592974946493766e-08, "logits/chosen": -0.4142381250858307, "logits/rejected": 0.14496204257011414, "logps/chosen": -261.28936767578125, "logps/rejected": -506.35943603515625, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.40061455965042114, "rewards/margins": 18.915958404541016, "rewards/rejected": -18.515344619750977, "step": 8340 }, { "epoch": 2.84, "learning_rate": 2.996348986529019e-08, "logits/chosen": -0.39297693967819214, "logits/rejected": 0.15535160899162292, "logps/chosen": -364.88507080078125, "logps/rejected": -517.7476196289062, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -0.0554676428437233, "rewards/margins": 17.928203582763672, "rewards/rejected": -17.98366928100586, "step": 8350 }, { "epoch": 2.84, "learning_rate": 2.9334004784086618e-08, "logits/chosen": -0.35367444157600403, "logits/rejected": 0.02894691191613674, "logps/chosen": -235.10122680664062, "logps/rejected": -707.2842407226562, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": 0.033696383237838745, "rewards/margins": 21.31123161315918, "rewards/rejected": -21.27753448486328, "step": 8360 }, { "epoch": 2.85, "learning_rate": 2.870451970288304e-08, "logits/chosen": -0.29045599699020386, "logits/rejected": 0.176130473613739, "logps/chosen": -346.58905029296875, "logps/rejected": -733.6036987304688, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -0.3789774775505066, "rewards/margins": 17.719711303710938, "rewards/rejected": -18.09868812561035, "step": 8370 }, { "epoch": 2.85, "learning_rate": 2.8075034621679467e-08, "logits/chosen": -0.453036367893219, "logits/rejected": 0.16874201595783234, "logps/chosen": -412.557373046875, "logps/rejected": -395.22662353515625, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": 0.24765856564044952, "rewards/margins": 16.626035690307617, "rewards/rejected": -16.378376007080078, "step": 8380 }, { "epoch": 2.85, "learning_rate": 2.744554954047589e-08, "logits/chosen": -0.33247411251068115, "logits/rejected": 0.06772321462631226, "logps/chosen": -211.9908447265625, "logps/rejected": -642.5079345703125, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.26629772782325745, "rewards/margins": 19.931400299072266, "rewards/rejected": -20.19769859313965, "step": 8390 }, { "epoch": 2.86, "learning_rate": 2.6816064459272312e-08, "logits/chosen": -0.34216535091400146, "logits/rejected": 0.061750076711177826, "logps/chosen": -409.92681884765625, "logps/rejected": -685.1072998046875, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.10784037411212921, "rewards/margins": 19.183700561523438, "rewards/rejected": -19.291540145874023, "step": 8400 }, { "epoch": 2.86, "eval_logits/chosen": -0.3181983232498169, "eval_logits/rejected": 0.20570039749145508, "eval_logps/chosen": -275.699951171875, "eval_logps/rejected": -642.7105712890625, "eval_loss": 0.005119685549288988, "eval_rewards/accuracies": 0.9983165264129639, "eval_rewards/chosen": -0.12183143943548203, "eval_rewards/margins": 19.139554977416992, "eval_rewards/rejected": -19.261384963989258, "eval_runtime": 521.8617, "eval_samples_per_second": 18.204, "eval_steps_per_second": 0.569, "step": 8400 }, { "epoch": 2.86, "learning_rate": 2.618657937806874e-08, "logits/chosen": -0.3654525578022003, "logits/rejected": 0.12093323469161987, "logps/chosen": -286.61712646484375, "logps/rejected": -592.947265625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -0.2557828426361084, "rewards/margins": 18.735933303833008, "rewards/rejected": -18.991714477539062, "step": 8410 }, { "epoch": 2.86, "learning_rate": 2.555709429686516e-08, "logits/chosen": -0.3695344924926758, "logits/rejected": 0.04543386399745941, "logps/chosen": -367.65216064453125, "logps/rejected": -636.5623168945312, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": 0.41608715057373047, "rewards/margins": 19.689119338989258, "rewards/rejected": -19.273035049438477, "step": 8420 }, { "epoch": 2.87, "learning_rate": 2.4927609215661587e-08, "logits/chosen": -0.38328441977500916, "logits/rejected": 0.12453804910182953, "logps/chosen": -221.94503784179688, "logps/rejected": -465.61785888671875, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -0.29536086320877075, "rewards/margins": 19.375661849975586, "rewards/rejected": -19.671024322509766, "step": 8430 }, { "epoch": 2.87, "learning_rate": 2.4298124134458013e-08, "logits/chosen": -0.20946168899536133, "logits/rejected": 0.127980038523674, "logps/chosen": -224.0953369140625, "logps/rejected": -699.3449096679688, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.35831114649772644, "rewards/margins": 19.10135269165039, "rewards/rejected": -19.45966339111328, "step": 8440 }, { "epoch": 2.87, "learning_rate": 2.3668639053254436e-08, "logits/chosen": -0.355962336063385, "logits/rejected": 0.1826200634241104, "logps/chosen": -220.44229125976562, "logps/rejected": -482.5506896972656, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.46060651540756226, "rewards/margins": 17.281997680664062, "rewards/rejected": -17.742603302001953, "step": 8450 }, { "epoch": 2.88, "learning_rate": 2.3039153972050862e-08, "logits/chosen": -0.2795206606388092, "logits/rejected": 0.15407665073871613, "logps/chosen": -286.06536865234375, "logps/rejected": -841.4010620117188, "loss": 0.0043, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.13345026969909668, "rewards/margins": 20.240093231201172, "rewards/rejected": -20.373544692993164, "step": 8460 }, { "epoch": 2.88, "learning_rate": 2.2409668890847285e-08, "logits/chosen": -0.2861042618751526, "logits/rejected": 0.06750325113534927, "logps/chosen": -222.812744140625, "logps/rejected": -620.4591064453125, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": 0.09700587391853333, "rewards/margins": 18.216859817504883, "rewards/rejected": -18.119853973388672, "step": 8470 }, { "epoch": 2.88, "learning_rate": 2.178018380964371e-08, "logits/chosen": -0.2719668745994568, "logits/rejected": 0.12401912361383438, "logps/chosen": -291.56719970703125, "logps/rejected": -690.18896484375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.04626452550292015, "rewards/margins": 21.546207427978516, "rewards/rejected": -21.59246826171875, "step": 8480 }, { "epoch": 2.89, "learning_rate": 2.1150698728440137e-08, "logits/chosen": -0.3396082818508148, "logits/rejected": 0.08116491883993149, "logps/chosen": -265.47479248046875, "logps/rejected": -598.5577392578125, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": 0.4892120361328125, "rewards/margins": 20.683177947998047, "rewards/rejected": -20.193965911865234, "step": 8490 }, { "epoch": 2.89, "learning_rate": 2.052121364723656e-08, "logits/chosen": -0.3692508637905121, "logits/rejected": 0.14342454075813293, "logps/chosen": -217.41043090820312, "logps/rejected": -579.2731323242188, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": -0.23894758522510529, "rewards/margins": 16.278301239013672, "rewards/rejected": -16.517250061035156, "step": 8500 }, { "epoch": 2.89, "eval_logits/chosen": -0.31505724787712097, "eval_logits/rejected": 0.20770619809627533, "eval_logps/chosen": -275.5386657714844, "eval_logps/rejected": -643.1372680664062, "eval_loss": 0.004997415468096733, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.10570650547742844, "eval_rewards/margins": 19.198347091674805, "eval_rewards/rejected": -19.304054260253906, "eval_runtime": 512.6857, "eval_samples_per_second": 18.53, "eval_steps_per_second": 0.579, "step": 8500 }, { "epoch": 2.89, "learning_rate": 1.9891728566032983e-08, "logits/chosen": -0.42552971839904785, "logits/rejected": 0.23342449963092804, "logps/chosen": -287.22637939453125, "logps/rejected": -479.0611267089844, "loss": 0.0152, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -0.25857609510421753, "rewards/margins": 19.918804168701172, "rewards/rejected": -20.177383422851562, "step": 8510 }, { "epoch": 2.9, "learning_rate": 1.926224348482941e-08, "logits/chosen": -0.33626025915145874, "logits/rejected": 0.10100536048412323, "logps/chosen": -345.96630859375, "logps/rejected": -587.537353515625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": 0.016762619838118553, "rewards/margins": 20.181842803955078, "rewards/rejected": -20.16507911682129, "step": 8520 }, { "epoch": 2.9, "learning_rate": 1.863275840362583e-08, "logits/chosen": -0.31563514471054077, "logits/rejected": 0.20301476120948792, "logps/chosen": -217.7105712890625, "logps/rejected": -785.9031982421875, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -0.47327232360839844, "rewards/margins": 20.36530876159668, "rewards/rejected": -20.838581085205078, "step": 8530 }, { "epoch": 2.9, "learning_rate": 1.8003273322422258e-08, "logits/chosen": -0.39627450704574585, "logits/rejected": 0.13776658475399017, "logps/chosen": -236.48904418945312, "logps/rejected": -634.7493896484375, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.08518987149000168, "rewards/margins": 17.40428924560547, "rewards/rejected": -17.48948097229004, "step": 8540 }, { "epoch": 2.91, "learning_rate": 1.737378824121868e-08, "logits/chosen": -0.3786807358264923, "logits/rejected": 0.1269473284482956, "logps/chosen": -215.30636596679688, "logps/rejected": -641.5479736328125, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -0.5637613534927368, "rewards/margins": 19.29054832458496, "rewards/rejected": -19.854310989379883, "step": 8550 }, { "epoch": 2.91, "learning_rate": 1.6744303160015107e-08, "logits/chosen": -0.33440008759498596, "logits/rejected": 0.19664843380451202, "logps/chosen": -291.2057189941406, "logps/rejected": -608.4605712890625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.5373510122299194, "rewards/margins": 18.95590591430664, "rewards/rejected": -19.49325942993164, "step": 8560 }, { "epoch": 2.91, "learning_rate": 1.6114818078811533e-08, "logits/chosen": -0.34686657786369324, "logits/rejected": 0.0658293291926384, "logps/chosen": -296.5673522949219, "logps/rejected": -744.0396728515625, "loss": 0.0016, "rewards/accuracies": 1.0, "rewards/chosen": -0.11130708456039429, "rewards/margins": 18.271610260009766, "rewards/rejected": -18.382917404174805, "step": 8570 }, { "epoch": 2.92, "learning_rate": 1.5485332997607955e-08, "logits/chosen": -0.2781277000904083, "logits/rejected": 0.11952020227909088, "logps/chosen": -208.7974853515625, "logps/rejected": -607.1897583007812, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -0.03477661684155464, "rewards/margins": 19.4646053314209, "rewards/rejected": -19.49938201904297, "step": 8580 }, { "epoch": 2.92, "learning_rate": 1.485584791640438e-08, "logits/chosen": -0.3437816798686981, "logits/rejected": 0.1654711812734604, "logps/chosen": -272.3050537109375, "logps/rejected": -650.3447875976562, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": -0.544821560382843, "rewards/margins": 18.019500732421875, "rewards/rejected": -18.564321517944336, "step": 8590 }, { "epoch": 2.92, "learning_rate": 1.4226362835200804e-08, "logits/chosen": -0.3330112099647522, "logits/rejected": 0.05484815686941147, "logps/chosen": -222.2970733642578, "logps/rejected": -667.8601684570312, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": 0.08124381303787231, "rewards/margins": 17.252513885498047, "rewards/rejected": -17.171268463134766, "step": 8600 }, { "epoch": 2.92, "eval_logits/chosen": -0.3207896053791046, "eval_logits/rejected": 0.2057424634695053, "eval_logps/chosen": -275.344482421875, "eval_logps/rejected": -642.9228515625, "eval_loss": 0.005201916676014662, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.08628714829683304, "eval_rewards/margins": 19.19631576538086, "eval_rewards/rejected": -19.282604217529297, "eval_runtime": 520.4931, "eval_samples_per_second": 18.252, "eval_steps_per_second": 0.571, "step": 8600 }, { "epoch": 2.93, "learning_rate": 1.3596877753997229e-08, "logits/chosen": -0.39185258746147156, "logits/rejected": 0.17406900227069855, "logps/chosen": -363.3731994628906, "logps/rejected": -504.209228515625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -0.2909688353538513, "rewards/margins": 21.337976455688477, "rewards/rejected": -21.628948211669922, "step": 8610 }, { "epoch": 2.93, "learning_rate": 1.2967392672793655e-08, "logits/chosen": -0.25547683238983154, "logits/rejected": 0.09424517303705215, "logps/chosen": -234.4093780517578, "logps/rejected": -930.8870239257812, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": 0.01749284192919731, "rewards/margins": 19.822769165039062, "rewards/rejected": -19.805273056030273, "step": 8620 }, { "epoch": 2.93, "learning_rate": 1.233790759159008e-08, "logits/chosen": -0.3448290228843689, "logits/rejected": 0.16567359864711761, "logps/chosen": -253.09457397460938, "logps/rejected": -631.2890014648438, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -0.22757604718208313, "rewards/margins": 20.147539138793945, "rewards/rejected": -20.3751163482666, "step": 8630 }, { "epoch": 2.94, "learning_rate": 1.1708422510386504e-08, "logits/chosen": -0.39597609639167786, "logits/rejected": 0.14166459441184998, "logps/chosen": -339.80242919921875, "logps/rejected": -623.1578369140625, "loss": 0.0019, "rewards/accuracies": 1.0, "rewards/chosen": 0.031338103115558624, "rewards/margins": 22.209585189819336, "rewards/rejected": -22.178247451782227, "step": 8640 }, { "epoch": 2.94, "learning_rate": 1.1078937429182926e-08, "logits/chosen": -0.3233053386211395, "logits/rejected": 0.03911024332046509, "logps/chosen": -360.2445068359375, "logps/rejected": -733.8052978515625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -0.25185245275497437, "rewards/margins": 17.769033432006836, "rewards/rejected": -18.020885467529297, "step": 8650 }, { "epoch": 2.94, "learning_rate": 1.0449452347979353e-08, "logits/chosen": -0.40889984369277954, "logits/rejected": 0.15022698044776917, "logps/chosen": -339.4707946777344, "logps/rejected": -632.509521484375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": 0.3711167275905609, "rewards/margins": 19.04721450805664, "rewards/rejected": -18.67609977722168, "step": 8660 }, { "epoch": 2.95, "learning_rate": 9.819967266775777e-09, "logits/chosen": -0.3529255986213684, "logits/rejected": 0.1635160744190216, "logps/chosen": -218.1907196044922, "logps/rejected": -653.00732421875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -0.23545996844768524, "rewards/margins": 19.024065017700195, "rewards/rejected": -19.259525299072266, "step": 8670 }, { "epoch": 2.95, "learning_rate": 9.190482185572201e-09, "logits/chosen": -0.29137301445007324, "logits/rejected": 0.0737558901309967, "logps/chosen": -298.0067138671875, "logps/rejected": -800.0030517578125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -0.12416990101337433, "rewards/margins": 20.053112030029297, "rewards/rejected": -20.17728042602539, "step": 8680 }, { "epoch": 2.95, "learning_rate": 8.560997104368626e-09, "logits/chosen": -0.4549427628517151, "logits/rejected": 0.18920865654945374, "logps/chosen": -278.2685852050781, "logps/rejected": -512.6693115234375, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": 0.3504260182380676, "rewards/margins": 18.3614559173584, "rewards/rejected": -18.011032104492188, "step": 8690 }, { "epoch": 2.96, "learning_rate": 7.931512023165052e-09, "logits/chosen": -0.35821977257728577, "logits/rejected": -0.0030545860063284636, "logps/chosen": -293.7069396972656, "logps/rejected": -614.9229736328125, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -0.5659575462341309, "rewards/margins": 17.639347076416016, "rewards/rejected": -18.205307006835938, "step": 8700 }, { "epoch": 2.96, "eval_logits/chosen": -0.321507066488266, "eval_logits/rejected": 0.20077452063560486, "eval_logps/chosen": -275.5925598144531, "eval_logps/rejected": -643.9338989257812, "eval_loss": 0.004812250845134258, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.1110977753996849, "eval_rewards/margins": 19.27261734008789, "eval_rewards/rejected": -19.383716583251953, "eval_runtime": 525.8506, "eval_samples_per_second": 18.066, "eval_steps_per_second": 0.565, "step": 8700 }, { "epoch": 2.96, "learning_rate": 7.3020269419614755e-09, "logits/chosen": -0.264698326587677, "logits/rejected": -0.014727388508617878, "logps/chosen": -228.2021484375, "logps/rejected": -850.4404296875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.31266480684280396, "rewards/margins": 19.323570251464844, "rewards/rejected": -19.636234283447266, "step": 8710 }, { "epoch": 2.96, "learning_rate": 6.6725418607579e-09, "logits/chosen": -0.32812461256980896, "logits/rejected": 0.1724863350391388, "logps/chosen": -305.09625244140625, "logps/rejected": -727.8049926757812, "loss": 0.0027, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.07840214669704437, "rewards/margins": 22.605356216430664, "rewards/rejected": -22.526952743530273, "step": 8720 }, { "epoch": 2.97, "learning_rate": 6.043056779554324e-09, "logits/chosen": -0.2976331412792206, "logits/rejected": 0.009159152396023273, "logps/chosen": -225.8251190185547, "logps/rejected": -823.5550537109375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -0.22012238204479218, "rewards/margins": 18.091556549072266, "rewards/rejected": -18.31167984008789, "step": 8730 }, { "epoch": 2.97, "learning_rate": 5.41357169835075e-09, "logits/chosen": -0.3259055018424988, "logits/rejected": 0.18306994438171387, "logps/chosen": -306.4685363769531, "logps/rejected": -608.3792724609375, "loss": 0.0035, "rewards/accuracies": 1.0, "rewards/chosen": -0.08833061158657074, "rewards/margins": 20.53449249267578, "rewards/rejected": -20.62282371520996, "step": 8740 }, { "epoch": 2.97, "learning_rate": 4.784086617147173e-09, "logits/chosen": -0.40282148122787476, "logits/rejected": 0.0854668840765953, "logps/chosen": -417.43707275390625, "logps/rejected": -527.4506225585938, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": 0.03364210203289986, "rewards/margins": 18.519622802734375, "rewards/rejected": -18.485979080200195, "step": 8750 }, { "epoch": 2.98, "learning_rate": 4.1546015359435984e-09, "logits/chosen": -0.39789289236068726, "logits/rejected": 0.12660741806030273, "logps/chosen": -345.6722412109375, "logps/rejected": -625.1768188476562, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": 0.07757916301488876, "rewards/margins": 19.9320011138916, "rewards/rejected": -19.854421615600586, "step": 8760 }, { "epoch": 2.98, "learning_rate": 3.5251164547400225e-09, "logits/chosen": -0.3658300042152405, "logits/rejected": 0.10114561021327972, "logps/chosen": -337.4879150390625, "logps/rejected": -659.01220703125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -0.28652843832969666, "rewards/margins": 18.766878128051758, "rewards/rejected": -19.05340576171875, "step": 8770 }, { "epoch": 2.98, "learning_rate": 2.895631373536447e-09, "logits/chosen": -0.41414403915405273, "logits/rejected": 0.11803408712148666, "logps/chosen": -273.524658203125, "logps/rejected": -469.541748046875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -0.7922652959823608, "rewards/margins": 17.880075454711914, "rewards/rejected": -18.672338485717773, "step": 8780 }, { "epoch": 2.99, "learning_rate": 2.2661462923328713e-09, "logits/chosen": -0.321903258562088, "logits/rejected": 0.13337542116641998, "logps/chosen": -280.3749084472656, "logps/rejected": -661.9573364257812, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -0.42813023924827576, "rewards/margins": 19.00562858581543, "rewards/rejected": -19.433757781982422, "step": 8790 }, { "epoch": 2.99, "learning_rate": 1.6366612111292962e-09, "logits/chosen": -0.36723294854164124, "logits/rejected": 0.1273876130580902, "logps/chosen": -227.4621124267578, "logps/rejected": -558.6692504882812, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -0.47145146131515503, "rewards/margins": 21.64960289001465, "rewards/rejected": -22.12105369567871, "step": 8800 }, { "epoch": 2.99, "eval_logits/chosen": -0.32281914353370667, "eval_logits/rejected": 0.1984868198633194, "eval_logps/chosen": -275.5667724609375, "eval_logps/rejected": -644.1723022460938, "eval_loss": 0.005038694478571415, "eval_rewards/accuracies": 0.9991582632064819, "eval_rewards/chosen": -0.1085137277841568, "eval_rewards/margins": 19.299043655395508, "eval_rewards/rejected": -19.407556533813477, "eval_runtime": 525.6476, "eval_samples_per_second": 18.073, "eval_steps_per_second": 0.565, "step": 8800 }, { "epoch": 2.99, "learning_rate": 1.0071761299257208e-09, "logits/chosen": -0.37877988815307617, "logits/rejected": 0.06428341567516327, "logps/chosen": -245.1985321044922, "logps/rejected": -657.049072265625, "loss": 0.0011, "rewards/accuracies": 1.0, "rewards/chosen": 0.26182955503463745, "rewards/margins": 20.15967559814453, "rewards/rejected": -19.897846221923828, "step": 8810 }, { "epoch": 3.0, "learning_rate": 3.7769104872214527e-10, "logits/chosen": -0.3837678134441376, "logits/rejected": 0.2529815435409546, "logps/chosen": -233.7167510986328, "logps/rejected": -504.2972717285156, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.47982874512672424, "rewards/margins": 17.965167999267578, "rewards/rejected": -18.444995880126953, "step": 8820 }, { "epoch": 3.0, "step": 8826, "total_flos": 0.0, "train_loss": 0.03813637946047515, "train_runtime": 102663.2378, "train_samples_per_second": 5.502, "train_steps_per_second": 0.086 } ], "logging_steps": 10, "max_steps": 8826, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }