{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 2000, "global_step": 4168, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0002399232245681382, "grad_norm": 20.90535270812656, "learning_rate": 1.199040767386091e-09, "logits/chosen": -0.48379573225975037, "logits/rejected": -0.48017197847366333, "logps/chosen": -250.1331329345703, "logps/rejected": -232.6839141845703, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0023992322456813818, "grad_norm": 20.316799458165775, "learning_rate": 1.199040767386091e-08, "logits/chosen": -0.4963577091693878, "logits/rejected": -0.5276286005973816, "logps/chosen": -441.6046142578125, "logps/rejected": -363.4785461425781, "loss": 0.693, "rewards/accuracies": 0.4722222089767456, "rewards/chosen": 0.0010866652010008693, "rewards/margins": 0.0004253386869095266, "rewards/rejected": 0.0006613265140913427, "step": 10 }, { "epoch": 0.0047984644913627635, "grad_norm": 20.614630611685648, "learning_rate": 2.398081534772182e-08, "logits/chosen": -0.5616664290428162, "logits/rejected": -0.5348426103591919, "logps/chosen": -311.93389892578125, "logps/rejected": -278.0029602050781, "loss": 0.6934, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00013055796443950385, "rewards/margins": 0.0002189161314163357, "rewards/rejected": -8.835792687023059e-05, "step": 20 }, { "epoch": 0.007197696737044146, "grad_norm": 19.284588490453608, "learning_rate": 3.597122302158273e-08, "logits/chosen": -0.5190974473953247, "logits/rejected": -0.5706892013549805, "logps/chosen": -319.748779296875, "logps/rejected": -331.7994689941406, "loss": 0.6931, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0010061769280582666, "rewards/margins": 0.0013537806225940585, "rewards/rejected": -0.0003476037527434528, "step": 30 }, { "epoch": 0.009596928982725527, "grad_norm": 19.442113506121437, "learning_rate": 4.796163069544364e-08, "logits/chosen": -0.5577880144119263, "logits/rejected": -0.5859715938568115, "logps/chosen": -338.12628173828125, "logps/rejected": -314.81982421875, "loss": 0.6928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00019955830066464841, "rewards/margins": -5.653758853441104e-05, "rewards/rejected": 0.000256095954682678, "step": 40 }, { "epoch": 0.01199616122840691, "grad_norm": 20.77025303650937, "learning_rate": 5.995203836930455e-08, "logits/chosen": -0.5782157182693481, "logits/rejected": -0.5549123287200928, "logps/chosen": -335.87646484375, "logps/rejected": -289.2035217285156, "loss": 0.6934, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0005626773927360773, "rewards/margins": -0.00042233389103785157, "rewards/rejected": 0.0009850109927356243, "step": 50 }, { "epoch": 0.014395393474088292, "grad_norm": 19.993953212894812, "learning_rate": 7.194244604316546e-08, "logits/chosen": -0.5433920621871948, "logits/rejected": -0.49929919838905334, "logps/chosen": -355.02740478515625, "logps/rejected": -338.33148193359375, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0006808604812249541, "rewards/margins": -0.0011384403333067894, "rewards/rejected": 0.0004575795610435307, "step": 60 }, { "epoch": 0.016794625719769675, "grad_norm": 17.979585480540507, "learning_rate": 8.393285371702638e-08, "logits/chosen": -0.49723702669143677, "logits/rejected": -0.48305654525756836, "logps/chosen": -353.8045654296875, "logps/rejected": -327.27716064453125, "loss": 0.6927, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.002497387584298849, "rewards/margins": 0.0025346879847347736, "rewards/rejected": -3.7300120311556384e-05, "step": 70 }, { "epoch": 0.019193857965451054, "grad_norm": 25.26789980160209, "learning_rate": 9.592326139088728e-08, "logits/chosen": -0.5563893914222717, "logits/rejected": -0.4884260594844818, "logps/chosen": -261.504638671875, "logps/rejected": -315.64349365234375, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.002656942466273904, "rewards/margins": 0.0009416007669642568, "rewards/rejected": 0.0017153415828943253, "step": 80 }, { "epoch": 0.021593090211132437, "grad_norm": 18.57391434910598, "learning_rate": 1.0791366906474819e-07, "logits/chosen": -0.5545334815979004, "logits/rejected": -0.5614916086196899, "logps/chosen": -396.33416748046875, "logps/rejected": -342.1172180175781, "loss": 0.6924, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.004076135344803333, "rewards/margins": 0.0013626832515001297, "rewards/rejected": 0.002713452558964491, "step": 90 }, { "epoch": 0.02399232245681382, "grad_norm": 20.078943593256316, "learning_rate": 1.199040767386091e-07, "logits/chosen": -0.5230361819267273, "logits/rejected": -0.4858153760433197, "logps/chosen": -326.3655090332031, "logps/rejected": -351.19390869140625, "loss": 0.6923, "rewards/accuracies": 0.625, "rewards/chosen": 0.0061579798348248005, "rewards/margins": 0.001345540746115148, "rewards/rejected": 0.004812438972294331, "step": 100 }, { "epoch": 0.026391554702495202, "grad_norm": 20.875381676857184, "learning_rate": 1.3189448441247004e-07, "logits/chosen": -0.5459330677986145, "logits/rejected": -0.5579243898391724, "logps/chosen": -287.3791198730469, "logps/rejected": -290.72393798828125, "loss": 0.6915, "rewards/accuracies": 0.5, "rewards/chosen": 0.009638044983148575, "rewards/margins": 0.00030891623464412987, "rewards/rejected": 0.009329128079116344, "step": 110 }, { "epoch": 0.028790786948176585, "grad_norm": 20.107786625562355, "learning_rate": 1.4388489208633092e-07, "logits/chosen": -0.5211232900619507, "logits/rejected": -0.5448856353759766, "logps/chosen": -363.11431884765625, "logps/rejected": -348.9471435546875, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.014917564578354359, "rewards/margins": 0.003776032943278551, "rewards/rejected": 0.01114153116941452, "step": 120 }, { "epoch": 0.031190019193857964, "grad_norm": 17.419578673944976, "learning_rate": 1.5587529976019183e-07, "logits/chosen": -0.5541412830352783, "logits/rejected": -0.5439847707748413, "logps/chosen": -273.02838134765625, "logps/rejected": -365.01483154296875, "loss": 0.6901, "rewards/accuracies": 0.75, "rewards/chosen": 0.019415050745010376, "rewards/margins": 0.00881609134376049, "rewards/rejected": 0.01059896033257246, "step": 130 }, { "epoch": 0.03358925143953935, "grad_norm": 20.19183366811833, "learning_rate": 1.6786570743405277e-07, "logits/chosen": -0.45227426290512085, "logits/rejected": -0.45624417066574097, "logps/chosen": -366.0704040527344, "logps/rejected": -355.80474853515625, "loss": 0.6892, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.015087930485606194, "rewards/margins": 0.005366227589547634, "rewards/rejected": 0.009721704758703709, "step": 140 }, { "epoch": 0.03598848368522073, "grad_norm": 19.066102175382554, "learning_rate": 1.7985611510791365e-07, "logits/chosen": -0.5239461064338684, "logits/rejected": -0.5222934484481812, "logps/chosen": -282.2486267089844, "logps/rejected": -280.42718505859375, "loss": 0.6902, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.020530493929982185, "rewards/margins": 0.0036348134744912386, "rewards/rejected": 0.016895681619644165, "step": 150 }, { "epoch": 0.03838771593090211, "grad_norm": 19.027643825440478, "learning_rate": 1.9184652278177456e-07, "logits/chosen": -0.46417126059532166, "logits/rejected": -0.47142887115478516, "logps/chosen": -372.260009765625, "logps/rejected": -299.72418212890625, "loss": 0.6861, "rewards/accuracies": 0.625, "rewards/chosen": 0.024696629494428635, "rewards/margins": 0.01829499378800392, "rewards/rejected": 0.006401637103408575, "step": 160 }, { "epoch": 0.040786948176583494, "grad_norm": 20.05176445155851, "learning_rate": 2.038369304556355e-07, "logits/chosen": -0.4728211760520935, "logits/rejected": -0.4653477072715759, "logps/chosen": -410.3612365722656, "logps/rejected": -395.3166198730469, "loss": 0.6874, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0359230674803257, "rewards/margins": 0.016820725053548813, "rewards/rejected": 0.019102338701486588, "step": 170 }, { "epoch": 0.04318618042226487, "grad_norm": 19.717298290033078, "learning_rate": 2.1582733812949638e-07, "logits/chosen": -0.5537582039833069, "logits/rejected": -0.5516412854194641, "logps/chosen": -294.61224365234375, "logps/rejected": -295.9138488769531, "loss": 0.6861, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02052612230181694, "rewards/margins": 0.01923990622162819, "rewards/rejected": 0.0012862167786806822, "step": 180 }, { "epoch": 0.04558541266794626, "grad_norm": 22.451123090776118, "learning_rate": 2.278177458033573e-07, "logits/chosen": -0.4669855535030365, "logits/rejected": -0.46975016593933105, "logps/chosen": -386.79052734375, "logps/rejected": -322.21063232421875, "loss": 0.6855, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0220388974994421, "rewards/margins": 0.006877691484987736, "rewards/rejected": 0.015161206014454365, "step": 190 }, { "epoch": 0.04798464491362764, "grad_norm": 18.561601949682256, "learning_rate": 2.398081534772182e-07, "logits/chosen": -0.5395389199256897, "logits/rejected": -0.4788607060909271, "logps/chosen": -370.59832763671875, "logps/rejected": -354.6778869628906, "loss": 0.6811, "rewards/accuracies": 0.625, "rewards/chosen": 0.026123318821191788, "rewards/margins": 0.02886904776096344, "rewards/rejected": -0.002745730336755514, "step": 200 }, { "epoch": 0.05038387715930902, "grad_norm": 22.094980613810247, "learning_rate": 2.517985611510791e-07, "logits/chosen": -0.5375515818595886, "logits/rejected": -0.547138512134552, "logps/chosen": -304.6062927246094, "logps/rejected": -330.10687255859375, "loss": 0.6814, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.03125152364373207, "rewards/margins": 0.03389766812324524, "rewards/rejected": -0.002646142616868019, "step": 210 }, { "epoch": 0.052783109404990404, "grad_norm": 18.489557098025607, "learning_rate": 2.637889688249401e-07, "logits/chosen": -0.5321250557899475, "logits/rejected": -0.5411959886550903, "logps/chosen": -391.7878723144531, "logps/rejected": -377.56280517578125, "loss": 0.6834, "rewards/accuracies": 0.5, "rewards/chosen": 0.012276771478354931, "rewards/margins": -0.0032779511529952288, "rewards/rejected": 0.01555472332984209, "step": 220 }, { "epoch": 0.05518234165067178, "grad_norm": 22.525575101088698, "learning_rate": 2.7577937649880093e-07, "logits/chosen": -0.5341587066650391, "logits/rejected": -0.5006336569786072, "logps/chosen": -303.1698913574219, "logps/rejected": -340.33331298828125, "loss": 0.6738, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.005457176826894283, "rewards/margins": 0.024495940655469894, "rewards/rejected": -0.019038762897253036, "step": 230 }, { "epoch": 0.05758157389635317, "grad_norm": 20.562033234462188, "learning_rate": 2.8776978417266184e-07, "logits/chosen": -0.5429738759994507, "logits/rejected": -0.5385856032371521, "logps/chosen": -357.30609130859375, "logps/rejected": -311.60260009765625, "loss": 0.6723, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.030288681387901306, "rewards/margins": 0.048918746411800385, "rewards/rejected": -0.01863006316125393, "step": 240 }, { "epoch": 0.05998080614203455, "grad_norm": 22.776979550503004, "learning_rate": 2.997601918465228e-07, "logits/chosen": -0.5102118253707886, "logits/rejected": -0.5135980844497681, "logps/chosen": -294.0608825683594, "logps/rejected": -275.83673095703125, "loss": 0.6674, "rewards/accuracies": 0.75, "rewards/chosen": 0.005173470359295607, "rewards/margins": 0.06198770925402641, "rewards/rejected": -0.05681424215435982, "step": 250 }, { "epoch": 0.06238003838771593, "grad_norm": 21.890329640256528, "learning_rate": 3.1175059952038366e-07, "logits/chosen": -0.5791837573051453, "logits/rejected": -0.5334831476211548, "logps/chosen": -353.4739074707031, "logps/rejected": -343.4547119140625, "loss": 0.6719, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.02404799312353134, "rewards/margins": 0.04426788166165352, "rewards/rejected": -0.06831587105989456, "step": 260 }, { "epoch": 0.0647792706333973, "grad_norm": 20.523920744785585, "learning_rate": 3.2374100719424457e-07, "logits/chosen": -0.49191370606422424, "logits/rejected": -0.5529422163963318, "logps/chosen": -347.00494384765625, "logps/rejected": -282.3544921875, "loss": 0.6609, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.03170743212103844, "rewards/margins": 0.039711810648441315, "rewards/rejected": -0.07141923159360886, "step": 270 }, { "epoch": 0.0671785028790787, "grad_norm": 20.886438274884952, "learning_rate": 3.3573141486810554e-07, "logits/chosen": -0.5886783599853516, "logits/rejected": -0.5640865564346313, "logps/chosen": -364.08575439453125, "logps/rejected": -354.1321105957031, "loss": 0.6504, "rewards/accuracies": 0.625, "rewards/chosen": -0.020602982491254807, "rewards/margins": 0.0775846317410469, "rewards/rejected": -0.09818761050701141, "step": 280 }, { "epoch": 0.06957773512476008, "grad_norm": 19.345700277409666, "learning_rate": 3.477218225419664e-07, "logits/chosen": -0.5530000329017639, "logits/rejected": -0.5117976665496826, "logps/chosen": -350.86199951171875, "logps/rejected": -327.6963806152344, "loss": 0.6513, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.05797697231173515, "rewards/margins": 0.062293171882629395, "rewards/rejected": -0.12027014791965485, "step": 290 }, { "epoch": 0.07197696737044146, "grad_norm": 22.275769826792928, "learning_rate": 3.597122302158273e-07, "logits/chosen": -0.6038728952407837, "logits/rejected": -0.6336754560470581, "logps/chosen": -332.75714111328125, "logps/rejected": -356.898193359375, "loss": 0.6532, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10048200935125351, "rewards/margins": 0.09853404760360718, "rewards/rejected": -0.1990160346031189, "step": 300 }, { "epoch": 0.07437619961612284, "grad_norm": 22.363116052564926, "learning_rate": 3.7170263788968827e-07, "logits/chosen": -0.5675481557846069, "logits/rejected": -0.6176060438156128, "logps/chosen": -353.2454833984375, "logps/rejected": -325.49066162109375, "loss": 0.6582, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.0909217894077301, "rewards/margins": 0.13223211467266083, "rewards/rejected": -0.22315391898155212, "step": 310 }, { "epoch": 0.07677543186180422, "grad_norm": 24.95798686492851, "learning_rate": 3.836930455635491e-07, "logits/chosen": -0.6077001094818115, "logits/rejected": -0.609139621257782, "logps/chosen": -343.24127197265625, "logps/rejected": -309.5650634765625, "loss": 0.6432, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.055519819259643555, "rewards/margins": 0.10148320347070694, "rewards/rejected": -0.1570030152797699, "step": 320 }, { "epoch": 0.07917466410748561, "grad_norm": 21.14907440323966, "learning_rate": 3.9568345323741003e-07, "logits/chosen": -0.5618354082107544, "logits/rejected": -0.5163384079933167, "logps/chosen": -333.4284973144531, "logps/rejected": -383.4358825683594, "loss": 0.6351, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.13757416605949402, "rewards/margins": 0.1750974953174591, "rewards/rejected": -0.3126716911792755, "step": 330 }, { "epoch": 0.08157389635316699, "grad_norm": 23.505161956514563, "learning_rate": 4.07673860911271e-07, "logits/chosen": -0.5577572584152222, "logits/rejected": -0.5682773590087891, "logps/chosen": -311.04046630859375, "logps/rejected": -350.06011962890625, "loss": 0.6331, "rewards/accuracies": 0.75, "rewards/chosen": -0.10792098939418793, "rewards/margins": 0.2111283242702484, "rewards/rejected": -0.31904932856559753, "step": 340 }, { "epoch": 0.08397312859884837, "grad_norm": 24.141694779806222, "learning_rate": 4.1966426858513185e-07, "logits/chosen": -0.6674095392227173, "logits/rejected": -0.6525458097457886, "logps/chosen": -385.8694152832031, "logps/rejected": -387.1976013183594, "loss": 0.6451, "rewards/accuracies": 0.625, "rewards/chosen": -0.24417324364185333, "rewards/margins": 0.12837204337120056, "rewards/rejected": -0.3725453317165375, "step": 350 }, { "epoch": 0.08637236084452975, "grad_norm": 25.804069948612213, "learning_rate": 4.3165467625899276e-07, "logits/chosen": -0.5833398699760437, "logits/rejected": -0.6397580504417419, "logps/chosen": -350.5534973144531, "logps/rejected": -299.1941833496094, "loss": 0.6415, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2430131882429123, "rewards/margins": 0.10048626363277435, "rewards/rejected": -0.34349945187568665, "step": 360 }, { "epoch": 0.08877159309021113, "grad_norm": 31.03793475076566, "learning_rate": 4.436450839328537e-07, "logits/chosen": -0.5922696590423584, "logits/rejected": -0.5713749527931213, "logps/chosen": -338.27667236328125, "logps/rejected": -366.77166748046875, "loss": 0.6267, "rewards/accuracies": 0.75, "rewards/chosen": -0.27600157260894775, "rewards/margins": 0.22378632426261902, "rewards/rejected": -0.49978795647621155, "step": 370 }, { "epoch": 0.09117082533589252, "grad_norm": 21.29384938198475, "learning_rate": 4.556354916067146e-07, "logits/chosen": -0.6022308468818665, "logits/rejected": -0.5682617425918579, "logps/chosen": -323.4892272949219, "logps/rejected": -347.95111083984375, "loss": 0.6042, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2343587428331375, "rewards/margins": 0.22317072749137878, "rewards/rejected": -0.4575294554233551, "step": 380 }, { "epoch": 0.0935700575815739, "grad_norm": 23.74603674013515, "learning_rate": 4.676258992805755e-07, "logits/chosen": -0.5804970860481262, "logits/rejected": -0.5728699564933777, "logps/chosen": -381.6591796875, "logps/rejected": -358.8669128417969, "loss": 0.6172, "rewards/accuracies": 0.75, "rewards/chosen": -0.3958897590637207, "rewards/margins": 0.13457268476486206, "rewards/rejected": -0.5304625034332275, "step": 390 }, { "epoch": 0.09596928982725528, "grad_norm": 22.488076430906265, "learning_rate": 4.796163069544364e-07, "logits/chosen": -0.6037659049034119, "logits/rejected": -0.6473450660705566, "logps/chosen": -350.5100402832031, "logps/rejected": -356.1009826660156, "loss": 0.6226, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3837326467037201, "rewards/margins": 0.2828107476234436, "rewards/rejected": -0.6665433645248413, "step": 400 }, { "epoch": 0.09836852207293666, "grad_norm": 28.205747441162394, "learning_rate": 4.916067146282974e-07, "logits/chosen": -0.6226581335067749, "logits/rejected": -0.606611430644989, "logps/chosen": -347.90966796875, "logps/rejected": -401.1394958496094, "loss": 0.6069, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3836399018764496, "rewards/margins": 0.22059115767478943, "rewards/rejected": -0.6042311191558838, "step": 410 }, { "epoch": 0.10076775431861804, "grad_norm": 24.57218142171684, "learning_rate": 4.999992108529978e-07, "logits/chosen": -0.5291169881820679, "logits/rejected": -0.5468065142631531, "logps/chosen": -444.72589111328125, "logps/rejected": -439.670654296875, "loss": 0.5982, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.48520898818969727, "rewards/margins": 0.3270217478275299, "rewards/rejected": -0.8122307062149048, "step": 420 }, { "epoch": 0.10316698656429943, "grad_norm": 34.823244275804946, "learning_rate": 4.999851817115532e-07, "logits/chosen": -0.6540865302085876, "logits/rejected": -0.5904898047447205, "logps/chosen": -351.2285461425781, "logps/rejected": -386.56890869140625, "loss": 0.6093, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4589855670928955, "rewards/margins": 0.3846796751022339, "rewards/rejected": -0.8436653017997742, "step": 430 }, { "epoch": 0.10556621880998081, "grad_norm": 33.199263240349794, "learning_rate": 4.999536171027889e-07, "logits/chosen": -0.5496717691421509, "logits/rejected": -0.5985559225082397, "logps/chosen": -409.6986389160156, "logps/rejected": -411.371826171875, "loss": 0.604, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5133577585220337, "rewards/margins": 0.20064587891101837, "rewards/rejected": -0.7140035629272461, "step": 440 }, { "epoch": 0.10796545105566219, "grad_norm": 28.95788929645283, "learning_rate": 4.999045192408369e-07, "logits/chosen": -0.5078392028808594, "logits/rejected": -0.4781821370124817, "logps/chosen": -352.42578125, "logps/rejected": -345.4123840332031, "loss": 0.6122, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5539526343345642, "rewards/margins": 0.1369965374469757, "rewards/rejected": -0.6909492611885071, "step": 450 }, { "epoch": 0.11036468330134357, "grad_norm": 24.918675200058328, "learning_rate": 4.998378915697171e-07, "logits/chosen": -0.5960583090782166, "logits/rejected": -0.5872009992599487, "logps/chosen": -367.5823669433594, "logps/rejected": -395.4332580566406, "loss": 0.583, "rewards/accuracies": 0.75, "rewards/chosen": -0.3061702251434326, "rewards/margins": 0.42768678069114685, "rewards/rejected": -0.7338569164276123, "step": 460 }, { "epoch": 0.11276391554702495, "grad_norm": 24.386517807951574, "learning_rate": 4.997537387630958e-07, "logits/chosen": -0.5429798364639282, "logits/rejected": -0.5464817881584167, "logps/chosen": -310.02203369140625, "logps/rejected": -340.80865478515625, "loss": 0.5738, "rewards/accuracies": 0.625, "rewards/chosen": -0.4528660178184509, "rewards/margins": 0.3107239603996277, "rewards/rejected": -0.7635899782180786, "step": 470 }, { "epoch": 0.11516314779270634, "grad_norm": 27.255184142896073, "learning_rate": 4.996520667239582e-07, "logits/chosen": -0.6526015996932983, "logits/rejected": -0.6507179737091064, "logps/chosen": -353.07098388671875, "logps/rejected": -445.12237548828125, "loss": 0.5705, "rewards/accuracies": 0.625, "rewards/chosen": -0.6045628786087036, "rewards/margins": 0.3956468403339386, "rewards/rejected": -1.0002095699310303, "step": 480 }, { "epoch": 0.11756238003838772, "grad_norm": 32.84348746795199, "learning_rate": 4.995328825841939e-07, "logits/chosen": -0.4966016709804535, "logits/rejected": -0.49989452958106995, "logps/chosen": -317.2383117675781, "logps/rejected": -374.27508544921875, "loss": 0.5899, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4817740321159363, "rewards/margins": 0.5593485236167908, "rewards/rejected": -1.041122555732727, "step": 490 }, { "epoch": 0.1199616122840691, "grad_norm": 29.672640234170935, "learning_rate": 4.993961947040967e-07, "logits/chosen": -0.525520920753479, "logits/rejected": -0.5563070178031921, "logps/chosen": -427.6673889160156, "logps/rejected": -412.65008544921875, "loss": 0.5935, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7312201261520386, "rewards/margins": 0.3187289237976074, "rewards/rejected": -1.0499489307403564, "step": 500 }, { "epoch": 0.12236084452975048, "grad_norm": 27.84738833817779, "learning_rate": 4.992420126717784e-07, "logits/chosen": -0.5528146028518677, "logits/rejected": -0.5479222536087036, "logps/chosen": -356.24041748046875, "logps/rejected": -422.6918029785156, "loss": 0.5781, "rewards/accuracies": 0.875, "rewards/chosen": -0.4140992760658264, "rewards/margins": 0.6510533094406128, "rewards/rejected": -1.065152645111084, "step": 510 }, { "epoch": 0.12476007677543186, "grad_norm": 32.48041619734842, "learning_rate": 4.990703473024958e-07, "logits/chosen": -0.45184358954429626, "logits/rejected": -0.48187708854675293, "logps/chosen": -417.84405517578125, "logps/rejected": -444.81353759765625, "loss": 0.5991, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.7138451933860779, "rewards/margins": 0.37000906467437744, "rewards/rejected": -1.083854079246521, "step": 520 }, { "epoch": 0.12715930902111325, "grad_norm": 28.128177801840295, "learning_rate": 4.98881210637893e-07, "logits/chosen": -0.42285671830177307, "logits/rejected": -0.4019806385040283, "logps/chosen": -320.9397277832031, "logps/rejected": -411.93255615234375, "loss": 0.5818, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.4967781603336334, "rewards/margins": 0.4927561283111572, "rewards/rejected": -0.9895342588424683, "step": 530 }, { "epoch": 0.1295585412667946, "grad_norm": 21.381239649867126, "learning_rate": 4.986746159451553e-07, "logits/chosen": -0.29445725679397583, "logits/rejected": -0.2827056646347046, "logps/chosen": -360.28509521484375, "logps/rejected": -394.31768798828125, "loss": 0.5976, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4901258945465088, "rewards/margins": 0.35657569766044617, "rewards/rejected": -0.8467016220092773, "step": 540 }, { "epoch": 0.131957773512476, "grad_norm": 23.053578304971253, "learning_rate": 4.984505777160795e-07, "logits/chosen": -0.2335212230682373, "logits/rejected": -0.2651960253715515, "logps/chosen": -433.4956970214844, "logps/rejected": -464.2955017089844, "loss": 0.5984, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6037947535514832, "rewards/margins": 0.3152288496494293, "rewards/rejected": -0.9190236330032349, "step": 550 }, { "epoch": 0.1343570057581574, "grad_norm": 28.905417781337384, "learning_rate": 4.982091116660574e-07, "logits/chosen": -0.321607768535614, "logits/rejected": -0.3338220715522766, "logps/chosen": -305.93658447265625, "logps/rejected": -300.59124755859375, "loss": 0.6154, "rewards/accuracies": 0.625, "rewards/chosen": -0.5145214796066284, "rewards/margins": 0.22794541716575623, "rewards/rejected": -0.7424668669700623, "step": 560 }, { "epoch": 0.13675623800383876, "grad_norm": 32.795386120218325, "learning_rate": 4.979502347329732e-07, "logits/chosen": -0.23663392663002014, "logits/rejected": -0.24166357517242432, "logps/chosen": -423.2027282714844, "logps/rejected": -491.15106201171875, "loss": 0.5998, "rewards/accuracies": 0.625, "rewards/chosen": -0.6477493643760681, "rewards/margins": 0.4360308051109314, "rewards/rejected": -1.08378005027771, "step": 570 }, { "epoch": 0.13915547024952016, "grad_norm": 37.84072378443296, "learning_rate": 4.976739650760151e-07, "logits/chosen": -0.29570311307907104, "logits/rejected": -0.3070180118083954, "logps/chosen": -375.17962646484375, "logps/rejected": -388.2039489746094, "loss": 0.586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4810148775577545, "rewards/margins": 0.3502056300640106, "rewards/rejected": -0.8312205076217651, "step": 580 }, { "epoch": 0.14155470249520152, "grad_norm": 45.677768580981564, "learning_rate": 4.97380322074402e-07, "logits/chosen": -0.2370149791240692, "logits/rejected": -0.25640061497688293, "logps/chosen": -349.70941162109375, "logps/rejected": -374.61456298828125, "loss": 0.5895, "rewards/accuracies": 0.625, "rewards/chosen": -0.6930117607116699, "rewards/margins": 0.2920604646205902, "rewards/rejected": -0.985072135925293, "step": 590 }, { "epoch": 0.14395393474088292, "grad_norm": 31.065347761695264, "learning_rate": 4.970693263260237e-07, "logits/chosen": -0.26885563135147095, "logits/rejected": -0.3041172921657562, "logps/chosen": -403.6191101074219, "logps/rejected": -410.84967041015625, "loss": 0.5853, "rewards/accuracies": 0.75, "rewards/chosen": -0.5729845762252808, "rewards/margins": 0.46888118982315063, "rewards/rejected": -1.0418657064437866, "step": 600 }, { "epoch": 0.1463531669865643, "grad_norm": 29.080698158567, "learning_rate": 4.967409996459966e-07, "logits/chosen": -0.2872675359249115, "logits/rejected": -0.3306855261325836, "logps/chosen": -405.076904296875, "logps/rejected": -423.62664794921875, "loss": 0.5751, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.49526625871658325, "rewards/margins": 0.3949028551578522, "rewards/rejected": -0.8901691436767578, "step": 610 }, { "epoch": 0.14875239923224567, "grad_norm": 27.549771571534542, "learning_rate": 4.963953650651326e-07, "logits/chosen": -0.15485969185829163, "logits/rejected": -0.16681411862373352, "logps/chosen": -478.8113708496094, "logps/rejected": -422.03955078125, "loss": 0.569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.63862544298172, "rewards/margins": 0.40379634499549866, "rewards/rejected": -1.042421817779541, "step": 620 }, { "epoch": 0.15115163147792707, "grad_norm": 28.772933296866565, "learning_rate": 4.960324468283248e-07, "logits/chosen": -0.20728620886802673, "logits/rejected": -0.2060108482837677, "logps/chosen": -367.0924377441406, "logps/rejected": -390.70458984375, "loss": 0.5636, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8087286949157715, "rewards/margins": 0.3034602999687195, "rewards/rejected": -1.1121888160705566, "step": 630 }, { "epoch": 0.15355086372360843, "grad_norm": 29.609222546231578, "learning_rate": 4.956522703928451e-07, "logits/chosen": -0.06690754741430283, "logits/rejected": -0.06723584234714508, "logps/chosen": -370.3538818359375, "logps/rejected": -409.23065185546875, "loss": 0.5409, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.788346529006958, "rewards/margins": 0.4131564199924469, "rewards/rejected": -1.2015029191970825, "step": 640 }, { "epoch": 0.15595009596928983, "grad_norm": 38.41016264507651, "learning_rate": 4.952548624265606e-07, "logits/chosen": -0.03009071573615074, "logits/rejected": 0.02059212513267994, "logps/chosen": -436.8095703125, "logps/rejected": -453.0166931152344, "loss": 0.6085, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8183758854866028, "rewards/margins": 0.35901501774787903, "rewards/rejected": -1.1773908138275146, "step": 650 }, { "epoch": 0.15834932821497122, "grad_norm": 25.869613582575887, "learning_rate": 4.948402508060607e-07, "logits/chosen": -0.0018309459555894136, "logits/rejected": -0.01893061026930809, "logps/chosen": -356.6624755859375, "logps/rejected": -409.0708923339844, "loss": 0.6026, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6256797313690186, "rewards/margins": 0.601173460483551, "rewards/rejected": -1.2268530130386353, "step": 660 }, { "epoch": 0.16074856046065258, "grad_norm": 35.22680312796026, "learning_rate": 4.944084646147038e-07, "logits/chosen": 0.0020178346894681454, "logits/rejected": 0.031680598855018616, "logps/chosen": -452.8055114746094, "logps/rejected": -465.51678466796875, "loss": 0.5999, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.6981381177902222, "rewards/margins": 0.3360704779624939, "rewards/rejected": -1.0342086553573608, "step": 670 }, { "epoch": 0.16314779270633398, "grad_norm": 28.0016914634874, "learning_rate": 4.939595341405754e-07, "logits/chosen": -0.039152443408966064, "logits/rejected": -0.05885768681764603, "logps/chosen": -401.278564453125, "logps/rejected": -409.3609924316406, "loss": 0.57, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7071236968040466, "rewards/margins": 0.3430066704750061, "rewards/rejected": -1.0501302480697632, "step": 680 }, { "epoch": 0.16554702495201534, "grad_norm": 30.023172826044828, "learning_rate": 4.93493490874365e-07, "logits/chosen": -0.00025105997337959707, "logits/rejected": 0.005772613920271397, "logps/chosen": -390.638427734375, "logps/rejected": -424.7112731933594, "loss": 0.5461, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7401353716850281, "rewards/margins": 0.30253323912620544, "rewards/rejected": -1.0426685810089111, "step": 690 }, { "epoch": 0.16794625719769674, "grad_norm": 41.393690265481474, "learning_rate": 4.93010367507156e-07, "logits/chosen": -0.051719047129154205, "logits/rejected": -0.06900392472743988, "logps/chosen": -346.08837890625, "logps/rejected": -374.042724609375, "loss": 0.5537, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7158280611038208, "rewards/margins": 0.5490631461143494, "rewards/rejected": -1.264891266822815, "step": 700 }, { "epoch": 0.17034548944337813, "grad_norm": 33.6357655925115, "learning_rate": 4.925101979281332e-07, "logits/chosen": 0.02222558856010437, "logits/rejected": 0.006278800778090954, "logps/chosen": -424.63726806640625, "logps/rejected": -441.04644775390625, "loss": 0.5799, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6551335453987122, "rewards/margins": 0.6523554921150208, "rewards/rejected": -1.3074891567230225, "step": 710 }, { "epoch": 0.1727447216890595, "grad_norm": 32.69679455555672, "learning_rate": 4.919930172222054e-07, "logits/chosen": -0.12917150557041168, "logits/rejected": -0.12720082700252533, "logps/chosen": -402.8379821777344, "logps/rejected": -441.12677001953125, "loss": 0.5295, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7797117829322815, "rewards/margins": 0.4280470311641693, "rewards/rejected": -1.2077586650848389, "step": 720 }, { "epoch": 0.1751439539347409, "grad_norm": 38.43936411357028, "learning_rate": 4.914588616675445e-07, "logits/chosen": -0.17864573001861572, "logits/rejected": -0.20761199295520782, "logps/chosen": -344.26312255859375, "logps/rejected": -408.573486328125, "loss": 0.5897, "rewards/accuracies": 0.75, "rewards/chosen": -0.555050253868103, "rewards/margins": 0.5430852174758911, "rewards/rejected": -1.098135232925415, "step": 730 }, { "epoch": 0.17754318618042225, "grad_norm": 36.631671098915504, "learning_rate": 4.909077687330404e-07, "logits/chosen": -0.11447083950042725, "logits/rejected": -0.09544442594051361, "logps/chosen": -418.90838623046875, "logps/rejected": -417.09844970703125, "loss": 0.5492, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.7083614468574524, "rewards/margins": 0.3665878176689148, "rewards/rejected": -1.0749492645263672, "step": 740 }, { "epoch": 0.17994241842610365, "grad_norm": 32.57669985590322, "learning_rate": 4.903397770756729e-07, "logits/chosen": -0.06074325367808342, "logits/rejected": -0.08299403637647629, "logps/chosen": -401.88800048828125, "logps/rejected": -449.99169921875, "loss": 0.5669, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.6976863145828247, "rewards/margins": 0.6443861722946167, "rewards/rejected": -1.342072606086731, "step": 750 }, { "epoch": 0.18234165067178504, "grad_norm": 27.495851946761935, "learning_rate": 4.897549265378004e-07, "logits/chosen": -0.18077705800533295, "logits/rejected": -0.15703561902046204, "logps/chosen": -486.8914489746094, "logps/rejected": -522.5025024414062, "loss": 0.559, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.925071120262146, "rewards/margins": 0.4355601668357849, "rewards/rejected": -1.3606312274932861, "step": 760 }, { "epoch": 0.1847408829174664, "grad_norm": 32.88428335628656, "learning_rate": 4.891532581443643e-07, "logits/chosen": -0.10509393364191055, "logits/rejected": -0.13191482424736023, "logps/chosen": -433.39697265625, "logps/rejected": -510.9073791503906, "loss": 0.5335, "rewards/accuracies": 0.875, "rewards/chosen": -0.7138081789016724, "rewards/margins": 0.8751919865608215, "rewards/rejected": -1.5890003442764282, "step": 770 }, { "epoch": 0.1871401151631478, "grad_norm": 34.828556425360944, "learning_rate": 4.885348141000122e-07, "logits/chosen": -0.02448561228811741, "logits/rejected": -0.08334103226661682, "logps/chosen": -395.90985107421875, "logps/rejected": -471.1104431152344, "loss": 0.56, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.878553569316864, "rewards/margins": 0.5636481046676636, "rewards/rejected": -1.4422016143798828, "step": 780 }, { "epoch": 0.18953934740882916, "grad_norm": 34.102875954970415, "learning_rate": 4.878996377861367e-07, "logits/chosen": -0.04264168441295624, "logits/rejected": -0.09717553108930588, "logps/chosen": -374.21063232421875, "logps/rejected": -424.62701416015625, "loss": 0.5366, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0093395709991455, "rewards/margins": 0.43130987882614136, "rewards/rejected": -1.4406496286392212, "step": 790 }, { "epoch": 0.19193857965451055, "grad_norm": 34.09209485411543, "learning_rate": 4.872477737578327e-07, "logits/chosen": -0.023031553253531456, "logits/rejected": -0.03800968453288078, "logps/chosen": -431.33782958984375, "logps/rejected": -534.5132446289062, "loss": 0.5265, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8465608358383179, "rewards/margins": 1.0085922479629517, "rewards/rejected": -1.8551530838012695, "step": 800 }, { "epoch": 0.19433781190019195, "grad_norm": 44.78458025907374, "learning_rate": 4.865792677407718e-07, "logits/chosen": -0.09794610738754272, "logits/rejected": -0.08297122269868851, "logps/chosen": -404.9684143066406, "logps/rejected": -423.43896484375, "loss": 0.5785, "rewards/accuracies": 0.625, "rewards/chosen": -0.8548682928085327, "rewards/margins": 0.37679168581962585, "rewards/rejected": -1.2316598892211914, "step": 810 }, { "epoch": 0.1967370441458733, "grad_norm": 37.30397170950818, "learning_rate": 4.858941666279955e-07, "logits/chosen": -0.20108501613140106, "logits/rejected": -0.15580318868160248, "logps/chosen": -440.3353576660156, "logps/rejected": -437.89337158203125, "loss": 0.5843, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7743014693260193, "rewards/margins": 0.3654334843158722, "rewards/rejected": -1.1397349834442139, "step": 820 }, { "epoch": 0.1991362763915547, "grad_norm": 37.97880335267858, "learning_rate": 4.851925184766247e-07, "logits/chosen": -0.07934032380580902, "logits/rejected": -0.06675902754068375, "logps/chosen": -400.7498779296875, "logps/rejected": -435.03387451171875, "loss": 0.5744, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8046972155570984, "rewards/margins": 0.6158983707427979, "rewards/rejected": -1.4205955266952515, "step": 830 }, { "epoch": 0.20153550863723607, "grad_norm": 34.877131137485314, "learning_rate": 4.844743725044897e-07, "logits/chosen": -0.1209510788321495, "logits/rejected": -0.12060485780239105, "logps/chosen": -390.33575439453125, "logps/rejected": -407.412841796875, "loss": 0.5509, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7886186838150024, "rewards/margins": 0.5530378222465515, "rewards/rejected": -1.3416564464569092, "step": 840 }, { "epoch": 0.20393474088291746, "grad_norm": 40.19277289158246, "learning_rate": 4.837397790866774e-07, "logits/chosen": -0.07084405422210693, "logits/rejected": -0.10281334072351456, "logps/chosen": -429.7625427246094, "logps/rejected": -490.245361328125, "loss": 0.5474, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.7108091115951538, "rewards/margins": 0.9831811785697937, "rewards/rejected": -1.6939903497695923, "step": 850 }, { "epoch": 0.20633397312859886, "grad_norm": 35.43380411461513, "learning_rate": 4.829887897519974e-07, "logits/chosen": 0.014303353615105152, "logits/rejected": -0.007743634283542633, "logps/chosen": -381.1875, "logps/rejected": -453.6239318847656, "loss": 0.5809, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8310438990592957, "rewards/margins": 0.5285369157791138, "rewards/rejected": -1.3595808744430542, "step": 860 }, { "epoch": 0.20873320537428022, "grad_norm": 30.773099092132018, "learning_rate": 4.82221457179368e-07, "logits/chosen": 0.005006339401006699, "logits/rejected": -0.01996953971683979, "logps/chosen": -400.3504638671875, "logps/rejected": -444.50653076171875, "loss": 0.5516, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6430622339248657, "rewards/margins": 0.6689059138298035, "rewards/rejected": -1.3119680881500244, "step": 870 }, { "epoch": 0.21113243761996162, "grad_norm": 30.941839797295746, "learning_rate": 4.814378351941206e-07, "logits/chosen": -0.03190199285745621, "logits/rejected": -0.032009296119213104, "logps/chosen": -378.9139099121094, "logps/rejected": -410.24896240234375, "loss": 0.5687, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.6023445129394531, "rewards/margins": 0.4468112885951996, "rewards/rejected": -1.049155831336975, "step": 880 }, { "epoch": 0.21353166986564298, "grad_norm": 30.2018268544055, "learning_rate": 4.806379787642241e-07, "logits/chosen": 0.03415294736623764, "logits/rejected": -0.008319585584104061, "logps/chosen": -373.29327392578125, "logps/rejected": -426.83856201171875, "loss": 0.6009, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6300404071807861, "rewards/margins": 0.5418477058410645, "rewards/rejected": -1.171887993812561, "step": 890 }, { "epoch": 0.21593090211132437, "grad_norm": 30.69767076541483, "learning_rate": 4.798219439964293e-07, "logits/chosen": -0.022300051525235176, "logits/rejected": -0.07942859828472137, "logps/chosen": -382.47088623046875, "logps/rejected": -428.5862731933594, "loss": 0.5421, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.79583740234375, "rewards/margins": 0.3617878556251526, "rewards/rejected": -1.1576253175735474, "step": 900 }, { "epoch": 0.21833013435700577, "grad_norm": 64.36072526993395, "learning_rate": 4.78989788132333e-07, "logits/chosen": -0.07167644053697586, "logits/rejected": -0.07725416123867035, "logps/chosen": -351.6461181640625, "logps/rejected": -432.82916259765625, "loss": 0.5306, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7295014262199402, "rewards/margins": 0.7431732416152954, "rewards/rejected": -1.4726746082305908, "step": 910 }, { "epoch": 0.22072936660268713, "grad_norm": 31.37167746375858, "learning_rate": 4.781415695443631e-07, "logits/chosen": 0.07153941690921783, "logits/rejected": 0.1024637222290039, "logps/chosen": -490.06402587890625, "logps/rejected": -516.02294921875, "loss": 0.5824, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3680822849273682, "rewards/margins": 0.20954158902168274, "rewards/rejected": -1.5776238441467285, "step": 920 }, { "epoch": 0.22312859884836853, "grad_norm": 29.004390037425598, "learning_rate": 4.772773477316836e-07, "logits/chosen": 0.03397312015295029, "logits/rejected": 0.03711385652422905, "logps/chosen": -467.2877502441406, "logps/rejected": -509.09716796875, "loss": 0.5473, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1240530014038086, "rewards/margins": 0.45423418283462524, "rewards/rejected": -1.578287124633789, "step": 930 }, { "epoch": 0.2255278310940499, "grad_norm": 41.30297632121209, "learning_rate": 4.7639718331602117e-07, "logits/chosen": 0.08684961497783661, "logits/rejected": 0.05986959859728813, "logps/chosen": -420.59814453125, "logps/rejected": -490.409912109375, "loss": 0.5313, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9049364924430847, "rewards/margins": 0.7652468681335449, "rewards/rejected": -1.6701834201812744, "step": 940 }, { "epoch": 0.22792706333973128, "grad_norm": 39.96596996172262, "learning_rate": 4.7550113803741275e-07, "logits/chosen": 0.13893774151802063, "logits/rejected": 0.16821300983428955, "logps/chosen": -432.7577209472656, "logps/rejected": -411.2371520996094, "loss": 0.5632, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9445673823356628, "rewards/margins": 0.5061396360397339, "rewards/rejected": -1.450706958770752, "step": 950 }, { "epoch": 0.23032629558541268, "grad_norm": 35.26144835245691, "learning_rate": 4.7458927474987454e-07, "logits/chosen": 0.13862411677837372, "logits/rejected": 0.17463508248329163, "logps/chosen": -470.4085998535156, "logps/rejected": -434.1971740722656, "loss": 0.5406, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8276304006576538, "rewards/margins": 0.33768096566200256, "rewards/rejected": -1.1653112173080444, "step": 960 }, { "epoch": 0.23272552783109404, "grad_norm": 34.457370464422794, "learning_rate": 4.7366165741699347e-07, "logits/chosen": 0.06780462712049484, "logits/rejected": 0.033076416701078415, "logps/chosen": -474.2489318847656, "logps/rejected": -491.00421142578125, "loss": 0.5508, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.8541940450668335, "rewards/margins": 0.41960257291793823, "rewards/rejected": -1.2737966775894165, "step": 970 }, { "epoch": 0.23512476007677544, "grad_norm": 50.081994664008306, "learning_rate": 4.727183511074401e-07, "logits/chosen": 0.12627606093883514, "logits/rejected": 0.1392831802368164, "logps/chosen": -422.95947265625, "logps/rejected": -465.1092834472656, "loss": 0.5446, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.8909593820571899, "rewards/margins": 0.47495001554489136, "rewards/rejected": -1.3659093379974365, "step": 980 }, { "epoch": 0.2375239923224568, "grad_norm": 33.56933991120958, "learning_rate": 4.717594219904043e-07, "logits/chosen": 0.11548285186290741, "logits/rejected": 0.17751248180866241, "logps/chosen": -428.980224609375, "logps/rejected": -429.6700134277344, "loss": 0.553, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9127674102783203, "rewards/margins": 0.5175878405570984, "rewards/rejected": -1.4303553104400635, "step": 990 }, { "epoch": 0.2399232245681382, "grad_norm": 38.76253931692222, "learning_rate": 4.7078493733095393e-07, "logits/chosen": 0.07841446250677109, "logits/rejected": 0.07714001089334488, "logps/chosen": -396.0744323730469, "logps/rejected": -459.9576721191406, "loss": 0.5356, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7617571353912354, "rewards/margins": 0.5821165442466736, "rewards/rejected": -1.3438737392425537, "step": 1000 }, { "epoch": 0.2423224568138196, "grad_norm": 40.40221370647514, "learning_rate": 4.6979496548531614e-07, "logits/chosen": 0.282027930021286, "logits/rejected": 0.2300875186920166, "logps/chosen": -417.5662536621094, "logps/rejected": -517.0253295898438, "loss": 0.556, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0011545419692993, "rewards/margins": 0.4213111400604248, "rewards/rejected": -1.4224655628204346, "step": 1010 }, { "epoch": 0.24472168905950095, "grad_norm": 38.63551061711667, "learning_rate": 4.6878957589608293e-07, "logits/chosen": 0.15491922199726105, "logits/rejected": 0.10176967084407806, "logps/chosen": -423.72412109375, "logps/rejected": -521.2840576171875, "loss": 0.5489, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8717803955078125, "rewards/margins": 0.6004728078842163, "rewards/rejected": -1.4722532033920288, "step": 1020 }, { "epoch": 0.24712092130518235, "grad_norm": 33.04785124844753, "learning_rate": 4.6776883908733956e-07, "logits/chosen": 0.3141445815563202, "logits/rejected": 0.40079420804977417, "logps/chosen": -444.20037841796875, "logps/rejected": -440.73992919921875, "loss": 0.5367, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.9564323425292969, "rewards/margins": 0.6285899877548218, "rewards/rejected": -1.5850223302841187, "step": 1030 }, { "epoch": 0.2495201535508637, "grad_norm": 56.89197225086117, "learning_rate": 4.667328266597178e-07, "logits/chosen": 0.32467955350875854, "logits/rejected": 0.3737574815750122, "logps/chosen": -425.14764404296875, "logps/rejected": -474.8291015625, "loss": 0.5182, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9110026359558105, "rewards/margins": 0.6014169454574585, "rewards/rejected": -1.5124194622039795, "step": 1040 }, { "epoch": 0.2519193857965451, "grad_norm": 42.632075100473685, "learning_rate": 4.6568161128537354e-07, "logits/chosen": 0.23409466445446014, "logits/rejected": 0.4846338629722595, "logps/chosen": -437.87469482421875, "logps/rejected": -416.75433349609375, "loss": 0.5475, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.0662238597869873, "rewards/margins": 0.3336094319820404, "rewards/rejected": -1.3998332023620605, "step": 1050 }, { "epoch": 0.2543186180422265, "grad_norm": 46.72130069794758, "learning_rate": 4.6461526670288877e-07, "logits/chosen": 0.4986523687839508, "logits/rejected": 0.5356402397155762, "logps/chosen": -453.13543701171875, "logps/rejected": -487.8929748535156, "loss": 0.5798, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.0444475412368774, "rewards/margins": 0.7141000032424927, "rewards/rejected": -1.7585475444793701, "step": 1060 }, { "epoch": 0.2567178502879079, "grad_norm": 30.901707992623376, "learning_rate": 4.635338677120994e-07, "logits/chosen": 0.6319410800933838, "logits/rejected": 0.5878476500511169, "logps/chosen": -435.103271484375, "logps/rejected": -522.3030395507812, "loss": 0.5107, "rewards/accuracies": 0.75, "rewards/chosen": -1.001800775527954, "rewards/margins": 0.7891290187835693, "rewards/rejected": -1.7909300327301025, "step": 1070 }, { "epoch": 0.2591170825335892, "grad_norm": 35.62993699091359, "learning_rate": 4.6243749016884835e-07, "logits/chosen": 0.6645074486732483, "logits/rejected": 0.6307970285415649, "logps/chosen": -460.1568298339844, "logps/rejected": -596.9638671875, "loss": 0.554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2296950817108154, "rewards/margins": 0.8476268649101257, "rewards/rejected": -2.077322006225586, "step": 1080 }, { "epoch": 0.2615163147792706, "grad_norm": 55.48184293718509, "learning_rate": 4.613262109796645e-07, "logits/chosen": 0.5279312133789062, "logits/rejected": 0.44912824034690857, "logps/chosen": -445.187744140625, "logps/rejected": -569.0379028320312, "loss": 0.5184, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.0215142965316772, "rewards/margins": 0.903215765953064, "rewards/rejected": -1.9247299432754517, "step": 1090 }, { "epoch": 0.263915547024952, "grad_norm": 35.82390141377677, "learning_rate": 4.602001080963678e-07, "logits/chosen": 0.5199450254440308, "logits/rejected": 0.580736517906189, "logps/chosen": -457.9462890625, "logps/rejected": -484.3863830566406, "loss": 0.5432, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.047241449356079, "rewards/margins": 0.6471258997917175, "rewards/rejected": -1.6943671703338623, "step": 1100 }, { "epoch": 0.2663147792706334, "grad_norm": 51.55318372805118, "learning_rate": 4.590592605106017e-07, "logits/chosen": 0.34312915802001953, "logits/rejected": 0.3462589979171753, "logps/chosen": -462.97137451171875, "logps/rejected": -475.85235595703125, "loss": 0.5757, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8622667193412781, "rewards/margins": 0.5276774168014526, "rewards/rejected": -1.389944076538086, "step": 1110 }, { "epoch": 0.2687140115163148, "grad_norm": 46.55805600175398, "learning_rate": 4.5790374824829165e-07, "logits/chosen": 0.5497294068336487, "logits/rejected": 0.5141938924789429, "logps/chosen": -329.0898132324219, "logps/rejected": -395.189208984375, "loss": 0.5305, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.8122035264968872, "rewards/margins": 0.555601179599762, "rewards/rejected": -1.367804765701294, "step": 1120 }, { "epoch": 0.27111324376199614, "grad_norm": 41.30209061097155, "learning_rate": 4.5673365236403216e-07, "logits/chosen": 0.5173945426940918, "logits/rejected": 0.538547158241272, "logps/chosen": -337.64508056640625, "logps/rejected": -434.7604064941406, "loss": 0.5406, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.6650754809379578, "rewards/margins": 0.7895157337188721, "rewards/rejected": -1.454591155052185, "step": 1130 }, { "epoch": 0.27351247600767753, "grad_norm": 35.39722075486902, "learning_rate": 4.5554905493540075e-07, "logits/chosen": 0.7431238293647766, "logits/rejected": 0.720431923866272, "logps/chosen": -369.39825439453125, "logps/rejected": -469.0665588378906, "loss": 0.4965, "rewards/accuracies": 0.75, "rewards/chosen": -0.8106037974357605, "rewards/margins": 0.9515643119812012, "rewards/rejected": -1.762168288230896, "step": 1140 }, { "epoch": 0.2759117082533589, "grad_norm": 80.31107636026294, "learning_rate": 4.5435003905720074e-07, "logits/chosen": 0.6994370222091675, "logits/rejected": 0.7717106938362122, "logps/chosen": -467.26922607421875, "logps/rejected": -501.1815490722656, "loss": 0.5376, "rewards/accuracies": 0.875, "rewards/chosen": -1.1343969106674194, "rewards/margins": 0.7365877628326416, "rewards/rejected": -1.870984673500061, "step": 1150 }, { "epoch": 0.2783109404990403, "grad_norm": 50.544256487524144, "learning_rate": 4.531366888356324e-07, "logits/chosen": 0.604827880859375, "logits/rejected": 0.5405411720275879, "logps/chosen": -349.20196533203125, "logps/rejected": -467.71160888671875, "loss": 0.5206, "rewards/accuracies": 0.75, "rewards/chosen": -0.9413111805915833, "rewards/margins": 0.8840651512145996, "rewards/rejected": -1.8253761529922485, "step": 1160 }, { "epoch": 0.2807101727447217, "grad_norm": 48.679810649088054, "learning_rate": 4.519090893823931e-07, "logits/chosen": 0.7196705341339111, "logits/rejected": 0.7503910660743713, "logps/chosen": -434.93377685546875, "logps/rejected": -479.38836669921875, "loss": 0.5375, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1369459629058838, "rewards/margins": 0.6235243678092957, "rewards/rejected": -1.7604703903198242, "step": 1170 }, { "epoch": 0.28310940499040305, "grad_norm": 42.39683927792113, "learning_rate": 4.5066732680870734e-07, "logits/chosen": 0.7495613694190979, "logits/rejected": 0.7793896794319153, "logps/chosen": -413.90557861328125, "logps/rejected": -447.45452880859375, "loss": 0.5181, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0040034055709839, "rewards/margins": 0.8347917795181274, "rewards/rejected": -1.8387953042984009, "step": 1180 }, { "epoch": 0.28550863723608444, "grad_norm": 53.13082069754931, "learning_rate": 4.494114882192862e-07, "logits/chosen": 0.4293566644191742, "logits/rejected": 0.44527220726013184, "logps/chosen": -425.08538818359375, "logps/rejected": -490.40765380859375, "loss": 0.504, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9455874562263489, "rewards/margins": 1.0670359134674072, "rewards/rejected": -2.0126233100891113, "step": 1190 }, { "epoch": 0.28790786948176583, "grad_norm": 49.87269165648676, "learning_rate": 4.4814166170621735e-07, "logits/chosen": 0.6792656183242798, "logits/rejected": 0.6856303811073303, "logps/chosen": -430.4750061035156, "logps/rejected": -503.75634765625, "loss": 0.5405, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.135371446609497, "rewards/margins": 1.0104650259017944, "rewards/rejected": -2.145836591720581, "step": 1200 }, { "epoch": 0.2903071017274472, "grad_norm": 37.72444617932776, "learning_rate": 4.468579363427858e-07, "logits/chosen": 0.41752809286117554, "logits/rejected": 0.4529293477535248, "logps/chosen": -450.9541931152344, "logps/rejected": -478.5503845214844, "loss": 0.554, "rewards/accuracies": 0.625, "rewards/chosen": -1.3859989643096924, "rewards/margins": 0.5862727165222168, "rewards/rejected": -1.9722716808319092, "step": 1210 }, { "epoch": 0.2927063339731286, "grad_norm": 49.47973747014418, "learning_rate": 4.4556040217722555e-07, "logits/chosen": 0.6199735403060913, "logits/rejected": 0.5173524618148804, "logps/chosen": -390.5383605957031, "logps/rejected": -529.1012573242188, "loss": 0.5218, "rewards/accuracies": 0.75, "rewards/chosen": -0.9236103892326355, "rewards/margins": 0.9195195436477661, "rewards/rejected": -1.8431298732757568, "step": 1220 }, { "epoch": 0.29510556621880996, "grad_norm": 43.77387295728714, "learning_rate": 4.442491502264033e-07, "logits/chosen": 0.5372000932693481, "logits/rejected": 0.5111404061317444, "logps/chosen": -398.32928466796875, "logps/rejected": -427.90142822265625, "loss": 0.5579, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.1015335321426392, "rewards/margins": 0.36569902300834656, "rewards/rejected": -1.467232584953308, "step": 1230 }, { "epoch": 0.29750479846449135, "grad_norm": 35.2179506302823, "learning_rate": 4.429242724694338e-07, "logits/chosen": 0.596865177154541, "logits/rejected": 0.5551019906997681, "logps/chosen": -403.04803466796875, "logps/rejected": -482.10455322265625, "loss": 0.5353, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8973898887634277, "rewards/margins": 0.7202876806259155, "rewards/rejected": -1.6176776885986328, "step": 1240 }, { "epoch": 0.29990403071017274, "grad_norm": 35.54165989722752, "learning_rate": 4.4158586184122817e-07, "logits/chosen": 0.6986425518989563, "logits/rejected": 0.7786028385162354, "logps/chosen": -455.0581970214844, "logps/rejected": -487.45220947265625, "loss": 0.5169, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9782212376594543, "rewards/margins": 0.773512065410614, "rewards/rejected": -1.751733422279358, "step": 1250 }, { "epoch": 0.30230326295585414, "grad_norm": 34.976845816469115, "learning_rate": 4.4023401222597443e-07, "logits/chosen": 0.5812339782714844, "logits/rejected": 0.6533055305480957, "logps/chosen": -456.7413635253906, "logps/rejected": -492.81500244140625, "loss": 0.4972, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0948355197906494, "rewards/margins": 0.6430230140686035, "rewards/rejected": -1.737858533859253, "step": 1260 }, { "epoch": 0.30470249520153553, "grad_norm": 52.15846550296518, "learning_rate": 4.3886881845055235e-07, "logits/chosen": 0.6851636171340942, "logits/rejected": 0.7039676904678345, "logps/chosen": -395.2878723144531, "logps/rejected": -475.46319580078125, "loss": 0.5178, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.8985649347305298, "rewards/margins": 0.9367402195930481, "rewards/rejected": -1.8353052139282227, "step": 1270 }, { "epoch": 0.30710172744721687, "grad_norm": 35.79256631055672, "learning_rate": 4.374903762778814e-07, "logits/chosen": 0.6985992193222046, "logits/rejected": 0.6866432428359985, "logps/chosen": -429.9559020996094, "logps/rejected": -467.31231689453125, "loss": 0.5222, "rewards/accuracies": 0.75, "rewards/chosen": -1.0913515090942383, "rewards/margins": 0.6997131109237671, "rewards/rejected": -1.7910646200180054, "step": 1280 }, { "epoch": 0.30950095969289826, "grad_norm": 68.77406798145645, "learning_rate": 4.3609878240020356e-07, "logits/chosen": 0.45225849747657776, "logits/rejected": 0.5497337579727173, "logps/chosen": -510.07659912109375, "logps/rejected": -510.8426208496094, "loss": 0.5356, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.366571307182312, "rewards/margins": 0.6978545784950256, "rewards/rejected": -2.0644257068634033, "step": 1290 }, { "epoch": 0.31190019193857965, "grad_norm": 36.68902407720006, "learning_rate": 4.346941344323005e-07, "logits/chosen": 0.585986316204071, "logits/rejected": 0.6672986745834351, "logps/chosen": -437.39324951171875, "logps/rejected": -430.4087829589844, "loss": 0.5562, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3243210315704346, "rewards/margins": 0.4856715798377991, "rewards/rejected": -1.8099925518035889, "step": 1300 }, { "epoch": 0.31429942418426104, "grad_norm": 38.77370809872286, "learning_rate": 4.332765309046467e-07, "logits/chosen": 0.7318406105041504, "logits/rejected": 0.7771567106246948, "logps/chosen": -450.19427490234375, "logps/rejected": -471.3865661621094, "loss": 0.5554, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.116821527481079, "rewards/margins": 0.6649090051651001, "rewards/rejected": -1.7817304134368896, "step": 1310 }, { "epoch": 0.31669865642994244, "grad_norm": 45.82746891169888, "learning_rate": 4.3184607125649754e-07, "logits/chosen": 0.49596285820007324, "logits/rejected": 0.5003286600112915, "logps/chosen": -430.02996826171875, "logps/rejected": -527.7648315429688, "loss": 0.5201, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8098013997077942, "rewards/margins": 0.9345352053642273, "rewards/rejected": -1.744336485862732, "step": 1320 }, { "epoch": 0.3190978886756238, "grad_norm": 37.74246685501154, "learning_rate": 4.304028558289141e-07, "logits/chosen": 0.38717252016067505, "logits/rejected": 0.39220350980758667, "logps/chosen": -416.00494384765625, "logps/rejected": -460.89739990234375, "loss": 0.4969, "rewards/accuracies": 0.75, "rewards/chosen": -0.7099177241325378, "rewards/margins": 0.6884833574295044, "rewards/rejected": -1.3984010219573975, "step": 1330 }, { "epoch": 0.32149712092130517, "grad_norm": 34.11722436437858, "learning_rate": 4.28946985857725e-07, "logits/chosen": 0.5080984234809875, "logits/rejected": 0.4866611063480377, "logps/chosen": -444.2494201660156, "logps/rejected": -542.6705932617188, "loss": 0.4997, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.9977186918258667, "rewards/margins": 1.2409141063690186, "rewards/rejected": -2.2386326789855957, "step": 1340 }, { "epoch": 0.32389635316698656, "grad_norm": 38.18058435916063, "learning_rate": 4.2747856346642445e-07, "logits/chosen": 0.4064346253871918, "logits/rejected": 0.4254288077354431, "logps/chosen": -389.40472412109375, "logps/rejected": -465.85906982421875, "loss": 0.4983, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9736809730529785, "rewards/margins": 0.8628204464912415, "rewards/rejected": -1.8365013599395752, "step": 1350 }, { "epoch": 0.32629558541266795, "grad_norm": 45.560969124424204, "learning_rate": 4.2599769165900933e-07, "logits/chosen": 0.4976237714290619, "logits/rejected": 0.4918050765991211, "logps/chosen": -478.25140380859375, "logps/rejected": -495.969482421875, "loss": 0.5657, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5931600332260132, "rewards/margins": 0.48865580558776855, "rewards/rejected": -2.0818159580230713, "step": 1360 }, { "epoch": 0.32869481765834935, "grad_norm": 35.94258808540943, "learning_rate": 4.245044743127535e-07, "logits/chosen": 0.5548725128173828, "logits/rejected": 0.46006709337234497, "logps/chosen": -428.947021484375, "logps/rejected": -524.8760986328125, "loss": 0.5207, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1485120058059692, "rewards/margins": 0.7874538898468018, "rewards/rejected": -1.9359657764434814, "step": 1370 }, { "epoch": 0.3310940499040307, "grad_norm": 42.265392991866655, "learning_rate": 4.229990161709214e-07, "logits/chosen": 0.547171950340271, "logits/rejected": 0.4217755198478699, "logps/chosen": -401.33447265625, "logps/rejected": -532.9482421875, "loss": 0.5491, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.8861383199691772, "rewards/margins": 1.117545485496521, "rewards/rejected": -2.0036838054656982, "step": 1380 }, { "epoch": 0.3334932821497121, "grad_norm": 32.39940263140558, "learning_rate": 4.214814228354204e-07, "logits/chosen": 0.4310382008552551, "logits/rejected": 0.47493353486061096, "logps/chosen": -467.65216064453125, "logps/rejected": -549.2056884765625, "loss": 0.5295, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1547861099243164, "rewards/margins": 1.1499736309051514, "rewards/rejected": -2.304759979248047, "step": 1390 }, { "epoch": 0.33589251439539347, "grad_norm": 37.76328528326702, "learning_rate": 4.1995180075939375e-07, "logits/chosen": 0.6290279626846313, "logits/rejected": 0.5864508748054504, "logps/chosen": -488.428466796875, "logps/rejected": -547.3450927734375, "loss": 0.5114, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2981139421463013, "rewards/margins": 0.8224126100540161, "rewards/rejected": -2.1205263137817383, "step": 1400 }, { "epoch": 0.33829174664107486, "grad_norm": 41.27479960235454, "learning_rate": 4.1841025723975297e-07, "logits/chosen": 0.42811208963394165, "logits/rejected": 0.41358089447021484, "logps/chosen": -445.8192443847656, "logps/rejected": -505.38653564453125, "loss": 0.4953, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9554083943367004, "rewards/margins": 0.7210197448730469, "rewards/rejected": -1.676428198814392, "step": 1410 }, { "epoch": 0.34069097888675626, "grad_norm": 37.87482486935292, "learning_rate": 4.168569004096516e-07, "logits/chosen": 0.4879208207130432, "logits/rejected": 0.37299996614456177, "logps/chosen": -421.3837890625, "logps/rejected": -540.9444580078125, "loss": 0.4969, "rewards/accuracies": 0.75, "rewards/chosen": -1.2045072317123413, "rewards/margins": 1.0760588645935059, "rewards/rejected": -2.2805662155151367, "step": 1420 }, { "epoch": 0.3430902111324376, "grad_norm": 34.143233451160405, "learning_rate": 4.152918392308997e-07, "logits/chosen": 0.4631095826625824, "logits/rejected": 0.44977670907974243, "logps/chosen": -420.1924743652344, "logps/rejected": -457.46673583984375, "loss": 0.4946, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1693400144577026, "rewards/margins": 0.540154218673706, "rewards/rejected": -1.7094943523406982, "step": 1430 }, { "epoch": 0.345489443378119, "grad_norm": 79.72399784718598, "learning_rate": 4.137151834863213e-07, "logits/chosen": 0.30308836698532104, "logits/rejected": 0.19191868603229523, "logps/chosen": -421.8958435058594, "logps/rejected": -534.3394775390625, "loss": 0.5464, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1978578567504883, "rewards/margins": 0.8276159167289734, "rewards/rejected": -2.0254738330841064, "step": 1440 }, { "epoch": 0.3478886756238004, "grad_norm": 53.81472585528722, "learning_rate": 4.121270437720526e-07, "logits/chosen": 0.2503531575202942, "logits/rejected": 0.20632532238960266, "logps/chosen": -388.9275817871094, "logps/rejected": -504.99627685546875, "loss": 0.5436, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.158739447593689, "rewards/margins": 0.6908172965049744, "rewards/rejected": -1.8495569229125977, "step": 1450 }, { "epoch": 0.3502879078694818, "grad_norm": 45.6784642712931, "learning_rate": 4.105275314897852e-07, "logits/chosen": 0.48888054490089417, "logits/rejected": 0.3766574263572693, "logps/chosen": -397.5686340332031, "logps/rejected": -535.5113525390625, "loss": 0.5107, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.021269679069519, "rewards/margins": 1.1115610599517822, "rewards/rejected": -2.1328306198120117, "step": 1460 }, { "epoch": 0.35268714011516317, "grad_norm": 42.50127277305204, "learning_rate": 4.089167588389508e-07, "logits/chosen": 0.35595473647117615, "logits/rejected": 0.4420366883277893, "logps/chosen": -525.4200439453125, "logps/rejected": -575.6399536132812, "loss": 0.5315, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.174477219581604, "rewards/margins": 0.9557849168777466, "rewards/rejected": -2.1302618980407715, "step": 1470 }, { "epoch": 0.3550863723608445, "grad_norm": 66.9134896066362, "learning_rate": 4.072948388088515e-07, "logits/chosen": 0.4660380482673645, "logits/rejected": 0.48526984453201294, "logps/chosen": -472.48773193359375, "logps/rejected": -540.60546875, "loss": 0.5512, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.309525966644287, "rewards/margins": 0.729021430015564, "rewards/rejected": -2.0385475158691406, "step": 1480 }, { "epoch": 0.3574856046065259, "grad_norm": 48.14455914875948, "learning_rate": 4.056618851707334e-07, "logits/chosen": 0.3936781585216522, "logits/rejected": 0.37658897042274475, "logps/chosen": -417.9375915527344, "logps/rejected": -505.556396484375, "loss": 0.4962, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.8807679414749146, "rewards/margins": 0.8251503109931946, "rewards/rejected": -1.7059180736541748, "step": 1490 }, { "epoch": 0.3598848368522073, "grad_norm": 44.71962261776299, "learning_rate": 4.0401801246980675e-07, "logits/chosen": 0.2104732245206833, "logits/rejected": 0.22102966904640198, "logps/chosen": -413.0994567871094, "logps/rejected": -452.8529357910156, "loss": 0.5258, "rewards/accuracies": 0.75, "rewards/chosen": -1.222390055656433, "rewards/margins": 0.6135789155960083, "rewards/rejected": -1.8359689712524414, "step": 1500 }, { "epoch": 0.3622840690978887, "grad_norm": 38.59038142711945, "learning_rate": 4.0236333601721043e-07, "logits/chosen": 0.36115556955337524, "logits/rejected": 0.27192938327789307, "logps/chosen": -518.89306640625, "logps/rejected": -567.1900024414062, "loss": 0.5451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.524860143661499, "rewards/margins": 0.5048703551292419, "rewards/rejected": -2.0297303199768066, "step": 1510 }, { "epoch": 0.3646833013435701, "grad_norm": 48.99560916590031, "learning_rate": 4.0069797188192364e-07, "logits/chosen": 0.2493390589952469, "logits/rejected": 0.25582900643348694, "logps/chosen": -457.5439453125, "logps/rejected": -511.53466796875, "loss": 0.5255, "rewards/accuracies": 0.75, "rewards/chosen": -1.0995330810546875, "rewards/margins": 0.8029910326004028, "rewards/rejected": -1.9025242328643799, "step": 1520 }, { "epoch": 0.3670825335892514, "grad_norm": 43.28635521609486, "learning_rate": 3.9902203688262417e-07, "logits/chosen": 0.24590995907783508, "logits/rejected": 0.2573690414428711, "logps/chosen": -447.503173828125, "logps/rejected": -495.9049377441406, "loss": 0.5052, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1626179218292236, "rewards/margins": 0.7147835493087769, "rewards/rejected": -1.87740159034729, "step": 1530 }, { "epoch": 0.3694817658349328, "grad_norm": 75.4063331165295, "learning_rate": 3.9733564857949365e-07, "logits/chosen": 0.36004549264907837, "logits/rejected": 0.39339983463287354, "logps/chosen": -538.8134765625, "logps/rejected": -569.4513549804688, "loss": 0.5428, "rewards/accuracies": 0.75, "rewards/chosen": -1.430328369140625, "rewards/margins": 0.773267388343811, "rewards/rejected": -2.2035956382751465, "step": 1540 }, { "epoch": 0.3718809980806142, "grad_norm": 47.00943225874421, "learning_rate": 3.9563892526597177e-07, "logits/chosen": 0.38262271881103516, "logits/rejected": 0.3127327561378479, "logps/chosen": -405.52008056640625, "logps/rejected": -523.7188720703125, "loss": 0.5275, "rewards/accuracies": 0.75, "rewards/chosen": -1.2701631784439087, "rewards/margins": 0.673926591873169, "rewards/rejected": -1.944089651107788, "step": 1550 }, { "epoch": 0.3742802303262956, "grad_norm": 44.77491303021576, "learning_rate": 3.9393198596045795e-07, "logits/chosen": 0.2474546879529953, "logits/rejected": 0.1317511945962906, "logps/chosen": -421.62994384765625, "logps/rejected": -519.5099487304688, "loss": 0.5383, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2208540439605713, "rewards/margins": 0.7671472430229187, "rewards/rejected": -1.9880012273788452, "step": 1560 }, { "epoch": 0.376679462571977, "grad_norm": 37.95179606415185, "learning_rate": 3.922149503979628e-07, "logits/chosen": 0.2700248658657074, "logits/rejected": 0.21610090136528015, "logps/chosen": -471.33056640625, "logps/rejected": -593.8278198242188, "loss": 0.5122, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2792729139328003, "rewards/margins": 1.1995410919189453, "rewards/rejected": -2.4788146018981934, "step": 1570 }, { "epoch": 0.3790786948176583, "grad_norm": 55.896865397911, "learning_rate": 3.904879390217095e-07, "logits/chosen": 0.12995900213718414, "logits/rejected": 0.12265945971012115, "logps/chosen": -443.599365234375, "logps/rejected": -492.68450927734375, "loss": 0.528, "rewards/accuracies": 0.75, "rewards/chosen": -1.2007102966308594, "rewards/margins": 0.6766700744628906, "rewards/rejected": -1.87738037109375, "step": 1580 }, { "epoch": 0.3814779270633397, "grad_norm": 49.93484321544338, "learning_rate": 3.8875107297468463e-07, "logits/chosen": 0.20564258098602295, "logits/rejected": 0.0780414491891861, "logps/chosen": -411.8665466308594, "logps/rejected": -573.74951171875, "loss": 0.5229, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.0018932819366455, "rewards/margins": 1.2348394393920898, "rewards/rejected": -2.2367329597473145, "step": 1590 }, { "epoch": 0.3838771593090211, "grad_norm": 38.069521505621516, "learning_rate": 3.87004474091141e-07, "logits/chosen": 0.3447803258895874, "logits/rejected": 0.3082936704158783, "logps/chosen": -405.9560852050781, "logps/rejected": -489.4607849121094, "loss": 0.541, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0926564931869507, "rewards/margins": 0.7233616709709167, "rewards/rejected": -1.8160178661346436, "step": 1600 }, { "epoch": 0.3862763915547025, "grad_norm": 44.24824114407542, "learning_rate": 3.8524826488805114e-07, "logits/chosen": 0.3052324950695038, "logits/rejected": 0.3181813657283783, "logps/chosen": -473.97796630859375, "logps/rejected": -500.7769470214844, "loss": 0.547, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2699439525604248, "rewards/margins": 0.7647022008895874, "rewards/rejected": -2.0346462726593018, "step": 1610 }, { "epoch": 0.3886756238003839, "grad_norm": 47.309718786937964, "learning_rate": 3.834825685565133e-07, "logits/chosen": 0.33559301495552063, "logits/rejected": 0.3656995892524719, "logps/chosen": -414.19256591796875, "logps/rejected": -421.0203552246094, "loss": 0.4953, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.065473198890686, "rewards/margins": 0.5177011489868164, "rewards/rejected": -1.5831743478775024, "step": 1620 }, { "epoch": 0.39107485604606523, "grad_norm": 42.86172629937328, "learning_rate": 3.8170750895311007e-07, "logits/chosen": 0.1855572611093521, "logits/rejected": 0.17679139971733093, "logps/chosen": -452.2789001464844, "logps/rejected": -500.349609375, "loss": 0.4908, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.9900191426277161, "rewards/margins": 0.7418977618217468, "rewards/rejected": -1.7319167852401733, "step": 1630 }, { "epoch": 0.3934740882917466, "grad_norm": 45.001223140761674, "learning_rate": 3.7992321059122045e-07, "logits/chosen": 0.2781444787979126, "logits/rejected": 0.30307430028915405, "logps/chosen": -414.05523681640625, "logps/rejected": -462.14239501953125, "loss": 0.5237, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1657673120498657, "rewards/margins": 0.6909239888191223, "rewards/rejected": -1.8566913604736328, "step": 1640 }, { "epoch": 0.395873320537428, "grad_norm": 60.544325020503095, "learning_rate": 3.7812979863228576e-07, "logits/chosen": 0.2274487465620041, "logits/rejected": 0.16551566123962402, "logps/chosen": -405.3382873535156, "logps/rejected": -493.6697692871094, "loss": 0.4928, "rewards/accuracies": 0.75, "rewards/chosen": -1.2897754907608032, "rewards/margins": 0.8161094784736633, "rewards/rejected": -2.1058847904205322, "step": 1650 }, { "epoch": 0.3982725527831094, "grad_norm": 50.449927443360075, "learning_rate": 3.763273988770296e-07, "logits/chosen": 0.40345683693885803, "logits/rejected": 0.39551275968551636, "logps/chosen": -453.79803466796875, "logps/rejected": -535.8180541992188, "loss": 0.4965, "rewards/accuracies": 0.75, "rewards/chosen": -1.3234025239944458, "rewards/margins": 0.8552868962287903, "rewards/rejected": -2.178689479827881, "step": 1660 }, { "epoch": 0.4006717850287908, "grad_norm": 45.22606638463477, "learning_rate": 3.7451613775663405e-07, "logits/chosen": 0.2254648655653, "logits/rejected": 0.15715382993221283, "logps/chosen": -444.4361267089844, "logps/rejected": -565.7696533203125, "loss": 0.5436, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3264541625976562, "rewards/margins": 1.2607003450393677, "rewards/rejected": -2.5871543884277344, "step": 1670 }, { "epoch": 0.40307101727447214, "grad_norm": 56.89213037695923, "learning_rate": 3.726961423238706e-07, "logits/chosen": 0.2933524250984192, "logits/rejected": 0.212088942527771, "logps/chosen": -426.76080322265625, "logps/rejected": -546.6845703125, "loss": 0.5149, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2648175954818726, "rewards/margins": 1.0558243989944458, "rewards/rejected": -2.3206419944763184, "step": 1680 }, { "epoch": 0.40547024952015354, "grad_norm": 48.93178210300578, "learning_rate": 3.708675402441882e-07, "logits/chosen": 0.2865277826786041, "logits/rejected": 0.37102895975112915, "logps/chosen": -484.88519287109375, "logps/rejected": -502.8384704589844, "loss": 0.5365, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2438859939575195, "rewards/margins": 0.6120424270629883, "rewards/rejected": -1.855928659439087, "step": 1690 }, { "epoch": 0.40786948176583493, "grad_norm": 41.4789900308926, "learning_rate": 3.6903045978675775e-07, "logits/chosen": 0.3034370541572571, "logits/rejected": 0.2625337243080139, "logps/chosen": -386.1392517089844, "logps/rejected": -470.08135986328125, "loss": 0.5013, "rewards/accuracies": 0.75, "rewards/chosen": -0.9028989672660828, "rewards/margins": 1.0875600576400757, "rewards/rejected": -1.9904590845108032, "step": 1700 }, { "epoch": 0.4102687140115163, "grad_norm": 35.84427094735192, "learning_rate": 3.6718502981547474e-07, "logits/chosen": 0.385175883769989, "logits/rejected": 0.2869270443916321, "logps/chosen": -436.753662109375, "logps/rejected": -548.1320190429688, "loss": 0.512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1051702499389648, "rewards/margins": 0.6421515345573425, "rewards/rejected": -1.7473220825195312, "step": 1710 }, { "epoch": 0.4126679462571977, "grad_norm": 36.76356281345392, "learning_rate": 3.6533137977991986e-07, "logits/chosen": 0.2681284248828888, "logits/rejected": 0.27597135305404663, "logps/chosen": -444.15826416015625, "logps/rejected": -524.8231201171875, "loss": 0.5344, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9779054522514343, "rewards/margins": 0.62675940990448, "rewards/rejected": -1.6046650409698486, "step": 1720 }, { "epoch": 0.41506717850287905, "grad_norm": 42.054568712185926, "learning_rate": 3.6346963970627865e-07, "logits/chosen": 0.3877958655357361, "logits/rejected": 0.2975226044654846, "logps/chosen": -420.9158630371094, "logps/rejected": -515.4686889648438, "loss": 0.5036, "rewards/accuracies": 0.75, "rewards/chosen": -1.0269657373428345, "rewards/margins": 0.7597817182540894, "rewards/rejected": -1.7867473363876343, "step": 1730 }, { "epoch": 0.41746641074856045, "grad_norm": 44.183218072360475, "learning_rate": 3.615999401882207e-07, "logits/chosen": 0.5101007223129272, "logits/rejected": 0.44053035974502563, "logps/chosen": -388.8902282714844, "logps/rejected": -512.8978881835938, "loss": 0.5121, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2427517175674438, "rewards/margins": 0.933813214302063, "rewards/rejected": -2.1765646934509277, "step": 1740 }, { "epoch": 0.41986564299424184, "grad_norm": 38.84095884357132, "learning_rate": 3.597224123777389e-07, "logits/chosen": 0.4116114675998688, "logits/rejected": 0.3661612570285797, "logps/chosen": -430.21990966796875, "logps/rejected": -544.9900512695312, "loss": 0.4882, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1606342792510986, "rewards/margins": 1.0220921039581299, "rewards/rejected": -2.1827263832092285, "step": 1750 }, { "epoch": 0.42226487523992323, "grad_norm": 48.48200071110331, "learning_rate": 3.5783718797595e-07, "logits/chosen": 0.3250165581703186, "logits/rejected": 0.41192755103111267, "logps/chosen": -487.70404052734375, "logps/rejected": -506.49530029296875, "loss": 0.5418, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.321012020111084, "rewards/margins": 0.7061235308647156, "rewards/rejected": -2.0271353721618652, "step": 1760 }, { "epoch": 0.4246641074856046, "grad_norm": 41.976087972205285, "learning_rate": 3.559443992238558e-07, "logits/chosen": 0.38490504026412964, "logits/rejected": 0.3506616950035095, "logps/chosen": -414.7301330566406, "logps/rejected": -553.3479614257812, "loss": 0.5277, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9950034022331238, "rewards/margins": 1.1345813274383545, "rewards/rejected": -2.129584550857544, "step": 1770 }, { "epoch": 0.42706333973128596, "grad_norm": 44.81649257476256, "learning_rate": 3.540441788930673e-07, "logits/chosen": 0.3962218165397644, "logits/rejected": 0.325061172246933, "logps/chosen": -467.65155029296875, "logps/rejected": -539.2551879882812, "loss": 0.4933, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1214022636413574, "rewards/margins": 1.1308571100234985, "rewards/rejected": -2.2522594928741455, "step": 1780 }, { "epoch": 0.42946257197696736, "grad_norm": 45.2856769850179, "learning_rate": 3.5213666027649123e-07, "logits/chosen": 0.33266204595565796, "logits/rejected": 0.3824441134929657, "logps/chosen": -480.2119140625, "logps/rejected": -476.08984375, "loss": 0.5238, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3972989320755005, "rewards/margins": 0.541749119758606, "rewards/rejected": -1.9390478134155273, "step": 1790 }, { "epoch": 0.43186180422264875, "grad_norm": 54.06455040727181, "learning_rate": 3.5022197717898017e-07, "logits/chosen": 0.19602210819721222, "logits/rejected": 0.23719044029712677, "logps/chosen": -394.2027587890625, "logps/rejected": -459.33221435546875, "loss": 0.4769, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.173718810081482, "rewards/margins": 0.8884655833244324, "rewards/rejected": -2.0621845722198486, "step": 1800 }, { "epoch": 0.43426103646833014, "grad_norm": 36.13993495552892, "learning_rate": 3.4830026390794633e-07, "logits/chosen": 0.16905806958675385, "logits/rejected": 0.14926643669605255, "logps/chosen": -505.55126953125, "logps/rejected": -551.2828979492188, "loss": 0.4745, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3583290576934814, "rewards/margins": 1.004029631614685, "rewards/rejected": -2.362358570098877, "step": 1810 }, { "epoch": 0.43666026871401153, "grad_norm": 32.25356254003183, "learning_rate": 3.4637165526394104e-07, "logits/chosen": 0.23928511142730713, "logits/rejected": 0.22237971425056458, "logps/chosen": -415.0269470214844, "logps/rejected": -494.46405029296875, "loss": 0.5278, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.063435435295105, "rewards/margins": 0.7565540671348572, "rewards/rejected": -1.819989562034607, "step": 1820 }, { "epoch": 0.43905950095969287, "grad_norm": 34.4419746511506, "learning_rate": 3.4443628653119814e-07, "logits/chosen": 0.27581119537353516, "logits/rejected": 0.24289298057556152, "logps/chosen": -466.73529052734375, "logps/rejected": -645.5926513671875, "loss": 0.5157, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2824543714523315, "rewards/margins": 1.4018195867538452, "rewards/rejected": -2.684274196624756, "step": 1830 }, { "epoch": 0.44145873320537427, "grad_norm": 41.075704870340594, "learning_rate": 3.424942934681453e-07, "logits/chosen": 0.27590471506118774, "logits/rejected": 0.33334219455718994, "logps/chosen": -408.6986083984375, "logps/rejected": -506.6996154785156, "loss": 0.5031, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9812175035476685, "rewards/margins": 1.1212607622146606, "rewards/rejected": -2.10247802734375, "step": 1840 }, { "epoch": 0.44385796545105566, "grad_norm": 51.35010815764105, "learning_rate": 3.405458122978804e-07, "logits/chosen": 0.28459858894348145, "logits/rejected": 0.24139773845672607, "logps/chosen": -467.11932373046875, "logps/rejected": -536.8555908203125, "loss": 0.4962, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.151214838027954, "rewards/margins": 0.9230279922485352, "rewards/rejected": -2.0742428302764893, "step": 1850 }, { "epoch": 0.44625719769673705, "grad_norm": 58.94555175884757, "learning_rate": 3.3859097969861633e-07, "logits/chosen": 0.3147757649421692, "logits/rejected": 0.296464741230011, "logps/chosen": -475.61175537109375, "logps/rejected": -521.8074951171875, "loss": 0.5255, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.389574646949768, "rewards/margins": 0.8023090362548828, "rewards/rejected": -2.1918835639953613, "step": 1860 }, { "epoch": 0.44865642994241844, "grad_norm": 43.58295874945141, "learning_rate": 3.366299327940936e-07, "logits/chosen": 0.2593730092048645, "logits/rejected": 0.1364545077085495, "logps/chosen": -485.0771484375, "logps/rejected": -578.6021728515625, "loss": 0.512, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2985343933105469, "rewards/margins": 0.7179661989212036, "rewards/rejected": -2.01650071144104, "step": 1870 }, { "epoch": 0.4510556621880998, "grad_norm": 33.90433109567072, "learning_rate": 3.3466280914396117e-07, "logits/chosen": 0.17524075508117676, "logits/rejected": 0.12327942997217178, "logps/chosen": -436.56536865234375, "logps/rejected": -551.0841064453125, "loss": 0.5111, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3039577007293701, "rewards/margins": 0.9412840604782104, "rewards/rejected": -2.24524188041687, "step": 1880 }, { "epoch": 0.4534548944337812, "grad_norm": 48.57573160920276, "learning_rate": 3.326897467341281e-07, "logits/chosen": 0.10545514523983002, "logits/rejected": 0.10264859348535538, "logps/chosen": -394.76776123046875, "logps/rejected": -494.9923400878906, "loss": 0.525, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1733514070510864, "rewards/margins": 0.8517268896102905, "rewards/rejected": -2.025078296661377, "step": 1890 }, { "epoch": 0.45585412667946257, "grad_norm": 52.20592211080183, "learning_rate": 3.3071088396708335e-07, "logits/chosen": 0.16945740580558777, "logits/rejected": 0.12676987051963806, "logps/chosen": -370.2272033691406, "logps/rejected": -489.0442810058594, "loss": 0.503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9613567590713501, "rewards/margins": 1.0801159143447876, "rewards/rejected": -2.0414726734161377, "step": 1900 }, { "epoch": 0.45825335892514396, "grad_norm": 39.11646762477483, "learning_rate": 3.2872635965218824e-07, "logits/chosen": 0.36154884099960327, "logits/rejected": 0.3266182541847229, "logps/chosen": -472.8251037597656, "logps/rejected": -570.266845703125, "loss": 0.524, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5062012672424316, "rewards/margins": 0.7707003355026245, "rewards/rejected": -2.2769012451171875, "step": 1910 }, { "epoch": 0.46065259117082535, "grad_norm": 33.57125004563553, "learning_rate": 3.2673631299593905e-07, "logits/chosen": 0.20721454918384552, "logits/rejected": 0.24143996834754944, "logps/chosen": -474.7398376464844, "logps/rejected": -535.4746704101562, "loss": 0.5204, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3117005825042725, "rewards/margins": 0.8077449798583984, "rewards/rejected": -2.119445562362671, "step": 1920 }, { "epoch": 0.4630518234165067, "grad_norm": 49.103327518032735, "learning_rate": 3.247408835922024e-07, "logits/chosen": 0.3439037799835205, "logits/rejected": 0.2654734253883362, "logps/chosen": -527.8143310546875, "logps/rejected": -611.921875, "loss": 0.506, "rewards/accuracies": 0.625, "rewards/chosen": -1.5940402746200562, "rewards/margins": 0.856569766998291, "rewards/rejected": -2.4506099224090576, "step": 1930 }, { "epoch": 0.4654510556621881, "grad_norm": 49.876953655813104, "learning_rate": 3.2274021141242306e-07, "logits/chosen": 0.43298500776290894, "logits/rejected": 0.4217461049556732, "logps/chosen": -458.80535888671875, "logps/rejected": -546.6775512695312, "loss": 0.4867, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2974848747253418, "rewards/margins": 0.8723615407943726, "rewards/rejected": -2.169846534729004, "step": 1940 }, { "epoch": 0.4678502879078695, "grad_norm": 59.65377964792493, "learning_rate": 3.2073443679580613e-07, "logits/chosen": 0.2417244166135788, "logits/rejected": 0.23499338328838348, "logps/chosen": -469.7822265625, "logps/rejected": -545.030517578125, "loss": 0.4895, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3010753393173218, "rewards/margins": 0.6901552081108093, "rewards/rejected": -1.9912303686141968, "step": 1950 }, { "epoch": 0.47024952015355087, "grad_norm": 48.010287716369675, "learning_rate": 3.1872370043947194e-07, "logits/chosen": 0.44626665115356445, "logits/rejected": 0.40652981400489807, "logps/chosen": -418.04608154296875, "logps/rejected": -536.6441650390625, "loss": 0.464, "rewards/accuracies": 0.875, "rewards/chosen": -0.8487616777420044, "rewards/margins": 1.3238131999969482, "rewards/rejected": -2.1725752353668213, "step": 1960 }, { "epoch": 0.47264875239923226, "grad_norm": 46.983751656355615, "learning_rate": 3.167081433885874e-07, "logits/chosen": 0.4636153280735016, "logits/rejected": 0.4149314761161804, "logps/chosen": -560.605712890625, "logps/rejected": -700.2041015625, "loss": 0.4653, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5527485609054565, "rewards/margins": 0.9842365384101868, "rewards/rejected": -2.536984920501709, "step": 1970 }, { "epoch": 0.4750479846449136, "grad_norm": 46.832783839770435, "learning_rate": 3.14687907026472e-07, "logits/chosen": 0.34756892919540405, "logits/rejected": 0.3631365895271301, "logps/chosen": -452.46533203125, "logps/rejected": -579.3941650390625, "loss": 0.5059, "rewards/accuracies": 0.875, "rewards/chosen": -1.4736191034317017, "rewards/margins": 1.1666433811187744, "rewards/rejected": -2.6402623653411865, "step": 1980 }, { "epoch": 0.477447216890595, "grad_norm": 51.65835958499199, "learning_rate": 3.126631330646801e-07, "logits/chosen": 0.28329282999038696, "logits/rejected": 0.23793701827526093, "logps/chosen": -574.2579956054688, "logps/rejected": -644.44677734375, "loss": 0.5244, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.125730037689209, "rewards/margins": 0.6223888397216797, "rewards/rejected": -2.7481188774108887, "step": 1990 }, { "epoch": 0.4798464491362764, "grad_norm": 56.94846979096699, "learning_rate": 3.1063396353306097e-07, "logits/chosen": 0.370736300945282, "logits/rejected": 0.43973660469055176, "logps/chosen": -495.07171630859375, "logps/rejected": -519.3072509765625, "loss": 0.5009, "rewards/accuracies": 0.75, "rewards/chosen": -1.2859165668487549, "rewards/margins": 0.9244076609611511, "rewards/rejected": -2.2103240489959717, "step": 2000 }, { "epoch": 0.4798464491362764, "eval_logits/chosen": 0.5136142373085022, "eval_logits/rejected": 0.4682252109050751, "eval_logps/chosen": -468.3976135253906, "eval_logps/rejected": -586.2582397460938, "eval_loss": 0.49979615211486816, "eval_rewards/accuracies": 0.7803571224212646, "eval_rewards/chosen": -1.4972540140151978, "eval_rewards/margins": 1.1174662113189697, "eval_rewards/rejected": -2.614720106124878, "eval_runtime": 185.1772, "eval_samples_per_second": 24.09, "eval_steps_per_second": 0.378, "step": 2000 }, { "epoch": 0.4822456813819578, "grad_norm": 61.64399490626884, "learning_rate": 3.0860054076979535e-07, "logits/chosen": 0.34754273295402527, "logits/rejected": 0.3305366635322571, "logps/chosen": -490.9940490722656, "logps/rejected": -570.1460571289062, "loss": 0.4823, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5707345008850098, "rewards/margins": 1.0668280124664307, "rewards/rejected": -2.6375622749328613, "step": 2010 }, { "epoch": 0.4846449136276392, "grad_norm": 54.18063166333372, "learning_rate": 3.065630074114115e-07, "logits/chosen": 0.3459337651729584, "logits/rejected": 0.36747267842292786, "logps/chosen": -486.1033630371094, "logps/rejected": -554.5687866210938, "loss": 0.5237, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3654518127441406, "rewards/margins": 1.115387201309204, "rewards/rejected": -2.4808387756347656, "step": 2020 }, { "epoch": 0.4870441458733205, "grad_norm": 48.37481533662561, "learning_rate": 3.0452150638277947e-07, "logits/chosen": 0.3800879120826721, "logits/rejected": 0.3224307894706726, "logps/chosen": -418.9652404785156, "logps/rejected": -507.47747802734375, "loss": 0.5304, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2180391550064087, "rewards/margins": 0.8009985685348511, "rewards/rejected": -2.0190374851226807, "step": 2030 }, { "epoch": 0.4894433781190019, "grad_norm": 35.82205388348395, "learning_rate": 3.024761808870856e-07, "logits/chosen": 0.43575650453567505, "logits/rejected": 0.3246951997280121, "logps/chosen": -394.49700927734375, "logps/rejected": -528.8324584960938, "loss": 0.4962, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9007269740104675, "rewards/margins": 1.4299715757369995, "rewards/rejected": -2.3306984901428223, "step": 2040 }, { "epoch": 0.4918426103646833, "grad_norm": 69.46282819499118, "learning_rate": 3.004271743957875e-07, "logits/chosen": 0.1757555603981018, "logits/rejected": 0.11282005161046982, "logps/chosen": -492.5611877441406, "logps/rejected": -602.2737426757812, "loss": 0.5181, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.534406304359436, "rewards/margins": 0.9544004201889038, "rewards/rejected": -2.48880672454834, "step": 2050 }, { "epoch": 0.4942418426103647, "grad_norm": 45.79139932334549, "learning_rate": 2.983746306385499e-07, "logits/chosen": 0.26720863580703735, "logits/rejected": 0.22653250396251678, "logps/chosen": -450.50970458984375, "logps/rejected": -577.3936767578125, "loss": 0.481, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3142211437225342, "rewards/margins": 1.129241704940796, "rewards/rejected": -2.44346284866333, "step": 2060 }, { "epoch": 0.4966410748560461, "grad_norm": 43.321422686785745, "learning_rate": 2.963186935931628e-07, "logits/chosen": 0.3077266812324524, "logits/rejected": 0.2476225346326828, "logps/chosen": -480.34619140625, "logps/rejected": -587.4752197265625, "loss": 0.489, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.302478551864624, "rewards/margins": 1.1229597330093384, "rewards/rejected": -2.425438404083252, "step": 2070 }, { "epoch": 0.4990403071017274, "grad_norm": 45.25903246638121, "learning_rate": 2.9425950747544176e-07, "logits/chosen": 0.2362133264541626, "logits/rejected": 0.20862069725990295, "logps/chosen": -528.6657104492188, "logps/rejected": -640.0977172851562, "loss": 0.4865, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.531398057937622, "rewards/margins": 1.434666633605957, "rewards/rejected": -2.966064929962158, "step": 2080 }, { "epoch": 0.5014395393474088, "grad_norm": 46.57096517661485, "learning_rate": 2.921972167291119e-07, "logits/chosen": 0.1148526519536972, "logits/rejected": 0.0883648619055748, "logps/chosen": -483.4339904785156, "logps/rejected": -601.8978271484375, "loss": 0.4907, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3184032440185547, "rewards/margins": 0.981913685798645, "rewards/rejected": -2.3003170490264893, "step": 2090 }, { "epoch": 0.5038387715930902, "grad_norm": 39.63447443095947, "learning_rate": 2.9013196601567567e-07, "logits/chosen": 0.08627250045537949, "logits/rejected": 0.10158304125070572, "logps/chosen": -421.615478515625, "logps/rejected": -528.127685546875, "loss": 0.5409, "rewards/accuracies": 0.75, "rewards/chosen": -1.1398550271987915, "rewards/margins": 0.95441073179245, "rewards/rejected": -2.0942656993865967, "step": 2100 }, { "epoch": 0.5062380038387716, "grad_norm": 36.05433569174663, "learning_rate": 2.8806390020426555e-07, "logits/chosen": 0.0710478127002716, "logits/rejected": 0.05162844806909561, "logps/chosen": -453.2339782714844, "logps/rejected": -557.0374145507812, "loss": 0.4948, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1150130033493042, "rewards/margins": 1.007678747177124, "rewards/rejected": -2.1226916313171387, "step": 2110 }, { "epoch": 0.508637236084453, "grad_norm": 50.26433068268168, "learning_rate": 2.8599316436148187e-07, "logits/chosen": 0.24139384925365448, "logits/rejected": 0.21716871857643127, "logps/chosen": -447.4005432128906, "logps/rejected": -504.754638671875, "loss": 0.4837, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.421812653541565, "rewards/margins": 0.5791618227958679, "rewards/rejected": -2.000974655151367, "step": 2120 }, { "epoch": 0.5110364683301344, "grad_norm": 48.79279161854594, "learning_rate": 2.8391990374121723e-07, "logits/chosen": 0.14107191562652588, "logits/rejected": 0.05996360257267952, "logps/chosen": -447.6856384277344, "logps/rejected": -572.6692504882812, "loss": 0.5309, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3457109928131104, "rewards/margins": 1.006974458694458, "rewards/rejected": -2.3526854515075684, "step": 2130 }, { "epoch": 0.5134357005758158, "grad_norm": 49.554326824350056, "learning_rate": 2.818442637744669e-07, "logits/chosen": 0.14974358677864075, "logits/rejected": 0.07151228934526443, "logps/chosen": -468.7398986816406, "logps/rejected": -561.0519409179688, "loss": 0.529, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4705628156661987, "rewards/margins": 0.9486227035522461, "rewards/rejected": -2.4191856384277344, "step": 2140 }, { "epoch": 0.5158349328214972, "grad_norm": 49.03414708222374, "learning_rate": 2.797663900591284e-07, "logits/chosen": 0.12192866951227188, "logits/rejected": 0.1623045951128006, "logps/chosen": -484.0595703125, "logps/rejected": -536.2966918945312, "loss": 0.4726, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.466080904006958, "rewards/margins": 0.9258912205696106, "rewards/rejected": -2.3919718265533447, "step": 2150 }, { "epoch": 0.5182341650671785, "grad_norm": 47.623947511820035, "learning_rate": 2.776864283497874e-07, "logits/chosen": 0.2551673352718353, "logits/rejected": 0.25919514894485474, "logps/chosen": -450.1902770996094, "logps/rejected": -603.4628295898438, "loss": 0.5047, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.427159309387207, "rewards/margins": 1.6337556838989258, "rewards/rejected": -3.0609147548675537, "step": 2160 }, { "epoch": 0.5206333973128598, "grad_norm": 39.32544622434657, "learning_rate": 2.756045245474943e-07, "logits/chosen": 0.1113271713256836, "logits/rejected": 0.06980106979608536, "logps/chosen": -477.89288330078125, "logps/rejected": -590.3850708007812, "loss": 0.5136, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3194444179534912, "rewards/margins": 0.8708696365356445, "rewards/rejected": -2.1903140544891357, "step": 2170 }, { "epoch": 0.5230326295585412, "grad_norm": 41.32084808239206, "learning_rate": 2.7352082468952977e-07, "logits/chosen": 0.18554797768592834, "logits/rejected": 0.10465570539236069, "logps/chosen": -464.09027099609375, "logps/rejected": -614.1561279296875, "loss": 0.5172, "rewards/accuracies": 0.75, "rewards/chosen": -1.5660805702209473, "rewards/margins": 1.2777998447418213, "rewards/rejected": -2.8438801765441895, "step": 2180 }, { "epoch": 0.5254318618042226, "grad_norm": 67.92788558845768, "learning_rate": 2.7143547493916e-07, "logits/chosen": 0.18377096951007843, "logits/rejected": 0.10271792113780975, "logps/chosen": -409.5820007324219, "logps/rejected": -570.1399536132812, "loss": 0.4842, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.100378394126892, "rewards/margins": 1.5573487281799316, "rewards/rejected": -2.657727003097534, "step": 2190 }, { "epoch": 0.527831094049904, "grad_norm": 50.707037903665324, "learning_rate": 2.693486215753853e-07, "logits/chosen": 0.12866708636283875, "logits/rejected": 0.06411238014698029, "logps/chosen": -418.26715087890625, "logps/rejected": -512.0023193359375, "loss": 0.5342, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2301840782165527, "rewards/margins": 1.0781285762786865, "rewards/rejected": -2.30831241607666, "step": 2200 }, { "epoch": 0.5302303262955854, "grad_norm": 46.86231101360818, "learning_rate": 2.6726041098267805e-07, "logits/chosen": -0.031896281987428665, "logits/rejected": -0.030716899782419205, "logps/chosen": -487.5838928222656, "logps/rejected": -485.93646240234375, "loss": 0.5532, "rewards/accuracies": 0.625, "rewards/chosen": -1.3748475313186646, "rewards/margins": 0.40763726830482483, "rewards/rejected": -1.7824846506118774, "step": 2210 }, { "epoch": 0.5326295585412668, "grad_norm": 72.55957637434223, "learning_rate": 2.6517098964071507e-07, "logits/chosen": 0.23304399847984314, "logits/rejected": 0.22425612807273865, "logps/chosen": -444.8831481933594, "logps/rejected": -506.5235900878906, "loss": 0.546, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1518399715423584, "rewards/margins": 0.4722941517829895, "rewards/rejected": -1.6241340637207031, "step": 2220 }, { "epoch": 0.5350287907869482, "grad_norm": 43.727526427782365, "learning_rate": 2.630805041141023e-07, "logits/chosen": 0.2689264118671417, "logits/rejected": 0.2254217565059662, "logps/chosen": -403.8577880859375, "logps/rejected": -512.3160400390625, "loss": 0.5046, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.058393120765686, "rewards/margins": 0.9750925302505493, "rewards/rejected": -2.0334856510162354, "step": 2230 }, { "epoch": 0.5374280230326296, "grad_norm": 49.11098211804801, "learning_rate": 2.609891010420941e-07, "logits/chosen": 0.18164226412773132, "logits/rejected": 0.17124636471271515, "logps/chosen": -454.2110290527344, "logps/rejected": -558.4046630859375, "loss": 0.465, "rewards/accuracies": 0.875, "rewards/chosen": -1.1852877140045166, "rewards/margins": 1.1636625528335571, "rewards/rejected": -2.3489503860473633, "step": 2240 }, { "epoch": 0.539827255278311, "grad_norm": 43.26353709722887, "learning_rate": 2.5889692712830674e-07, "logits/chosen": 0.052560679614543915, "logits/rejected": 0.03842206671833992, "logps/chosen": -396.25408935546875, "logps/rejected": -478.73236083984375, "loss": 0.4734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9645735621452332, "rewards/margins": 0.9311714172363281, "rewards/rejected": -1.895745038986206, "step": 2250 }, { "epoch": 0.5422264875239923, "grad_norm": 47.10267589353339, "learning_rate": 2.5680412913042843e-07, "logits/chosen": 0.23019644618034363, "logits/rejected": 0.179383784532547, "logps/chosen": -421.8323669433594, "logps/rejected": -528.0892333984375, "loss": 0.5044, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2181730270385742, "rewards/margins": 1.0768169164657593, "rewards/rejected": -2.294990062713623, "step": 2260 }, { "epoch": 0.5446257197696737, "grad_norm": 49.07465366967735, "learning_rate": 2.5471085384992404e-07, "logits/chosen": 0.21075716614723206, "logits/rejected": 0.0905676931142807, "logps/chosen": -402.1150207519531, "logps/rejected": -584.5428466796875, "loss": 0.4919, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.061937928199768, "rewards/margins": 1.6330102682113647, "rewards/rejected": -2.694948196411133, "step": 2270 }, { "epoch": 0.5470249520153551, "grad_norm": 47.06580983617911, "learning_rate": 2.526172481217381e-07, "logits/chosen": 0.28002408146858215, "logits/rejected": 0.19437995553016663, "logps/chosen": -421.2408142089844, "logps/rejected": -556.0794067382812, "loss": 0.5198, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5300331115722656, "rewards/margins": 1.192158579826355, "rewards/rejected": -2.722191572189331, "step": 2280 }, { "epoch": 0.5494241842610365, "grad_norm": 42.697690556320396, "learning_rate": 2.5052345880399456e-07, "logits/chosen": 0.336375892162323, "logits/rejected": 0.33653944730758667, "logps/chosen": -417.27496337890625, "logps/rejected": -494.6957092285156, "loss": 0.4616, "rewards/accuracies": 0.75, "rewards/chosen": -1.37350594997406, "rewards/margins": 0.7126041650772095, "rewards/rejected": -2.0861101150512695, "step": 2290 }, { "epoch": 0.5518234165067178, "grad_norm": 44.24690759792965, "learning_rate": 2.4842963276769555e-07, "logits/chosen": 0.46479305624961853, "logits/rejected": 0.34474366903305054, "logps/chosen": -428.14227294921875, "logps/rejected": -594.80224609375, "loss": 0.5059, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.475941777229309, "rewards/margins": 1.2353615760803223, "rewards/rejected": -2.711303472518921, "step": 2300 }, { "epoch": 0.5542226487523992, "grad_norm": 42.732671934213585, "learning_rate": 2.463359168864189e-07, "logits/chosen": 0.25363442301750183, "logits/rejected": 0.3057165741920471, "logps/chosen": -501.9913635253906, "logps/rejected": -549.3098754882812, "loss": 0.5308, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.290345311164856, "rewards/margins": 1.010578989982605, "rewards/rejected": -2.300924301147461, "step": 2310 }, { "epoch": 0.5566218809980806, "grad_norm": 56.4377037562831, "learning_rate": 2.4424245802601555e-07, "logits/chosen": 0.2584269642829895, "logits/rejected": 0.18541845679283142, "logps/chosen": -429.4263610839844, "logps/rejected": -564.8827514648438, "loss": 0.4823, "rewards/accuracies": 0.75, "rewards/chosen": -1.1343291997909546, "rewards/margins": 0.8212429285049438, "rewards/rejected": -1.9555721282958984, "step": 2320 }, { "epoch": 0.559021113243762, "grad_norm": 43.562067174648554, "learning_rate": 2.421494030343072e-07, "logits/chosen": 0.3927503228187561, "logits/rejected": 0.4579402506351471, "logps/chosen": -454.2933044433594, "logps/rejected": -463.19879150390625, "loss": 0.5602, "rewards/accuracies": 0.75, "rewards/chosen": -1.2649152278900146, "rewards/margins": 0.6680216193199158, "rewards/rejected": -1.9329369068145752, "step": 2330 }, { "epoch": 0.5614203454894434, "grad_norm": 58.65475476508653, "learning_rate": 2.400568987307861e-07, "logits/chosen": 0.4964686334133148, "logits/rejected": 0.5107001662254333, "logps/chosen": -432.31341552734375, "logps/rejected": -462.37957763671875, "loss": 0.4724, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.243574857711792, "rewards/margins": 0.5827276110649109, "rewards/rejected": -1.8263022899627686, "step": 2340 }, { "epoch": 0.5638195777351248, "grad_norm": 58.315209990127244, "learning_rate": 2.379650918963156e-07, "logits/chosen": 0.3746911585330963, "logits/rejected": 0.3054753541946411, "logps/chosen": -421.2218322753906, "logps/rejected": -541.8524169921875, "loss": 0.4768, "rewards/accuracies": 0.75, "rewards/chosen": -1.4736093282699585, "rewards/margins": 1.0763620138168335, "rewards/rejected": -2.549971342086792, "step": 2350 }, { "epoch": 0.5662188099808061, "grad_norm": 48.959910400597586, "learning_rate": 2.3587412926283438e-07, "logits/chosen": 0.35963717103004456, "logits/rejected": 0.28781235218048096, "logps/chosen": -480.2315368652344, "logps/rejected": -566.6637573242188, "loss": 0.5414, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1883009672164917, "rewards/margins": 1.270986795425415, "rewards/rejected": -2.459287643432617, "step": 2360 }, { "epoch": 0.5686180422264875, "grad_norm": 30.962931166603095, "learning_rate": 2.337841575030642e-07, "logits/chosen": 0.35713425278663635, "logits/rejected": 0.30424803495407104, "logps/chosen": -492.8209533691406, "logps/rejected": -574.771240234375, "loss": 0.5061, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.396402359008789, "rewards/margins": 0.7614862322807312, "rewards/rejected": -2.157888889312744, "step": 2370 }, { "epoch": 0.5710172744721689, "grad_norm": 42.85766086532651, "learning_rate": 2.316953232202206e-07, "logits/chosen": 0.550395131111145, "logits/rejected": 0.6783905029296875, "logps/chosen": -430.48162841796875, "logps/rejected": -421.599853515625, "loss": 0.485, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2630993127822876, "rewards/margins": 0.6726707220077515, "rewards/rejected": -1.9357702732086182, "step": 2380 }, { "epoch": 0.5734165067178503, "grad_norm": 38.41377863495817, "learning_rate": 2.2960777293772958e-07, "logits/chosen": 0.5615749359130859, "logits/rejected": 0.6018954515457153, "logps/chosen": -397.6216125488281, "logps/rejected": -480.299560546875, "loss": 0.4712, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.1725983619689941, "rewards/margins": 0.9747906923294067, "rewards/rejected": -2.1473889350891113, "step": 2390 }, { "epoch": 0.5758157389635317, "grad_norm": 40.03422172905619, "learning_rate": 2.2752165308894974e-07, "logits/chosen": 0.46104907989501953, "logits/rejected": 0.44198736548423767, "logps/chosen": -378.5218505859375, "logps/rejected": -456.65576171875, "loss": 0.479, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1959477663040161, "rewards/margins": 0.9340691566467285, "rewards/rejected": -2.130016803741455, "step": 2400 }, { "epoch": 0.5782149712092131, "grad_norm": 54.11084171812038, "learning_rate": 2.254371100069005e-07, "logits/chosen": 0.457451730966568, "logits/rejected": 0.320446252822876, "logps/chosen": -431.6908264160156, "logps/rejected": -538.86865234375, "loss": 0.4874, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1469987630844116, "rewards/margins": 0.8605014085769653, "rewards/rejected": -2.007500171661377, "step": 2410 }, { "epoch": 0.5806142034548945, "grad_norm": 54.79494157401916, "learning_rate": 2.2335428991399725e-07, "logits/chosen": 0.47143587470054626, "logits/rejected": 0.4143534302711487, "logps/chosen": -398.853271484375, "logps/rejected": -594.490966796875, "loss": 0.5197, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.449357509613037, "rewards/margins": 1.8377138376235962, "rewards/rejected": -3.287071704864502, "step": 2420 }, { "epoch": 0.5830134357005758, "grad_norm": 47.37355935293041, "learning_rate": 2.2127333891179458e-07, "logits/chosen": 0.4510342478752136, "logits/rejected": 0.36793094873428345, "logps/chosen": -419.24163818359375, "logps/rejected": -571.1874389648438, "loss": 0.5193, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3948280811309814, "rewards/margins": 1.2814536094665527, "rewards/rejected": -2.676281690597534, "step": 2430 }, { "epoch": 0.5854126679462572, "grad_norm": 65.89730578952388, "learning_rate": 2.1919440297073782e-07, "logits/chosen": 0.3510410785675049, "logits/rejected": 0.3182118535041809, "logps/chosen": -415.6324157714844, "logps/rejected": -527.152099609375, "loss": 0.5265, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5300906896591187, "rewards/margins": 1.0138437747955322, "rewards/rejected": -2.5439343452453613, "step": 2440 }, { "epoch": 0.5878119001919386, "grad_norm": 42.30705937238165, "learning_rate": 2.1711762791992368e-07, "logits/chosen": 0.43873363733291626, "logits/rejected": 0.46004414558410645, "logps/chosen": -474.68341064453125, "logps/rejected": -534.9579467773438, "loss": 0.5254, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1993557214736938, "rewards/margins": 0.8993379473686218, "rewards/rejected": -2.098693370819092, "step": 2450 }, { "epoch": 0.5902111324376199, "grad_norm": 45.61922027456477, "learning_rate": 2.1504315943687114e-07, "logits/chosen": 0.18166793882846832, "logits/rejected": 0.07724637538194656, "logps/chosen": -408.96893310546875, "logps/rejected": -581.2828979492188, "loss": 0.4625, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0965474843978882, "rewards/margins": 1.2784209251403809, "rewards/rejected": -2.3749685287475586, "step": 2460 }, { "epoch": 0.5926103646833013, "grad_norm": 53.517726559327514, "learning_rate": 2.1297114303730248e-07, "logits/chosen": 0.3896231949329376, "logits/rejected": 0.2409631460905075, "logps/chosen": -423.19287109375, "logps/rejected": -586.328857421875, "loss": 0.5399, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.327804446220398, "rewards/margins": 1.076370120048523, "rewards/rejected": -2.404174327850342, "step": 2470 }, { "epoch": 0.5950095969289827, "grad_norm": 41.01802265556329, "learning_rate": 2.1090172406493616e-07, "logits/chosen": 0.3331597149372101, "logits/rejected": 0.2225189208984375, "logps/chosen": -397.5386657714844, "logps/rejected": -519.840087890625, "loss": 0.4603, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.131462812423706, "rewards/margins": 0.950838565826416, "rewards/rejected": -2.082301378250122, "step": 2480 }, { "epoch": 0.5974088291746641, "grad_norm": 60.21604361600221, "learning_rate": 2.0883504768129146e-07, "logits/chosen": 0.30570241808891296, "logits/rejected": 0.24165184795856476, "logps/chosen": -461.2522888183594, "logps/rejected": -565.5457763671875, "loss": 0.5168, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1941462755203247, "rewards/margins": 1.1086372137069702, "rewards/rejected": -2.302783489227295, "step": 2490 }, { "epoch": 0.5998080614203455, "grad_norm": 45.67541969535949, "learning_rate": 2.0677125885550571e-07, "logits/chosen": 0.4085448384284973, "logits/rejected": 0.48327702283859253, "logps/chosen": -436.59857177734375, "logps/rejected": -471.85498046875, "loss": 0.4864, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.352922797203064, "rewards/margins": 0.7829147577285767, "rewards/rejected": -2.1358375549316406, "step": 2500 }, { "epoch": 0.6022072936660269, "grad_norm": 60.523710599155514, "learning_rate": 2.0471050235416587e-07, "logits/chosen": 0.14623039960861206, "logits/rejected": 0.19062075018882751, "logps/chosen": -451.1435546875, "logps/rejected": -491.1160583496094, "loss": 0.4579, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3129417896270752, "rewards/margins": 0.9290571212768555, "rewards/rejected": -2.2419991493225098, "step": 2510 }, { "epoch": 0.6046065259117083, "grad_norm": 52.23271499985374, "learning_rate": 2.026529227311532e-07, "logits/chosen": 0.29617246985435486, "logits/rejected": 0.2822147011756897, "logps/chosen": -423.54315185546875, "logps/rejected": -501.5276794433594, "loss": 0.5351, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.437732458114624, "rewards/margins": 0.6575521230697632, "rewards/rejected": -2.0952847003936768, "step": 2520 }, { "epoch": 0.6070057581573897, "grad_norm": 44.954316731149845, "learning_rate": 2.005986643175036e-07, "logits/chosen": 0.3328186571598053, "logits/rejected": 0.2537630498409271, "logps/chosen": -454.51580810546875, "logps/rejected": -574.1419677734375, "loss": 0.4529, "rewards/accuracies": 0.75, "rewards/chosen": -1.1176398992538452, "rewards/margins": 1.3029248714447021, "rewards/rejected": -2.420564889907837, "step": 2530 }, { "epoch": 0.6094049904030711, "grad_norm": 63.16908223607974, "learning_rate": 1.9854787121128328e-07, "logits/chosen": 0.31036069989204407, "logits/rejected": 0.34982046484947205, "logps/chosen": -397.23980712890625, "logps/rejected": -403.78509521484375, "loss": 0.5048, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.217882752418518, "rewards/margins": 0.5209786891937256, "rewards/rejected": -1.7388614416122437, "step": 2540 }, { "epoch": 0.6118042226487524, "grad_norm": 54.60861450055549, "learning_rate": 1.9650068726748106e-07, "logits/chosen": 0.3659752309322357, "logits/rejected": 0.35895493626594543, "logps/chosen": -461.1573181152344, "logps/rejected": -573.6448364257812, "loss": 0.5214, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.474023699760437, "rewards/margins": 1.084517240524292, "rewards/rejected": -2.5585405826568604, "step": 2550 }, { "epoch": 0.6142034548944337, "grad_norm": 60.46600684768552, "learning_rate": 1.9445725608791718e-07, "logits/chosen": 0.34406715631484985, "logits/rejected": 0.28216245770454407, "logps/chosen": -460.77978515625, "logps/rejected": -619.3160400390625, "loss": 0.497, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3329025506973267, "rewards/margins": 1.6671111583709717, "rewards/rejected": -3.000014066696167, "step": 2560 }, { "epoch": 0.6166026871401151, "grad_norm": 47.40884309447939, "learning_rate": 1.924177210111705e-07, "logits/chosen": 0.29457220435142517, "logits/rejected": 0.29915186762809753, "logps/chosen": -407.5345153808594, "logps/rejected": -541.1593017578125, "loss": 0.5182, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.2408500909805298, "rewards/margins": 1.2775036096572876, "rewards/rejected": -2.5183534622192383, "step": 2570 }, { "epoch": 0.6190019193857965, "grad_norm": 45.99798362644753, "learning_rate": 1.9038222510252364e-07, "logits/chosen": 0.25425729155540466, "logits/rejected": 0.24261541664600372, "logps/chosen": -444.73992919921875, "logps/rejected": -504.9520568847656, "loss": 0.499, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1787656545639038, "rewards/margins": 0.8464619517326355, "rewards/rejected": -2.0252277851104736, "step": 2580 }, { "epoch": 0.6214011516314779, "grad_norm": 52.02289887758591, "learning_rate": 1.883509111439277e-07, "logits/chosen": 0.3976004123687744, "logits/rejected": 0.28759509325027466, "logps/chosen": -438.90008544921875, "logps/rejected": -641.5413818359375, "loss": 0.5341, "rewards/accuracies": 0.875, "rewards/chosen": -1.4814860820770264, "rewards/margins": 1.3605000972747803, "rewards/rejected": -2.8419861793518066, "step": 2590 }, { "epoch": 0.6238003838771593, "grad_norm": 32.303683781858304, "learning_rate": 1.8632392162398665e-07, "logits/chosen": 0.23672600090503693, "logits/rejected": 0.15976786613464355, "logps/chosen": -484.119873046875, "logps/rejected": -645.5545654296875, "loss": 0.4698, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -1.1118555068969727, "rewards/margins": 1.7390865087509155, "rewards/rejected": -2.8509418964385986, "step": 2600 }, { "epoch": 0.6261996161228407, "grad_norm": 45.29250569251351, "learning_rate": 1.84301398727962e-07, "logits/chosen": 0.4794914722442627, "logits/rejected": 0.37679189443588257, "logps/chosen": -368.31109619140625, "logps/rejected": -579.0586547851562, "loss": 0.496, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0693645477294922, "rewards/margins": 1.7801597118377686, "rewards/rejected": -2.8495242595672607, "step": 2610 }, { "epoch": 0.6285988483685221, "grad_norm": 62.710557092048646, "learning_rate": 1.8228348432779966e-07, "logits/chosen": 0.2735206186771393, "logits/rejected": 0.24361078441143036, "logps/chosen": -426.20111083984375, "logps/rejected": -496.5086364746094, "loss": 0.5445, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3294744491577148, "rewards/margins": 0.8788881301879883, "rewards/rejected": -2.208362579345703, "step": 2620 }, { "epoch": 0.6309980806142035, "grad_norm": 73.52463716987671, "learning_rate": 1.8027031997217773e-07, "logits/chosen": 0.3862006962299347, "logits/rejected": 0.27332574129104614, "logps/chosen": -411.3087463378906, "logps/rejected": -538.5333251953125, "loss": 0.4613, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4152649641036987, "rewards/margins": 1.0575337409973145, "rewards/rejected": -2.4727988243103027, "step": 2630 }, { "epoch": 0.6333973128598849, "grad_norm": 47.969317887923054, "learning_rate": 1.7826204687657758e-07, "logits/chosen": 0.2865044176578522, "logits/rejected": 0.33233708143234253, "logps/chosen": -468.67108154296875, "logps/rejected": -502.1822814941406, "loss": 0.4738, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1685740947723389, "rewards/margins": 0.8533760905265808, "rewards/rejected": -2.0219502449035645, "step": 2640 }, { "epoch": 0.6357965451055663, "grad_norm": 46.75296720560617, "learning_rate": 1.762588059133781e-07, "logits/chosen": 0.3442167043685913, "logits/rejected": 0.4161573350429535, "logps/chosen": -470.08807373046875, "logps/rejected": -546.142578125, "loss": 0.4768, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.227698564529419, "rewards/margins": 1.140878677368164, "rewards/rejected": -2.368577003479004, "step": 2650 }, { "epoch": 0.6381957773512476, "grad_norm": 50.401385190215464, "learning_rate": 1.7426073760197406e-07, "logits/chosen": 0.10545764863491058, "logits/rejected": 0.012745514512062073, "logps/chosen": -432.08685302734375, "logps/rejected": -608.319091796875, "loss": 0.4988, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.235762119293213, "rewards/margins": 1.4522688388824463, "rewards/rejected": -2.688030958175659, "step": 2660 }, { "epoch": 0.6405950095969289, "grad_norm": 45.71956495654583, "learning_rate": 1.7226798209891935e-07, "logits/chosen": 0.20430830121040344, "logits/rejected": 0.2912927269935608, "logps/chosen": -453.98944091796875, "logps/rejected": -510.1253967285156, "loss": 0.4619, "rewards/accuracies": 0.875, "rewards/chosen": -1.4355871677398682, "rewards/margins": 1.2053136825561523, "rewards/rejected": -2.6409008502960205, "step": 2670 }, { "epoch": 0.6429942418426103, "grad_norm": 45.15170026766255, "learning_rate": 1.7028067918809535e-07, "logits/chosen": 0.3014266788959503, "logits/rejected": 0.22763225436210632, "logps/chosen": -384.4528503417969, "logps/rejected": -595.7291870117188, "loss": 0.4945, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.171205997467041, "rewards/margins": 1.6346750259399414, "rewards/rejected": -2.8058810234069824, "step": 2680 }, { "epoch": 0.6453934740882917, "grad_norm": 64.83143640863342, "learning_rate": 1.6829896827090584e-07, "logits/chosen": 0.21800704300403595, "logits/rejected": 0.220147043466568, "logps/chosen": -443.7188415527344, "logps/rejected": -480.36907958984375, "loss": 0.5184, "rewards/accuracies": 0.625, "rewards/chosen": -1.3957051038742065, "rewards/margins": 0.6850441694259644, "rewards/rejected": -2.080749273300171, "step": 2690 }, { "epoch": 0.6477927063339731, "grad_norm": 37.274772223125495, "learning_rate": 1.6632298835649844e-07, "logits/chosen": 0.3046364486217499, "logits/rejected": 0.18461188673973083, "logps/chosen": -469.7455139160156, "logps/rejected": -623.6341552734375, "loss": 0.477, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3073039054870605, "rewards/margins": 1.1427420377731323, "rewards/rejected": -2.4500460624694824, "step": 2700 }, { "epoch": 0.6501919385796545, "grad_norm": 91.33316289592031, "learning_rate": 1.6435287805201364e-07, "logits/chosen": 0.46277904510498047, "logits/rejected": 0.40250563621520996, "logps/chosen": -462.3423767089844, "logps/rejected": -543.9591064453125, "loss": 0.5314, "rewards/accuracies": 0.75, "rewards/chosen": -1.518293023109436, "rewards/margins": 0.8617793321609497, "rewards/rejected": -2.3800723552703857, "step": 2710 }, { "epoch": 0.6525911708253359, "grad_norm": 44.87565160003366, "learning_rate": 1.6238877555286207e-07, "logits/chosen": 0.35751184821128845, "logits/rejected": 0.29743391275405884, "logps/chosen": -469.69561767578125, "logps/rejected": -606.2034301757812, "loss": 0.4479, "rewards/accuracies": 0.875, "rewards/chosen": -1.2520592212677002, "rewards/margins": 1.2662980556488037, "rewards/rejected": -2.518357276916504, "step": 2720 }, { "epoch": 0.6549904030710173, "grad_norm": 44.32240198316999, "learning_rate": 1.60430818633031e-07, "logits/chosen": 0.16691644489765167, "logits/rejected": 0.14331945776939392, "logps/chosen": -449.48876953125, "logps/rejected": -561.8910522460938, "loss": 0.4539, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.131084680557251, "rewards/margins": 1.254732370376587, "rewards/rejected": -2.385816812515259, "step": 2730 }, { "epoch": 0.6573896353166987, "grad_norm": 39.01641038215159, "learning_rate": 1.5847914463541939e-07, "logits/chosen": 0.3676902651786804, "logits/rejected": 0.34273606538772583, "logps/chosen": -374.8681640625, "logps/rejected": -478.32330322265625, "loss": 0.4745, "rewards/accuracies": 0.75, "rewards/chosen": -1.1049184799194336, "rewards/margins": 0.8594606518745422, "rewards/rejected": -1.964379072189331, "step": 2740 }, { "epoch": 0.6597888675623801, "grad_norm": 35.89167519955917, "learning_rate": 1.5653389046220427e-07, "logits/chosen": 0.3571329414844513, "logits/rejected": 0.27262359857559204, "logps/chosen": -399.9897155761719, "logps/rejected": -535.5849609375, "loss": 0.4737, "rewards/accuracies": 0.875, "rewards/chosen": -1.1511871814727783, "rewards/margins": 1.0949671268463135, "rewards/rejected": -2.246154308319092, "step": 2750 }, { "epoch": 0.6621880998080614, "grad_norm": 74.88730166916955, "learning_rate": 1.545951925652375e-07, "logits/chosen": 0.3250289559364319, "logits/rejected": 0.39422863721847534, "logps/chosen": -507.34735107421875, "logps/rejected": -564.6778564453125, "loss": 0.4953, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2604314088821411, "rewards/margins": 1.2383835315704346, "rewards/rejected": -2.4988150596618652, "step": 2760 }, { "epoch": 0.6645873320537428, "grad_norm": 43.31844151941509, "learning_rate": 1.5266318693647423e-07, "logits/chosen": 0.38096925616264343, "logits/rejected": 0.4018251299858093, "logps/chosen": -460.6954040527344, "logps/rejected": -567.9718017578125, "loss": 0.4693, "rewards/accuracies": 0.75, "rewards/chosen": -1.2558958530426025, "rewards/margins": 1.0993343591690063, "rewards/rejected": -2.3552298545837402, "step": 2770 }, { "epoch": 0.6669865642994242, "grad_norm": 72.85191786475721, "learning_rate": 1.5073800909843353e-07, "logits/chosen": 0.25220975279808044, "logits/rejected": 0.3646177649497986, "logps/chosen": -450.23193359375, "logps/rejected": -503.99127197265625, "loss": 0.4707, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2164779901504517, "rewards/margins": 1.0934855937957764, "rewards/rejected": -2.3099634647369385, "step": 2780 }, { "epoch": 0.6693857965451055, "grad_norm": 63.40322968247712, "learning_rate": 1.488197940946922e-07, "logits/chosen": 0.23376190662384033, "logits/rejected": 0.22447574138641357, "logps/chosen": -456.8228454589844, "logps/rejected": -523.5567626953125, "loss": 0.4689, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1302400827407837, "rewards/margins": 1.2681411504745483, "rewards/rejected": -2.398381233215332, "step": 2790 }, { "epoch": 0.6717850287907869, "grad_norm": 66.69199487516, "learning_rate": 1.4690867648041167e-07, "logits/chosen": 0.16230645775794983, "logits/rejected": 0.1882302314043045, "logps/chosen": -434.5381774902344, "logps/rejected": -552.4396362304688, "loss": 0.5021, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1278281211853027, "rewards/margins": 1.5061180591583252, "rewards/rejected": -2.633945941925049, "step": 2800 }, { "epoch": 0.6741842610364683, "grad_norm": 46.91530215907862, "learning_rate": 1.4500479031289987e-07, "logits/chosen": 0.15237310528755188, "logits/rejected": 0.1518753319978714, "logps/chosen": -466.0179138183594, "logps/rejected": -572.3275146484375, "loss": 0.517, "rewards/accuracies": 0.875, "rewards/chosen": -1.2266777753829956, "rewards/margins": 1.2364604473114014, "rewards/rejected": -2.4631385803222656, "step": 2810 }, { "epoch": 0.6765834932821497, "grad_norm": 55.81983630093274, "learning_rate": 1.4310826914220747e-07, "logits/chosen": 0.17195823788642883, "logits/rejected": 0.16844519972801208, "logps/chosen": -536.1735229492188, "logps/rejected": -609.2791748046875, "loss": 0.5282, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5774872303009033, "rewards/margins": 0.9089029431343079, "rewards/rejected": -2.4863903522491455, "step": 2820 }, { "epoch": 0.6789827255278311, "grad_norm": 53.80796549341863, "learning_rate": 1.412192460017597e-07, "logits/chosen": 0.1955575793981552, "logits/rejected": 0.12785163521766663, "logps/chosen": -444.7312927246094, "logps/rejected": -568.8245849609375, "loss": 0.5024, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.395450472831726, "rewards/margins": 1.22873055934906, "rewards/rejected": -2.624181032180786, "step": 2830 }, { "epoch": 0.6813819577735125, "grad_norm": 44.72827252256254, "learning_rate": 1.3933785339902504e-07, "logits/chosen": 0.27861329913139343, "logits/rejected": 0.13766932487487793, "logps/chosen": -376.69805908203125, "logps/rejected": -530.3146362304688, "loss": 0.5003, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1893621683120728, "rewards/margins": 1.1048251390457153, "rewards/rejected": -2.294187307357788, "step": 2840 }, { "epoch": 0.6837811900191939, "grad_norm": 38.90141505727746, "learning_rate": 1.374642233062197e-07, "logits/chosen": 0.1925538331270218, "logits/rejected": 0.17995335161685944, "logps/chosen": -486.0704650878906, "logps/rejected": -545.6419067382812, "loss": 0.5175, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2210705280303955, "rewards/margins": 1.031243085861206, "rewards/rejected": -2.2523136138916016, "step": 2850 }, { "epoch": 0.6861804222648752, "grad_norm": 38.934538649501114, "learning_rate": 1.355984871510511e-07, "logits/chosen": 0.24185729026794434, "logits/rejected": 0.16981028020381927, "logps/chosen": -488.8545837402344, "logps/rejected": -586.666015625, "loss": 0.4586, "rewards/accuracies": 0.875, "rewards/chosen": -1.2682818174362183, "rewards/margins": 0.9824774861335754, "rewards/rejected": -2.2507593631744385, "step": 2860 }, { "epoch": 0.6885796545105566, "grad_norm": 45.29750933331666, "learning_rate": 1.3374077580749783e-07, "logits/chosen": 0.29279276728630066, "logits/rejected": 0.1869848519563675, "logps/chosen": -351.5196838378906, "logps/rejected": -475.97308349609375, "loss": 0.5016, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.038088321685791, "rewards/margins": 1.0932366847991943, "rewards/rejected": -2.1313250064849854, "step": 2870 }, { "epoch": 0.690978886756238, "grad_norm": 48.530711516673115, "learning_rate": 1.3189121958663024e-07, "logits/chosen": 0.1910950392484665, "logits/rejected": 0.2789291739463806, "logps/chosen": -532.8553466796875, "logps/rejected": -549.0731201171875, "loss": 0.502, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6616315841674805, "rewards/margins": 0.5708137154579163, "rewards/rejected": -2.232445240020752, "step": 2880 }, { "epoch": 0.6933781190019194, "grad_norm": 49.658508332103274, "learning_rate": 1.3004994822746895e-07, "logits/chosen": 0.08187554031610489, "logits/rejected": 0.053650178015232086, "logps/chosen": -428.33087158203125, "logps/rejected": -530.8117065429688, "loss": 0.5162, "rewards/accuracies": 0.75, "rewards/chosen": -1.238884687423706, "rewards/margins": 0.9637983441352844, "rewards/rejected": -2.2026829719543457, "step": 2890 }, { "epoch": 0.6957773512476008, "grad_norm": 49.423150125943955, "learning_rate": 1.2821709088788434e-07, "logits/chosen": 0.2585221827030182, "logits/rejected": 0.17918451130390167, "logps/chosen": -400.4766845703125, "logps/rejected": -514.9324951171875, "loss": 0.5073, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2878179550170898, "rewards/margins": 1.140520691871643, "rewards/rejected": -2.4283385276794434, "step": 2900 }, { "epoch": 0.6981765834932822, "grad_norm": 59.81963849634542, "learning_rate": 1.2639277613553736e-07, "logits/chosen": 0.39327603578567505, "logits/rejected": 0.3338584899902344, "logps/chosen": -380.1231994628906, "logps/rejected": -466.969482421875, "loss": 0.4731, "rewards/accuracies": 0.75, "rewards/chosen": -1.2669470310211182, "rewards/margins": 0.8520339727401733, "rewards/rejected": -2.11898136138916, "step": 2910 }, { "epoch": 0.7005758157389635, "grad_norm": 47.91120083091996, "learning_rate": 1.2457713193885975e-07, "logits/chosen": 0.23712964355945587, "logits/rejected": 0.10274624824523926, "logps/chosen": -363.88116455078125, "logps/rejected": -501.58477783203125, "loss": 0.4744, "rewards/accuracies": 0.75, "rewards/chosen": -1.3259981870651245, "rewards/margins": 1.0507750511169434, "rewards/rejected": -2.3767733573913574, "step": 2920 }, { "epoch": 0.7029750479846449, "grad_norm": 56.163566511516365, "learning_rate": 1.2277028565807838e-07, "logits/chosen": 0.2799941599369049, "logits/rejected": 0.2706086039543152, "logps/chosen": -432.5113220214844, "logps/rejected": -512.887451171875, "loss": 0.5009, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1611391305923462, "rewards/margins": 0.9016637802124023, "rewards/rejected": -2.062802791595459, "step": 2930 }, { "epoch": 0.7053742802303263, "grad_norm": 62.43277712323061, "learning_rate": 1.209723640362815e-07, "logits/chosen": 0.16554930806159973, "logits/rejected": 0.1359563171863556, "logps/chosen": -462.38568115234375, "logps/rejected": -574.197998046875, "loss": 0.5577, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3521636724472046, "rewards/margins": 1.263946294784546, "rewards/rejected": -2.616110324859619, "step": 2940 }, { "epoch": 0.7077735124760077, "grad_norm": 33.88826120125574, "learning_rate": 1.191834931905277e-07, "logits/chosen": 0.20565947890281677, "logits/rejected": 0.13917942345142365, "logps/chosen": -520.4049072265625, "logps/rejected": -632.361328125, "loss": 0.4599, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5277377367019653, "rewards/margins": 1.1449778079986572, "rewards/rejected": -2.672715425491333, "step": 2950 }, { "epoch": 0.710172744721689, "grad_norm": 45.009587506259074, "learning_rate": 1.1740379860299988e-07, "logits/chosen": 0.2947765588760376, "logits/rejected": 0.23910513520240784, "logps/chosen": -472.980712890625, "logps/rejected": -579.03125, "loss": 0.497, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3068562746047974, "rewards/margins": 0.9038209915161133, "rewards/rejected": -2.210677146911621, "step": 2960 }, { "epoch": 0.7125719769673704, "grad_norm": 47.44511342924861, "learning_rate": 1.1563340511220254e-07, "logits/chosen": 0.2019500434398651, "logits/rejected": 0.2147335559129715, "logps/chosen": -510.0350646972656, "logps/rejected": -596.2499389648438, "loss": 0.5062, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.336971402168274, "rewards/margins": 1.1666083335876465, "rewards/rejected": -2.503579616546631, "step": 2970 }, { "epoch": 0.7149712092130518, "grad_norm": 42.57941151152834, "learning_rate": 1.1387243690420556e-07, "logits/chosen": 0.23384490609169006, "logits/rejected": 0.20733702182769775, "logps/chosen": -481.803955078125, "logps/rejected": -632.8770751953125, "loss": 0.4655, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2207635641098022, "rewards/margins": 1.6195507049560547, "rewards/rejected": -2.8403146266937256, "step": 2980 }, { "epoch": 0.7173704414587332, "grad_norm": 64.95455682456064, "learning_rate": 1.1212101750393235e-07, "logits/chosen": 0.3023291528224945, "logits/rejected": 0.30834710597991943, "logps/chosen": -450.0244140625, "logps/rejected": -551.0206298828125, "loss": 0.4357, "rewards/accuracies": 0.75, "rewards/chosen": -1.409967303276062, "rewards/margins": 1.2566007375717163, "rewards/rejected": -2.666567802429199, "step": 2990 }, { "epoch": 0.7197696737044146, "grad_norm": 46.170115289110555, "learning_rate": 1.1037926976649562e-07, "logits/chosen": 0.22152157127857208, "logits/rejected": 0.16806095838546753, "logps/chosen": -476.97320556640625, "logps/rejected": -616.3040161132812, "loss": 0.5408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4931066036224365, "rewards/margins": 1.1700246334075928, "rewards/rejected": -2.6631312370300293, "step": 3000 }, { "epoch": 0.722168905950096, "grad_norm": 57.89732258915135, "learning_rate": 1.0864731586857936e-07, "logits/chosen": 0.3043791949748993, "logits/rejected": 0.36210864782333374, "logps/chosen": -495.12164306640625, "logps/rejected": -574.9592895507812, "loss": 0.46, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4336802959442139, "rewards/margins": 1.215421199798584, "rewards/rejected": -2.649101495742798, "step": 3010 }, { "epoch": 0.7245681381957774, "grad_norm": 49.111012867250984, "learning_rate": 1.0692527729986839e-07, "logits/chosen": 0.11315940320491791, "logits/rejected": 0.11848314106464386, "logps/chosen": -460.1648864746094, "logps/rejected": -543.1566162109375, "loss": 0.4285, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2527508735656738, "rewards/margins": 1.067068338394165, "rewards/rejected": -2.3198189735412598, "step": 3020 }, { "epoch": 0.7269673704414588, "grad_norm": 57.35606077595889, "learning_rate": 1.0521327485452692e-07, "logits/chosen": 0.347392201423645, "logits/rejected": 0.3210673928260803, "logps/chosen": -450.1835021972656, "logps/rejected": -524.6959228515625, "loss": 0.4912, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2702230215072632, "rewards/margins": 1.0865848064422607, "rewards/rejected": -2.3568077087402344, "step": 3030 }, { "epoch": 0.7293666026871402, "grad_norm": 56.3165468310005, "learning_rate": 1.0351142862272468e-07, "logits/chosen": 0.209666445851326, "logits/rejected": 0.20282307267189026, "logps/chosen": -423.931396484375, "logps/rejected": -572.3831176757812, "loss": 0.4984, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4169180393218994, "rewards/margins": 1.6322886943817139, "rewards/rejected": -3.049206256866455, "step": 3040 }, { "epoch": 0.7317658349328215, "grad_norm": 47.57484794011745, "learning_rate": 1.0181985798221343e-07, "logits/chosen": 0.3013080060482025, "logits/rejected": 0.2218068540096283, "logps/chosen": -470.23480224609375, "logps/rejected": -589.8034057617188, "loss": 0.5126, "rewards/accuracies": 0.75, "rewards/chosen": -1.3861111402511597, "rewards/margins": 1.0853662490844727, "rewards/rejected": -2.4714770317077637, "step": 3050 }, { "epoch": 0.7341650671785028, "grad_norm": 48.06894623911944, "learning_rate": 1.0013868158995329e-07, "logits/chosen": 0.3860154449939728, "logits/rejected": 0.3630084991455078, "logps/chosen": -442.2177734375, "logps/rejected": -527.9283447265625, "loss": 0.4642, "rewards/accuracies": 0.875, "rewards/chosen": -1.2381409406661987, "rewards/margins": 1.22637939453125, "rewards/rejected": -2.4645204544067383, "step": 3060 }, { "epoch": 0.7365642994241842, "grad_norm": 51.169486765513234, "learning_rate": 9.84680173737887e-08, "logits/chosen": 0.2769750952720642, "logits/rejected": 0.2646028995513916, "logps/chosen": -475.0011291503906, "logps/rejected": -548.9890747070312, "loss": 0.4765, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4719974994659424, "rewards/margins": 1.0918984413146973, "rewards/rejected": -2.5638959407806396, "step": 3070 }, { "epoch": 0.7389635316698656, "grad_norm": 58.59714970661162, "learning_rate": 9.680798252417713e-08, "logits/chosen": 0.2717548906803131, "logits/rejected": 0.2305576503276825, "logps/chosen": -379.20928955078125, "logps/rejected": -515.2022094726562, "loss": 0.492, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.101180076599121, "rewards/margins": 0.944907009601593, "rewards/rejected": -2.0460872650146484, "step": 3080 }, { "epoch": 0.741362763915547, "grad_norm": 61.65719550385752, "learning_rate": 9.515869348596808e-08, "logits/chosen": 0.11913663148880005, "logits/rejected": 0.09342759847640991, "logps/chosen": -497.87109375, "logps/rejected": -598.5771484375, "loss": 0.4878, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4787302017211914, "rewards/margins": 1.3738664388656616, "rewards/rejected": -2.8525967597961426, "step": 3090 }, { "epoch": 0.7437619961612284, "grad_norm": 41.218790972775324, "learning_rate": 9.352026595023493e-08, "logits/chosen": 0.10429096221923828, "logits/rejected": 0.1541799008846283, "logps/chosen": -517.5309448242188, "logps/rejected": -543.2216796875, "loss": 0.4985, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4588356018066406, "rewards/margins": 0.6233514547348022, "rewards/rejected": -2.0821871757507324, "step": 3100 }, { "epoch": 0.7461612284069098, "grad_norm": 64.90615052640716, "learning_rate": 9.189281484616004e-08, "logits/chosen": 0.22654108703136444, "logits/rejected": 0.1651889979839325, "logps/chosen": -402.1455383300781, "logps/rejected": -556.6002807617188, "loss": 0.5169, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.490733027458191, "rewards/margins": 0.9633838534355164, "rewards/rejected": -2.4541170597076416, "step": 3110 }, { "epoch": 0.7485604606525912, "grad_norm": 55.8925921234848, "learning_rate": 9.027645433297249e-08, "logits/chosen": 0.11542461812496185, "logits/rejected": 0.17937800288200378, "logps/chosen": -566.5679321289062, "logps/rejected": -637.2242431640625, "loss": 0.5183, "rewards/accuracies": 0.75, "rewards/chosen": -1.827355146408081, "rewards/margins": 1.0894877910614014, "rewards/rejected": -2.9168429374694824, "step": 3120 }, { "epoch": 0.7509596928982726, "grad_norm": 54.00400306277147, "learning_rate": 8.867129779194066e-08, "logits/chosen": 0.16981378197669983, "logits/rejected": 0.16173888742923737, "logps/chosen": -371.0014953613281, "logps/rejected": -522.0635986328125, "loss": 0.4864, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.9543946981430054, "rewards/margins": 1.564900517463684, "rewards/rejected": -2.5192952156066895, "step": 3130 }, { "epoch": 0.753358925143954, "grad_norm": 54.599194100775584, "learning_rate": 8.707745781841866e-08, "logits/chosen": 0.14470471441745758, "logits/rejected": 0.1468985676765442, "logps/chosen": -400.3184509277344, "logps/rejected": -513.9260864257812, "loss": 0.5147, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.221286416053772, "rewards/margins": 1.1234190464019775, "rewards/rejected": -2.344705581665039, "step": 3140 }, { "epoch": 0.7557581573896354, "grad_norm": 34.75601738944086, "learning_rate": 8.549504621394831e-08, "logits/chosen": 0.15695925056934357, "logits/rejected": 0.14711011946201324, "logps/chosen": -413.1109313964844, "logps/rejected": -539.1842041015625, "loss": 0.427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.069771409034729, "rewards/margins": 1.3148638010025024, "rewards/rejected": -2.3846354484558105, "step": 3150 }, { "epoch": 0.7581573896353166, "grad_norm": 53.105874033112755, "learning_rate": 8.392417397841703e-08, "logits/chosen": 0.26591944694519043, "logits/rejected": 0.26246827840805054, "logps/chosen": -447.65557861328125, "logps/rejected": -544.1351318359375, "loss": 0.4888, "rewards/accuracies": 0.75, "rewards/chosen": -1.2605499029159546, "rewards/margins": 0.8398078083992004, "rewards/rejected": -2.1003577709198, "step": 3160 }, { "epoch": 0.760556621880998, "grad_norm": 47.78743297051488, "learning_rate": 8.236495130227083e-08, "logits/chosen": 0.24016205966472626, "logits/rejected": 0.31962883472442627, "logps/chosen": -456.42724609375, "logps/rejected": -570.5203247070312, "loss": 0.4993, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.1726109981536865, "rewards/margins": 1.4965537786483765, "rewards/rejected": -2.6691648960113525, "step": 3170 }, { "epoch": 0.7629558541266794, "grad_norm": 47.980852069933555, "learning_rate": 8.081748755878612e-08, "logits/chosen": 0.2495994120836258, "logits/rejected": 0.2759885787963867, "logps/chosen": -469.076171875, "logps/rejected": -524.9566040039062, "loss": 0.4968, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3896160125732422, "rewards/margins": 0.9030616879463196, "rewards/rejected": -2.292677879333496, "step": 3180 }, { "epoch": 0.7653550863723608, "grad_norm": 44.55472031419905, "learning_rate": 7.928189129639632e-08, "logits/chosen": 0.2707396149635315, "logits/rejected": 0.21236738562583923, "logps/chosen": -433.556884765625, "logps/rejected": -539.517578125, "loss": 0.4522, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2524731159210205, "rewards/margins": 0.9954684972763062, "rewards/rejected": -2.247941493988037, "step": 3190 }, { "epoch": 0.7677543186180422, "grad_norm": 77.95104725852434, "learning_rate": 7.775827023107834e-08, "logits/chosen": 0.18351641297340393, "logits/rejected": 0.18833932280540466, "logps/chosen": -446.3948669433594, "logps/rejected": -545.6039428710938, "loss": 0.5189, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3583651781082153, "rewards/margins": 0.7764785289764404, "rewards/rejected": -2.1348438262939453, "step": 3200 }, { "epoch": 0.7701535508637236, "grad_norm": 60.91165565345474, "learning_rate": 7.624673123879682e-08, "logits/chosen": 0.03742004930973053, "logits/rejected": 0.07750044018030167, "logps/chosen": -426.2344665527344, "logps/rejected": -510.4020080566406, "loss": 0.5048, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3010095357894897, "rewards/margins": 0.9792767763137817, "rewards/rejected": -2.2802863121032715, "step": 3210 }, { "epoch": 0.772552783109405, "grad_norm": 43.5674600823645, "learning_rate": 7.474738034800663e-08, "logits/chosen": 0.13723036646842957, "logits/rejected": 0.04767593368887901, "logps/chosen": -369.3411560058594, "logps/rejected": -486.69439697265625, "loss": 0.4954, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0865916013717651, "rewards/margins": 1.3701813220977783, "rewards/rejected": -2.456772804260254, "step": 3220 }, { "epoch": 0.7749520153550864, "grad_norm": 65.92599613926842, "learning_rate": 7.326032273221606e-08, "logits/chosen": 0.23154711723327637, "logits/rejected": 0.1886422336101532, "logps/chosen": -478.3605041503906, "logps/rejected": -570.8961791992188, "loss": 0.4831, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2519872188568115, "rewards/margins": 1.1383593082427979, "rewards/rejected": -2.3903465270996094, "step": 3230 }, { "epoch": 0.7773512476007678, "grad_norm": 45.83026398609644, "learning_rate": 7.178566270260872e-08, "logits/chosen": 0.31105470657348633, "logits/rejected": 0.22554393112659454, "logps/chosen": -447.1808166503906, "logps/rejected": -576.032958984375, "loss": 0.5129, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2397021055221558, "rewards/margins": 0.951197624206543, "rewards/rejected": -2.190899610519409, "step": 3240 }, { "epoch": 0.7797504798464492, "grad_norm": 50.507648401741996, "learning_rate": 7.032350370072709e-08, "logits/chosen": 0.19485214352607727, "logits/rejected": 0.18930187821388245, "logps/chosen": -456.4967346191406, "logps/rejected": -569.6973266601562, "loss": 0.4481, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2981529235839844, "rewards/margins": 1.2217051982879639, "rewards/rejected": -2.5198581218719482, "step": 3250 }, { "epoch": 0.7821497120921305, "grad_norm": 40.68087545077646, "learning_rate": 6.887394829121596e-08, "logits/chosen": 0.2527236044406891, "logits/rejected": 0.20923948287963867, "logps/chosen": -455.47454833984375, "logps/rejected": -632.2030029296875, "loss": 0.4543, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.2574024200439453, "rewards/margins": 1.9225542545318604, "rewards/rejected": -3.1799566745758057, "step": 3260 }, { "epoch": 0.7845489443378119, "grad_norm": 37.54734198368332, "learning_rate": 6.743709815462833e-08, "logits/chosen": 0.10011599957942963, "logits/rejected": 0.1131478101015091, "logps/chosen": -462.3744201660156, "logps/rejected": -519.977783203125, "loss": 0.4665, "rewards/accuracies": 0.75, "rewards/chosen": -1.3748726844787598, "rewards/margins": 0.9147384762763977, "rewards/rejected": -2.2896108627319336, "step": 3270 }, { "epoch": 0.7869481765834933, "grad_norm": 51.364098272276145, "learning_rate": 6.601305408029287e-08, "logits/chosen": 0.41624197363853455, "logits/rejected": 0.4190692901611328, "logps/chosen": -458.64141845703125, "logps/rejected": -567.7257080078125, "loss": 0.4664, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5802090167999268, "rewards/margins": 1.1085751056671143, "rewards/rejected": -2.688784122467041, "step": 3280 }, { "epoch": 0.7893474088291746, "grad_norm": 48.68347118403701, "learning_rate": 6.460191595924366e-08, "logits/chosen": 0.23670163750648499, "logits/rejected": 0.21305176615715027, "logps/chosen": -472.20654296875, "logps/rejected": -575.2575073242188, "loss": 0.4741, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4002991914749146, "rewards/margins": 1.0437225103378296, "rewards/rejected": -2.444021701812744, "step": 3290 }, { "epoch": 0.791746641074856, "grad_norm": 56.29885219772071, "learning_rate": 6.320378277721342e-08, "logits/chosen": 0.3236589729785919, "logits/rejected": 0.2942892014980316, "logps/chosen": -485.74609375, "logps/rejected": -548.2264404296875, "loss": 0.4937, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.748716950416565, "rewards/margins": 0.7379652261734009, "rewards/rejected": -2.4866819381713867, "step": 3300 }, { "epoch": 0.7941458733205374, "grad_norm": 47.30513911873481, "learning_rate": 6.181875260769032e-08, "logits/chosen": 0.21434447169303894, "logits/rejected": 0.29501864314079285, "logps/chosen": -473.3141174316406, "logps/rejected": -513.0935668945312, "loss": 0.4824, "rewards/accuracies": 0.75, "rewards/chosen": -1.042864441871643, "rewards/margins": 1.146907925605774, "rewards/rejected": -2.189772129058838, "step": 3310 }, { "epoch": 0.7965451055662188, "grad_norm": 43.836702306292864, "learning_rate": 6.044692260503797e-08, "logits/chosen": 0.2978779673576355, "logits/rejected": 0.2920413911342621, "logps/chosen": -517.2478637695312, "logps/rejected": -626.5977172851562, "loss": 0.4366, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4270732402801514, "rewards/margins": 1.378542184829712, "rewards/rejected": -2.805615186691284, "step": 3320 }, { "epoch": 0.7989443378119002, "grad_norm": 49.783712311366116, "learning_rate": 5.9088388997680984e-08, "logits/chosen": 0.15503938496112823, "logits/rejected": 0.19135913252830505, "logps/chosen": -540.2918090820312, "logps/rejected": -589.90185546875, "loss": 0.4691, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.3969385623931885, "rewards/margins": 1.2311842441558838, "rewards/rejected": -2.6281230449676514, "step": 3330 }, { "epoch": 0.8013435700575816, "grad_norm": 52.57420620553349, "learning_rate": 5.774324708135439e-08, "logits/chosen": 0.2751420736312866, "logits/rejected": 0.28755050897598267, "logps/chosen": -397.3004150390625, "logps/rejected": -484.384521484375, "loss": 0.4858, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2007606029510498, "rewards/margins": 1.0477242469787598, "rewards/rejected": -2.2484848499298096, "step": 3340 }, { "epoch": 0.803742802303263, "grad_norm": 40.1972577695682, "learning_rate": 5.641159121241953e-08, "logits/chosen": 0.32921257615089417, "logits/rejected": 0.24844393134117126, "logps/chosen": -387.3114013671875, "logps/rejected": -536.5883178710938, "loss": 0.4903, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1569197177886963, "rewards/margins": 1.0728175640106201, "rewards/rejected": -2.2297370433807373, "step": 3350 }, { "epoch": 0.8061420345489443, "grad_norm": 41.69598167340838, "learning_rate": 5.5093514801245106e-08, "logits/chosen": 0.3076106905937195, "logits/rejected": 0.2400285303592682, "logps/chosen": -443.32000732421875, "logps/rejected": -577.48388671875, "loss": 0.4817, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3214082717895508, "rewards/margins": 0.9961814880371094, "rewards/rejected": -2.317589521408081, "step": 3360 }, { "epoch": 0.8085412667946257, "grad_norm": 38.422027437084395, "learning_rate": 5.378911030565453e-08, "logits/chosen": 0.3213488757610321, "logits/rejected": 0.26428383588790894, "logps/chosen": -506.6258239746094, "logps/rejected": -639.1539916992188, "loss": 0.4834, "rewards/accuracies": 0.75, "rewards/chosen": -1.5233229398727417, "rewards/margins": 1.1094070672988892, "rewards/rejected": -2.6327297687530518, "step": 3370 }, { "epoch": 0.8109404990403071, "grad_norm": 44.33236145563771, "learning_rate": 5.249846922444101e-08, "logits/chosen": 0.3445442318916321, "logits/rejected": 0.2675052285194397, "logps/chosen": -402.5491638183594, "logps/rejected": -543.9547729492188, "loss": 0.462, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3842418193817139, "rewards/margins": 1.5147311687469482, "rewards/rejected": -2.898972988128662, "step": 3380 }, { "epoch": 0.8133397312859885, "grad_norm": 58.63982281658398, "learning_rate": 5.122168209094865e-08, "logits/chosen": 0.38930395245552063, "logits/rejected": 0.36614999175071716, "logps/chosen": -429.3528747558594, "logps/rejected": -487.43408203125, "loss": 0.4705, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5928863286972046, "rewards/margins": 0.6625052094459534, "rewards/rejected": -2.2553915977478027, "step": 3390 }, { "epoch": 0.8157389635316699, "grad_norm": 41.613843164350314, "learning_rate": 4.995883846672222e-08, "logits/chosen": 0.14363157749176025, "logits/rejected": 0.2796134054660797, "logps/chosen": -592.6302490234375, "logps/rejected": -587.6798706054688, "loss": 0.4708, "rewards/accuracies": 0.75, "rewards/chosen": -1.5286242961883545, "rewards/margins": 0.759235143661499, "rewards/rejected": -2.2878596782684326, "step": 3400 }, { "epoch": 0.8181381957773513, "grad_norm": 49.604272632089646, "learning_rate": 4.871002693522486e-08, "logits/chosen": 0.2720317244529724, "logits/rejected": 0.25077277421951294, "logps/chosen": -463.8601989746094, "logps/rejected": -517.4568481445312, "loss": 0.4856, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3305190801620483, "rewards/margins": 0.8297308683395386, "rewards/rejected": -2.160250186920166, "step": 3410 }, { "epoch": 0.8205374280230326, "grad_norm": 40.03296260410171, "learning_rate": 4.7475335095623956e-08, "logits/chosen": 0.34070852398872375, "logits/rejected": 0.2651143968105316, "logps/chosen": -466.72686767578125, "logps/rejected": -563.7396240234375, "loss": 0.4721, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4763226509094238, "rewards/margins": 1.2001755237579346, "rewards/rejected": -2.6764981746673584, "step": 3420 }, { "epoch": 0.822936660268714, "grad_norm": 80.18666057349425, "learning_rate": 4.6254849556646714e-08, "logits/chosen": 0.22728531062602997, "logits/rejected": 0.229964017868042, "logps/chosen": -496.54852294921875, "logps/rejected": -587.7725830078125, "loss": 0.4966, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.4686188697814941, "rewards/margins": 1.2523690462112427, "rewards/rejected": -2.7209877967834473, "step": 3430 }, { "epoch": 0.8253358925143954, "grad_norm": 52.794660060456266, "learning_rate": 4.504865593050483e-08, "logits/chosen": 0.27111780643463135, "logits/rejected": 0.2475912868976593, "logps/chosen": -477.52685546875, "logps/rejected": -583.2651977539062, "loss": 0.5015, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5320662260055542, "rewards/margins": 0.9184083938598633, "rewards/rejected": -2.450474500656128, "step": 3440 }, { "epoch": 0.8277351247600768, "grad_norm": 63.34169787369902, "learning_rate": 4.385683882688895e-08, "logits/chosen": 0.15275821089744568, "logits/rejected": 0.20857541263103485, "logps/chosen": -512.3521728515625, "logps/rejected": -510.0169982910156, "loss": 0.5622, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6563478708267212, "rewards/margins": 0.5664867162704468, "rewards/rejected": -2.222834587097168, "step": 3450 }, { "epoch": 0.8301343570057581, "grad_norm": 59.152646437947276, "learning_rate": 4.2679481847033985e-08, "logits/chosen": 0.3345550298690796, "logits/rejected": 0.3184022009372711, "logps/chosen": -458.943115234375, "logps/rejected": -580.5496826171875, "loss": 0.5152, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.367032527923584, "rewards/margins": 1.1215214729309082, "rewards/rejected": -2.488554000854492, "step": 3460 }, { "epoch": 0.8325335892514395, "grad_norm": 41.522970046635024, "learning_rate": 4.151666757785435e-08, "logits/chosen": 0.25053077936172485, "logits/rejected": 0.21285638213157654, "logps/chosen": -415.4532165527344, "logps/rejected": -565.3435668945312, "loss": 0.467, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.0187116861343384, "rewards/margins": 1.5517404079437256, "rewards/rejected": -2.5704522132873535, "step": 3470 }, { "epoch": 0.8349328214971209, "grad_norm": 50.23472101986963, "learning_rate": 4.036847758615136e-08, "logits/chosen": 0.23763033747673035, "logits/rejected": 0.23918600380420685, "logps/chosen": -477.56292724609375, "logps/rejected": -576.02490234375, "loss": 0.5032, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8143908977508545, "rewards/margins": 0.8722183108329773, "rewards/rejected": -2.6866097450256348, "step": 3480 }, { "epoch": 0.8373320537428023, "grad_norm": 45.74910365878837, "learning_rate": 3.923499241289113e-08, "logits/chosen": 0.160926952958107, "logits/rejected": 0.19261090457439423, "logps/chosen": -533.9952392578125, "logps/rejected": -552.327880859375, "loss": 0.5377, "rewards/accuracies": 0.75, "rewards/chosen": -1.6078903675079346, "rewards/margins": 0.8734768033027649, "rewards/rejected": -2.4813671112060547, "step": 3490 }, { "epoch": 0.8397312859884837, "grad_norm": 47.87346283993082, "learning_rate": 3.811629156755541e-08, "logits/chosen": 0.1999920904636383, "logits/rejected": 0.14960861206054688, "logps/chosen": -488.1973571777344, "logps/rejected": -596.3568115234375, "loss": 0.4956, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.2728978395462036, "rewards/margins": 1.2190895080566406, "rewards/rejected": -2.4919872283935547, "step": 3500 }, { "epoch": 0.8421305182341651, "grad_norm": 41.03513231238894, "learning_rate": 3.701245352256391e-08, "logits/chosen": 0.2294701635837555, "logits/rejected": 0.25733810663223267, "logps/chosen": -478.146484375, "logps/rejected": -508.96856689453125, "loss": 0.4831, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1556288003921509, "rewards/margins": 0.6834120750427246, "rewards/rejected": -1.839040756225586, "step": 3510 }, { "epoch": 0.8445297504798465, "grad_norm": 40.5674011892533, "learning_rate": 3.592355570776984e-08, "logits/chosen": 0.1878044307231903, "logits/rejected": 0.14977982640266418, "logps/chosen": -398.69970703125, "logps/rejected": -515.438232421875, "loss": 0.4747, "rewards/accuracies": 0.875, "rewards/chosen": -1.043157935142517, "rewards/margins": 1.0835729837417603, "rewards/rejected": -2.1267309188842773, "step": 3520 }, { "epoch": 0.8469289827255279, "grad_norm": 42.604163064101506, "learning_rate": 3.484967450502904e-08, "logits/chosen": 0.3040066361427307, "logits/rejected": 0.23765726387500763, "logps/chosen": -383.44561767578125, "logps/rejected": -547.45703125, "loss": 0.4804, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1220704317092896, "rewards/margins": 1.1863175630569458, "rewards/rejected": -2.3083879947662354, "step": 3530 }, { "epoch": 0.8493282149712092, "grad_norm": 59.806153925908724, "learning_rate": 3.3790885242841296e-08, "logits/chosen": 0.13462401926517487, "logits/rejected": 0.1024751216173172, "logps/chosen": -459.56915283203125, "logps/rejected": -603.9002685546875, "loss": 0.4678, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4029033184051514, "rewards/margins": 1.477888584136963, "rewards/rejected": -2.8807921409606934, "step": 3540 }, { "epoch": 0.8517274472168906, "grad_norm": 58.73779989635176, "learning_rate": 3.274726219106677e-08, "logits/chosen": 0.09248481690883636, "logits/rejected": 0.07832972705364227, "logps/chosen": -512.8543701171875, "logps/rejected": -601.7901611328125, "loss": 0.4885, "rewards/accuracies": 0.75, "rewards/chosen": -1.3987281322479248, "rewards/margins": 1.0073614120483398, "rewards/rejected": -2.4060897827148438, "step": 3550 }, { "epoch": 0.8541266794625719, "grad_norm": 47.36729249212975, "learning_rate": 3.171887855571642e-08, "logits/chosen": 0.23542580008506775, "logits/rejected": 0.21016255021095276, "logps/chosen": -400.10943603515625, "logps/rejected": -472.996337890625, "loss": 0.4859, "rewards/accuracies": 0.75, "rewards/chosen": -1.2242614030838013, "rewards/margins": 0.7899783253669739, "rewards/rejected": -2.014239549636841, "step": 3560 }, { "epoch": 0.8565259117082533, "grad_norm": 51.17436258863895, "learning_rate": 3.070580647381643e-08, "logits/chosen": 0.2268662452697754, "logits/rejected": 0.17909319698810577, "logps/chosen": -437.57421875, "logps/rejected": -550.803466796875, "loss": 0.4995, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3558123111724854, "rewards/margins": 1.194657802581787, "rewards/rejected": -2.5504701137542725, "step": 3570 }, { "epoch": 0.8589251439539347, "grad_norm": 48.13040334135157, "learning_rate": 2.9708117008348576e-08, "logits/chosen": 0.31328874826431274, "logits/rejected": 0.3502875864505768, "logps/chosen": -517.9609985351562, "logps/rejected": -542.474365234375, "loss": 0.4897, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4639475345611572, "rewards/margins": 0.7750081419944763, "rewards/rejected": -2.2389559745788574, "step": 3580 }, { "epoch": 0.8613243761996161, "grad_norm": 53.220249607806664, "learning_rate": 2.8725880143264992e-08, "logits/chosen": 0.21370474994182587, "logits/rejected": 0.17975714802742004, "logps/chosen": -469.7068786621094, "logps/rejected": -589.1580200195312, "loss": 0.5243, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5573484897613525, "rewards/margins": 0.7886762022972107, "rewards/rejected": -2.346024513244629, "step": 3590 }, { "epoch": 0.8637236084452975, "grad_norm": 67.90288894206734, "learning_rate": 2.775916477857948e-08, "logits/chosen": 0.25214099884033203, "logits/rejected": 0.19312720000743866, "logps/chosen": -414.92059326171875, "logps/rejected": -506.6708984375, "loss": 0.4783, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.330328345298767, "rewards/margins": 0.9338465929031372, "rewards/rejected": -2.2641749382019043, "step": 3600 }, { "epoch": 0.8661228406909789, "grad_norm": 59.11695302836589, "learning_rate": 2.680803872553408e-08, "logits/chosen": 0.2528062164783478, "logits/rejected": 0.17121002078056335, "logps/chosen": -428.0210876464844, "logps/rejected": -563.393310546875, "loss": 0.4888, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2556774616241455, "rewards/margins": 1.5917272567749023, "rewards/rejected": -2.8474044799804688, "step": 3610 }, { "epoch": 0.8685220729366603, "grad_norm": 59.52967993062111, "learning_rate": 2.5872568701842706e-08, "logits/chosen": 0.32945194840431213, "logits/rejected": 0.2652639150619507, "logps/chosen": -392.31329345703125, "logps/rejected": -495.3326721191406, "loss": 0.539, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3325417041778564, "rewards/margins": 0.8470155000686646, "rewards/rejected": -2.1795573234558105, "step": 3620 }, { "epoch": 0.8709213051823417, "grad_norm": 53.81677624528546, "learning_rate": 2.495282032701096e-08, "logits/chosen": 0.15500156581401825, "logits/rejected": 0.2495473325252533, "logps/chosen": -334.62774658203125, "logps/rejected": -434.7308044433594, "loss": 0.5105, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1157186031341553, "rewards/margins": 1.1813395023345947, "rewards/rejected": -2.29705810546875, "step": 3630 }, { "epoch": 0.8733205374280231, "grad_norm": 70.3049018186209, "learning_rate": 2.4048858117733133e-08, "logits/chosen": 0.16910839080810547, "logits/rejected": 0.169979065656662, "logps/chosen": -436.8203125, "logps/rejected": -540.2689819335938, "loss": 0.4643, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.1331539154052734, "rewards/margins": 1.6183888912200928, "rewards/rejected": -2.751542568206787, "step": 3640 }, { "epoch": 0.8757197696737045, "grad_norm": 49.87789467243074, "learning_rate": 2.3160745483366938e-08, "logits/chosen": 0.23682577908039093, "logits/rejected": 0.1723048985004425, "logps/chosen": -431.7490234375, "logps/rejected": -562.2601318359375, "loss": 0.4639, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3692805767059326, "rewards/margins": 1.0158613920211792, "rewards/rejected": -2.3851418495178223, "step": 3650 }, { "epoch": 0.8781190019193857, "grad_norm": 47.658615941206975, "learning_rate": 2.2288544721485197e-08, "logits/chosen": 0.14381949603557587, "logits/rejected": 0.03533410280942917, "logps/chosen": -387.8703918457031, "logps/rejected": -524.4107666015625, "loss": 0.4813, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.0450143814086914, "rewards/margins": 1.2733433246612549, "rewards/rejected": -2.3183577060699463, "step": 3660 }, { "epoch": 0.8805182341650671, "grad_norm": 45.82488725036134, "learning_rate": 2.1432317013506117e-08, "logits/chosen": 0.10933347791433334, "logits/rejected": 0.12345802783966064, "logps/chosen": -458.113037109375, "logps/rejected": -490.5302734375, "loss": 0.5397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4664695262908936, "rewards/margins": 0.8134799003601074, "rewards/rejected": -2.27994966506958, "step": 3670 }, { "epoch": 0.8829174664107485, "grad_norm": 53.31879445002021, "learning_rate": 2.0592122420401704e-08, "logits/chosen": 0.22227077186107635, "logits/rejected": 0.24705934524536133, "logps/chosen": -430.19537353515625, "logps/rejected": -503.33050537109375, "loss": 0.4986, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5509039163589478, "rewards/margins": 0.648668646812439, "rewards/rejected": -2.199572801589966, "step": 3680 }, { "epoch": 0.8853166986564299, "grad_norm": 42.381048234129516, "learning_rate": 1.976801987848459e-08, "logits/chosen": 0.2069139927625656, "logits/rejected": 0.16672655940055847, "logps/chosen": -472.5472106933594, "logps/rejected": -602.9320068359375, "loss": 0.4888, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4228280782699585, "rewards/margins": 1.2275440692901611, "rewards/rejected": -2.65037202835083, "step": 3690 }, { "epoch": 0.8877159309021113, "grad_norm": 53.40883833426912, "learning_rate": 1.8960067195273987e-08, "logits/chosen": 0.22911398112773895, "logits/rejected": 0.21664564311504364, "logps/chosen": -400.25030517578125, "logps/rejected": -505.628173828125, "loss": 0.5018, "rewards/accuracies": 0.875, "rewards/chosen": -1.1031681299209595, "rewards/margins": 1.1834853887557983, "rewards/rejected": -2.286653757095337, "step": 3700 }, { "epoch": 0.8901151631477927, "grad_norm": 41.251861300500764, "learning_rate": 1.816832104544072e-08, "logits/chosen": 0.30456072092056274, "logits/rejected": 0.29536372423171997, "logps/chosen": -486.6631774902344, "logps/rejected": -542.3421630859375, "loss": 0.4891, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.6421973705291748, "rewards/margins": 0.8107506632804871, "rewards/rejected": -2.4529478549957275, "step": 3710 }, { "epoch": 0.8925143953934741, "grad_norm": 39.96769073144664, "learning_rate": 1.7392836966831553e-08, "logits/chosen": 0.20969875156879425, "logits/rejected": 0.1758739948272705, "logps/chosen": -437.6871643066406, "logps/rejected": -546.804443359375, "loss": 0.4412, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -1.2321717739105225, "rewards/margins": 1.4391019344329834, "rewards/rejected": -2.671273708343506, "step": 3720 }, { "epoch": 0.8949136276391555, "grad_norm": 49.70645527143697, "learning_rate": 1.663366935657373e-08, "logits/chosen": 0.2884444296360016, "logits/rejected": 0.3402741551399231, "logps/chosen": -414.3851623535156, "logps/rejected": -516.3787841796875, "loss": 0.5216, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3819457292556763, "rewards/margins": 0.948479950428009, "rewards/rejected": -2.33042573928833, "step": 3730 }, { "epoch": 0.8973128598848369, "grad_norm": 77.94886524477812, "learning_rate": 1.5890871467258898e-08, "logits/chosen": 0.19290375709533691, "logits/rejected": 0.21824567019939423, "logps/chosen": -533.5081787109375, "logps/rejected": -581.3971557617188, "loss": 0.4969, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4534004926681519, "rewards/margins": 0.8920512199401855, "rewards/rejected": -2.345451831817627, "step": 3740 }, { "epoch": 0.8997120921305183, "grad_norm": 41.01203397728858, "learning_rate": 1.5164495403207967e-08, "logits/chosen": 0.1695217341184616, "logits/rejected": 0.035564176738262177, "logps/chosen": -487.5433654785156, "logps/rejected": -645.5303344726562, "loss": 0.4676, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4484670162200928, "rewards/margins": 1.3183784484863281, "rewards/rejected": -2.766845464706421, "step": 3750 }, { "epoch": 0.9021113243761996, "grad_norm": 40.21247020861889, "learning_rate": 1.4454592116815962e-08, "logits/chosen": 0.2717417776584625, "logits/rejected": 0.2026948183774948, "logps/chosen": -468.3108825683594, "logps/rejected": -587.5950927734375, "loss": 0.4599, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3242508172988892, "rewards/margins": 1.0534656047821045, "rewards/rejected": -2.3777167797088623, "step": 3760 }, { "epoch": 0.904510556621881, "grad_norm": 36.5293014274636, "learning_rate": 1.3761211404977934e-08, "logits/chosen": 0.21695688366889954, "logits/rejected": 0.18997912108898163, "logps/chosen": -414.2664489746094, "logps/rejected": -547.6370849609375, "loss": 0.4415, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.205294132232666, "rewards/margins": 1.4340205192565918, "rewards/rejected": -2.639314651489258, "step": 3770 }, { "epoch": 0.9069097888675623, "grad_norm": 57.20162040882379, "learning_rate": 1.3084401905596177e-08, "logits/chosen": 0.12880149483680725, "logits/rejected": 0.14128455519676208, "logps/chosen": -481.64605712890625, "logps/rejected": -535.501708984375, "loss": 0.4888, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1130110025405884, "rewards/margins": 1.1385244131088257, "rewards/rejected": -2.251535654067993, "step": 3780 }, { "epoch": 0.9093090211132437, "grad_norm": 45.23521207048333, "learning_rate": 1.2424211094168053e-08, "logits/chosen": 0.3405439257621765, "logits/rejected": 0.3810498118400574, "logps/chosen": -528.0676879882812, "logps/rejected": -599.46044921875, "loss": 0.4764, "rewards/accuracies": 0.75, "rewards/chosen": -1.4664819240570068, "rewards/margins": 0.8699405789375305, "rewards/rejected": -2.3364224433898926, "step": 3790 }, { "epoch": 0.9117082533589251, "grad_norm": 42.750810945395436, "learning_rate": 1.1780685280456143e-08, "logits/chosen": 0.22092266380786896, "logits/rejected": 0.1667570322751999, "logps/chosen": -535.6447143554688, "logps/rejected": -663.0042724609375, "loss": 0.5469, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7906440496444702, "rewards/margins": 1.2009087800979614, "rewards/rejected": -2.9915525913238525, "step": 3800 }, { "epoch": 0.9141074856046065, "grad_norm": 45.02882150214674, "learning_rate": 1.1153869605239564e-08, "logits/chosen": 0.3357655704021454, "logits/rejected": 0.39680781960487366, "logps/chosen": -468.96661376953125, "logps/rejected": -499.510009765625, "loss": 0.4881, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.1827433109283447, "rewards/margins": 0.8699227571487427, "rewards/rejected": -2.052665948867798, "step": 3810 }, { "epoch": 0.9165067178502879, "grad_norm": 53.835458478805826, "learning_rate": 1.0543808037147606e-08, "logits/chosen": 0.19844678044319153, "logits/rejected": 0.09387796372175217, "logps/chosen": -430.8998107910156, "logps/rejected": -596.3431396484375, "loss": 0.4637, "rewards/accuracies": 0.875, "rewards/chosen": -1.182877779006958, "rewards/margins": 1.630902886390686, "rewards/rejected": -2.8137805461883545, "step": 3820 }, { "epoch": 0.9189059500959693, "grad_norm": 45.731621829576106, "learning_rate": 9.95054336957557e-09, "logits/chosen": 0.20105035603046417, "logits/rejected": 0.12556061148643494, "logps/chosen": -441.2509765625, "logps/rejected": -534.109375, "loss": 0.4648, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1896703243255615, "rewards/margins": 0.8572039604187012, "rewards/rejected": -2.046874523162842, "step": 3830 }, { "epoch": 0.9213051823416507, "grad_norm": 59.067862218302, "learning_rate": 9.37411721768286e-09, "logits/chosen": 0.39653897285461426, "logits/rejected": 0.27279889583587646, "logps/chosen": -486.5269470214844, "logps/rejected": -648.8412475585938, "loss": 0.46, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6371396780014038, "rewards/margins": 1.1990723609924316, "rewards/rejected": -2.836211919784546, "step": 3840 }, { "epoch": 0.9237044145873321, "grad_norm": 47.43074874048961, "learning_rate": 8.81457001547392e-09, "logits/chosen": 0.2673342823982239, "logits/rejected": 0.2015964239835739, "logps/chosen": -492.92254638671875, "logps/rejected": -605.4844970703125, "loss": 0.4934, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.6292178630828857, "rewards/margins": 1.0606807470321655, "rewards/rejected": -2.689898729324341, "step": 3850 }, { "epoch": 0.9261036468330134, "grad_norm": 38.22467054106717, "learning_rate": 8.271941012961942e-09, "logits/chosen": 0.35539960861206055, "logits/rejected": 0.2722089886665344, "logps/chosen": -419.780517578125, "logps/rejected": -596.482666015625, "loss": 0.4606, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4746735095977783, "rewards/margins": 1.2210102081298828, "rewards/rejected": -2.695683717727661, "step": 3860 }, { "epoch": 0.9285028790786948, "grad_norm": 50.37493253511501, "learning_rate": 7.746268273415568e-09, "logits/chosen": 0.3808482885360718, "logits/rejected": 0.2647871673107147, "logps/chosen": -485.0458984375, "logps/rejected": -578.0924072265625, "loss": 0.4923, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.5579640865325928, "rewards/margins": 0.5145239233970642, "rewards/rejected": -2.0724880695343018, "step": 3870 }, { "epoch": 0.9309021113243762, "grad_norm": 48.99088949948664, "learning_rate": 7.237588670689076e-09, "logits/chosen": 0.08190400898456573, "logits/rejected": 0.12344332039356232, "logps/chosen": -428.4112243652344, "logps/rejected": -517.3125610351562, "loss": 0.4641, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.215342402458191, "rewards/margins": 1.1562786102294922, "rewards/rejected": -2.3716208934783936, "step": 3880 }, { "epoch": 0.9333013435700576, "grad_norm": 44.444359990708264, "learning_rate": 6.745937886635606e-09, "logits/chosen": 0.22676298022270203, "logits/rejected": 0.14976339042186737, "logps/chosen": -487.2351989746094, "logps/rejected": -613.9521484375, "loss": 0.465, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.407539963722229, "rewards/margins": 1.2937225103378296, "rewards/rejected": -2.7012624740600586, "step": 3890 }, { "epoch": 0.935700575815739, "grad_norm": 40.06142700499872, "learning_rate": 6.271350408604409e-09, "logits/chosen": 0.2837770879268646, "logits/rejected": 0.2296113520860672, "logps/chosen": -382.27227783203125, "logps/rejected": -569.7482299804688, "loss": 0.4645, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.0186858177185059, "rewards/margins": 1.615103006362915, "rewards/rejected": -2.633788585662842, "step": 3900 }, { "epoch": 0.9380998080614203, "grad_norm": 73.29538152762231, "learning_rate": 5.813859527021487e-09, "logits/chosen": 0.35343560576438904, "logits/rejected": 0.2977786660194397, "logps/chosen": -445.3648376464844, "logps/rejected": -555.0151977539062, "loss": 0.4829, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2847732305526733, "rewards/margins": 1.3373976945877075, "rewards/rejected": -2.62217116355896, "step": 3910 }, { "epoch": 0.9404990403071017, "grad_norm": 55.32984913756992, "learning_rate": 5.373497333054616e-09, "logits/chosen": 0.2757224440574646, "logits/rejected": 0.27316632866859436, "logps/chosen": -503.87371826171875, "logps/rejected": -564.5245361328125, "loss": 0.515, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.4553347826004028, "rewards/margins": 0.799089252948761, "rewards/rejected": -2.2544240951538086, "step": 3920 }, { "epoch": 0.9428982725527831, "grad_norm": 45.13547537051501, "learning_rate": 4.950294716362213e-09, "logits/chosen": 0.2402069866657257, "logits/rejected": 0.2745649814605713, "logps/chosen": -531.6744995117188, "logps/rejected": -642.5697021484375, "loss": 0.4806, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5781127214431763, "rewards/margins": 1.1812816858291626, "rewards/rejected": -2.7593941688537598, "step": 3930 }, { "epoch": 0.9452975047984645, "grad_norm": 41.45524037338652, "learning_rate": 4.544281362926422e-09, "logits/chosen": 0.1885417103767395, "logits/rejected": 0.1404399871826172, "logps/chosen": -493.8916931152344, "logps/rejected": -607.7820434570312, "loss": 0.4847, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.2237544059753418, "rewards/margins": 1.2443654537200928, "rewards/rejected": -2.4681198596954346, "step": 3940 }, { "epoch": 0.9476967370441459, "grad_norm": 41.32543731890712, "learning_rate": 4.15548575297095e-09, "logits/chosen": 0.13838523626327515, "logits/rejected": 0.12014584243297577, "logps/chosen": -423.9912109375, "logps/rejected": -555.3060302734375, "loss": 0.4492, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3068205118179321, "rewards/margins": 1.4025375843048096, "rewards/rejected": -2.709357976913452, "step": 3950 }, { "epoch": 0.9500959692898272, "grad_norm": 38.663387459727744, "learning_rate": 3.7839351589631366e-09, "logits/chosen": 0.20229902863502502, "logits/rejected": 0.06122536584734917, "logps/chosen": -423.82379150390625, "logps/rejected": -579.1092529296875, "loss": 0.4703, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3485114574432373, "rewards/margins": 0.9370707273483276, "rewards/rejected": -2.2855823040008545, "step": 3960 }, { "epoch": 0.9524952015355086, "grad_norm": 55.20292972374471, "learning_rate": 3.4296556437010405e-09, "logits/chosen": 0.20910441875457764, "logits/rejected": 0.18343612551689148, "logps/chosen": -397.52239990234375, "logps/rejected": -482.45513916015625, "loss": 0.4914, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3418649435043335, "rewards/margins": 0.8949347734451294, "rewards/rejected": -2.236799716949463, "step": 3970 }, { "epoch": 0.95489443378119, "grad_norm": 52.42663168427878, "learning_rate": 3.092672058485124e-09, "logits/chosen": 0.2784040868282318, "logits/rejected": 0.22552700340747833, "logps/chosen": -437.07122802734375, "logps/rejected": -581.2984008789062, "loss": 0.5278, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.39797043800354, "rewards/margins": 1.417875051498413, "rewards/rejected": -2.815845489501953, "step": 3980 }, { "epoch": 0.9572936660268714, "grad_norm": 51.730419941201816, "learning_rate": 2.7730080413750356e-09, "logits/chosen": 0.3203295171260834, "logits/rejected": 0.33414626121520996, "logps/chosen": -470.68963623046875, "logps/rejected": -590.4835815429688, "loss": 0.4926, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3279026746749878, "rewards/margins": 1.245241641998291, "rewards/rejected": -2.5731444358825684, "step": 3990 }, { "epoch": 0.9596928982725528, "grad_norm": 52.110269896703294, "learning_rate": 2.4706860155316033e-09, "logits/chosen": 0.2101161777973175, "logits/rejected": 0.23008927702903748, "logps/chosen": -545.7728271484375, "logps/rejected": -637.5755615234375, "loss": 0.4895, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4393521547317505, "rewards/margins": 0.8712779879570007, "rewards/rejected": -2.3106300830841064, "step": 4000 }, { "epoch": 0.9596928982725528, "eval_logits/chosen": 0.38506969809532166, "eval_logits/rejected": 0.3408171534538269, "eval_logps/chosen": -459.0677185058594, "eval_logps/rejected": -584.910400390625, "eval_loss": 0.4760858714580536, "eval_rewards/accuracies": 0.7982142567634583, "eval_rewards/chosen": -1.4039554595947266, "eval_rewards/margins": 1.1972852945327759, "eval_rewards/rejected": -2.601240873336792, "eval_runtime": 172.2382, "eval_samples_per_second": 25.9, "eval_steps_per_second": 0.406, "step": 4000 }, { "epoch": 0.9620921305182342, "grad_norm": 48.027804731217394, "learning_rate": 2.185727187643843e-09, "logits/chosen": 0.17230884730815887, "logits/rejected": 0.11436843872070312, "logps/chosen": -407.50506591796875, "logps/rejected": -561.4027709960938, "loss": 0.5232, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3693194389343262, "rewards/margins": 1.4761823415756226, "rewards/rejected": -2.8455021381378174, "step": 4010 }, { "epoch": 0.9644913627639156, "grad_norm": 57.935705040777876, "learning_rate": 1.9181515464413434e-09, "logits/chosen": 0.15140806138515472, "logits/rejected": 0.09838312864303589, "logps/chosen": -575.939208984375, "logps/rejected": -703.3623657226562, "loss": 0.4954, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.2815978527069092, "rewards/margins": 1.3098185062408447, "rewards/rejected": -2.591416597366333, "step": 4020 }, { "epoch": 0.966890595009597, "grad_norm": 40.72962697033489, "learning_rate": 1.6679778612923302e-09, "logits/chosen": 0.21621087193489075, "logits/rejected": 0.2728949785232544, "logps/chosen": -515.5426635742188, "logps/rejected": -592.4903564453125, "loss": 0.4629, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.5008609294891357, "rewards/margins": 0.7948905229568481, "rewards/rejected": -2.2957513332366943, "step": 4030 }, { "epoch": 0.9692898272552783, "grad_norm": 54.67990587779175, "learning_rate": 1.43522368088686e-09, "logits/chosen": 0.29817652702331543, "logits/rejected": 0.22439947724342346, "logps/chosen": -469.2783203125, "logps/rejected": -633.0770263671875, "loss": 0.5304, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5123900175094604, "rewards/margins": 1.635197401046753, "rewards/rejected": -3.147587299346924, "step": 4040 }, { "epoch": 0.9716890595009597, "grad_norm": 70.3894278582445, "learning_rate": 1.2199053320059993e-09, "logits/chosen": 0.3103833794593811, "logits/rejected": 0.2175188809633255, "logps/chosen": -478.85443115234375, "logps/rejected": -599.4166259765625, "loss": 0.4973, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4971873760223389, "rewards/margins": 1.086474061012268, "rewards/rejected": -2.5836615562438965, "step": 4050 }, { "epoch": 0.974088291746641, "grad_norm": 45.89117778001179, "learning_rate": 1.0220379183764338e-09, "logits/chosen": 0.1872117817401886, "logits/rejected": 0.14962831139564514, "logps/chosen": -379.21612548828125, "logps/rejected": -526.4472045898438, "loss": 0.4727, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1573994159698486, "rewards/margins": 1.4220101833343506, "rewards/rejected": -2.5794098377227783, "step": 4060 }, { "epoch": 0.9764875239923224, "grad_norm": 42.58928630808853, "learning_rate": 8.416353196111503e-10, "logits/chosen": 0.4299827218055725, "logits/rejected": 0.3653213679790497, "logps/chosen": -455.606689453125, "logps/rejected": -537.0999755859375, "loss": 0.5395, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.646712064743042, "rewards/margins": 0.9391171336174011, "rewards/rejected": -2.585829257965088, "step": 4070 }, { "epoch": 0.9788867562380038, "grad_norm": 53.46584271337103, "learning_rate": 6.787101902356873e-10, "logits/chosen": 0.3689078986644745, "logits/rejected": 0.34390968084335327, "logps/chosen": -460.3603515625, "logps/rejected": -581.0008544921875, "loss": 0.461, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3799049854278564, "rewards/margins": 0.9097992181777954, "rewards/rejected": -2.2897043228149414, "step": 4080 }, { "epoch": 0.9812859884836852, "grad_norm": 61.26328575901746, "learning_rate": 5.332739588005953e-10, "logits/chosen": 0.1865283101797104, "logits/rejected": 0.08630210161209106, "logps/chosen": -390.46929931640625, "logps/rejected": -543.6526489257812, "loss": 0.4761, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.2067457437515259, "rewards/margins": 1.2890859842300415, "rewards/rejected": -2.4958317279815674, "step": 4090 }, { "epoch": 0.9836852207293666, "grad_norm": 49.7010925580212, "learning_rate": 4.053368270797164e-10, "logits/chosen": 0.34013232588768005, "logits/rejected": 0.23841337859630585, "logps/chosen": -435.8818359375, "logps/rejected": -555.6300659179688, "loss": 0.452, "rewards/accuracies": 0.75, "rewards/chosen": -1.4993178844451904, "rewards/margins": 1.1681034564971924, "rewards/rejected": -2.6674208641052246, "step": 4100 }, { "epoch": 0.986084452975048, "grad_norm": 39.60831889767418, "learning_rate": 2.949077693545354e-10, "logits/chosen": 0.3429808020591736, "logits/rejected": 0.28340935707092285, "logps/chosen": -493.12799072265625, "logps/rejected": -603.4589233398438, "loss": 0.5203, "rewards/accuracies": 0.625, "rewards/chosen": -1.5291836261749268, "rewards/margins": 0.7797685861587524, "rewards/rejected": -2.3089520931243896, "step": 4110 }, { "epoch": 0.9884836852207294, "grad_norm": 48.86243638343189, "learning_rate": 2.0199453178471047e-10, "logits/chosen": 0.2578023374080658, "logits/rejected": 0.28469234704971313, "logps/chosen": -521.9082641601562, "logps/rejected": -584.6770629882812, "loss": 0.4724, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -1.4713417291641235, "rewards/margins": 1.0106664896011353, "rewards/rejected": -2.482008457183838, "step": 4120 }, { "epoch": 0.9908829174664108, "grad_norm": 40.74851597282627, "learning_rate": 1.266036318647301e-10, "logits/chosen": 0.24952539801597595, "logits/rejected": 0.21447113156318665, "logps/chosen": -515.4271240234375, "logps/rejected": -623.4884033203125, "loss": 0.4614, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3878891468048096, "rewards/margins": 1.3714239597320557, "rewards/rejected": -2.7593131065368652, "step": 4130 }, { "epoch": 0.9932821497120922, "grad_norm": 59.7391917924102, "learning_rate": 6.874035796672339e-11, "logits/chosen": 0.20685334503650665, "logits/rejected": 0.19621731340885162, "logps/chosen": -468.5065002441406, "logps/rejected": -590.8399047851562, "loss": 0.511, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1164958477020264, "rewards/margins": 1.485654592514038, "rewards/rejected": -2.6021504402160645, "step": 4140 }, { "epoch": 0.9956813819577736, "grad_norm": 64.53279604006218, "learning_rate": 2.8408768969423458e-11, "logits/chosen": 0.16596756875514984, "logits/rejected": 0.11380906403064728, "logps/chosen": -485.75006103515625, "logps/rejected": -593.44287109375, "loss": 0.4651, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3494365215301514, "rewards/margins": 1.0383247137069702, "rewards/rejected": -2.387761354446411, "step": 4150 }, { "epoch": 0.9980806142034548, "grad_norm": 64.97328454417662, "learning_rate": 5.611693973617271e-12, "logits/chosen": 0.3674852252006531, "logits/rejected": 0.3302612006664276, "logps/chosen": -414.0726623535156, "logps/rejected": -535.21337890625, "loss": 0.5188, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.3726593255996704, "rewards/margins": 0.9844037890434265, "rewards/rejected": -2.357063055038452, "step": 4160 }, { "epoch": 1.0, "step": 4168, "total_flos": 0.0, "train_loss": 0.5273771832863338, "train_runtime": 14157.4064, "train_samples_per_second": 9.42, "train_steps_per_second": 0.294 } ], "logging_steps": 10, "max_steps": 4168, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 5000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }