{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999183606825047, "eval_steps": 100, "global_step": 1531, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000653114539962446, "grad_norm": 6.562415939922599, "learning_rate": 1.2987012987012988e-09, "logits/chosen": -0.8478949069976807, "logits/rejected": -0.799842894077301, "logps/chosen": -449.73687744140625, "logps/rejected": -466.4884338378906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.001306229079924892, "grad_norm": 6.748473020918118, "learning_rate": 2.5974025974025976e-09, "logits/chosen": -0.8114207983016968, "logits/rejected": -0.803877592086792, "logps/chosen": -473.9585876464844, "logps/rejected": -491.62945556640625, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0019593436198873378, "grad_norm": 7.844471184175687, "learning_rate": 3.8961038961038956e-09, "logits/chosen": -0.7223482728004456, "logits/rejected": -0.7460772395133972, "logps/chosen": -452.0518798828125, "logps/rejected": -520.4376831054688, "loss": 0.6924, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0006803108262829483, "rewards/margins": 0.000374476658180356, "rewards/rejected": -0.0010547875426709652, "step": 3 }, { "epoch": 0.002612458159849784, "grad_norm": 6.158505172940123, "learning_rate": 5.194805194805195e-09, "logits/chosen": -0.7972275614738464, "logits/rejected": -0.7775567770004272, "logps/chosen": -464.388916015625, "logps/rejected": -450.9453125, "loss": 0.6932, "rewards/accuracies": 0.5625, "rewards/chosen": 0.002797861350700259, "rewards/margins": 0.0031175969634205103, "rewards/rejected": -0.000319735670927912, "step": 4 }, { "epoch": 0.0032655726998122294, "grad_norm": 5.924997097764666, "learning_rate": 6.493506493506492e-09, "logits/chosen": -0.7506823539733887, "logits/rejected": -0.7407633066177368, "logps/chosen": -448.05523681640625, "logps/rejected": -451.1369934082031, "loss": 0.6933, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0005750393029302359, "rewards/margins": 0.0015734098851680756, "rewards/rejected": -0.0009983705822378397, "step": 5 }, { "epoch": 0.0039186872397746755, "grad_norm": 7.131146828068107, "learning_rate": 7.792207792207791e-09, "logits/chosen": -0.7589026689529419, "logits/rejected": -0.7306044101715088, "logps/chosen": -467.2422790527344, "logps/rejected": -473.4179382324219, "loss": 0.6934, "rewards/accuracies": 0.5, "rewards/chosen": 0.0016507150139659643, "rewards/margins": -0.0006250384030863643, "rewards/rejected": 0.0022757528349757195, "step": 6 }, { "epoch": 0.0045718017797371216, "grad_norm": 6.131111671725116, "learning_rate": 9.09090909090909e-09, "logits/chosen": -0.8041960597038269, "logits/rejected": -0.774326503276825, "logps/chosen": -538.4379272460938, "logps/rejected": -494.1611022949219, "loss": 0.6936, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0003780794213525951, "rewards/margins": 0.00037171837175264955, "rewards/rejected": -0.0007497979095205665, "step": 7 }, { "epoch": 0.005224916319699568, "grad_norm": 6.367194577822853, "learning_rate": 1.038961038961039e-08, "logits/chosen": -0.7674342393875122, "logits/rejected": -0.7461612820625305, "logps/chosen": -502.4452819824219, "logps/rejected": -518.5669555664062, "loss": 0.6934, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0011391757288947701, "rewards/margins": 0.0007867790409363806, "rewards/rejected": -0.0019259547116234899, "step": 8 }, { "epoch": 0.005878030859662013, "grad_norm": 6.463891351458318, "learning_rate": 1.1688311688311687e-08, "logits/chosen": -0.850688636302948, "logits/rejected": -0.8408986926078796, "logps/chosen": -448.7418212890625, "logps/rejected": -451.31463623046875, "loss": 0.6933, "rewards/accuracies": 0.65625, "rewards/chosen": 0.002334318123757839, "rewards/margins": 0.002614659955725074, "rewards/rejected": -0.0002803421812132001, "step": 9 }, { "epoch": 0.006531145399624459, "grad_norm": 6.550118208844482, "learning_rate": 1.2987012987012985e-08, "logits/chosen": -0.8453483581542969, "logits/rejected": -0.8150793313980103, "logps/chosen": -507.4414367675781, "logps/rejected": -457.73309326171875, "loss": 0.6932, "rewards/accuracies": 0.53125, "rewards/chosen": 0.002283277688547969, "rewards/margins": 0.00012453558156266809, "rewards/rejected": 0.0021587416995316744, "step": 10 }, { "epoch": 0.007184259939586905, "grad_norm": 6.40311214722953, "learning_rate": 1.4285714285714284e-08, "logits/chosen": -0.5852610468864441, "logits/rejected": -0.5376417636871338, "logps/chosen": -440.3765869140625, "logps/rejected": -440.29327392578125, "loss": 0.693, "rewards/accuracies": 0.40625, "rewards/chosen": 0.00031277656671591103, "rewards/margins": -0.00043199292849749327, "rewards/rejected": 0.0007447696407325566, "step": 11 }, { "epoch": 0.007837374479549351, "grad_norm": 6.088349060239392, "learning_rate": 1.5584415584415582e-08, "logits/chosen": -0.8110294938087463, "logits/rejected": -0.7939715385437012, "logps/chosen": -440.26983642578125, "logps/rejected": -439.900390625, "loss": 0.6924, "rewards/accuracies": 0.59375, "rewards/chosen": 0.00481039984151721, "rewards/margins": 0.0019957683980464935, "rewards/rejected": 0.00281463167630136, "step": 12 }, { "epoch": 0.008490489019511797, "grad_norm": 5.861510809151376, "learning_rate": 1.6883116883116882e-08, "logits/chosen": -0.9300619959831238, "logits/rejected": -0.9246059060096741, "logps/chosen": -537.093017578125, "logps/rejected": -536.7426147460938, "loss": 0.6924, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0025931550189852715, "rewards/margins": 0.0013525104150176048, "rewards/rejected": -0.003945665434002876, "step": 13 }, { "epoch": 0.009143603559474243, "grad_norm": 6.593136523571926, "learning_rate": 1.818181818181818e-08, "logits/chosen": -0.8136120438575745, "logits/rejected": -0.8396366834640503, "logps/chosen": -511.1154479980469, "logps/rejected": -519.767578125, "loss": 0.694, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0005540727870538831, "rewards/margins": -0.002040934283286333, "rewards/rejected": 0.0025950074195861816, "step": 14 }, { "epoch": 0.00979671809943669, "grad_norm": 5.9190226098756344, "learning_rate": 1.948051948051948e-08, "logits/chosen": -0.8912380933761597, "logits/rejected": -0.8504163026809692, "logps/chosen": -480.95562744140625, "logps/rejected": -442.54010009765625, "loss": 0.6938, "rewards/accuracies": 0.34375, "rewards/chosen": -0.001571483677253127, "rewards/margins": -0.0010143854888156056, "rewards/rejected": -0.0005570981884375215, "step": 15 }, { "epoch": 0.010449832639399135, "grad_norm": 6.038795223633495, "learning_rate": 2.077922077922078e-08, "logits/chosen": -0.897087812423706, "logits/rejected": -0.8558621406555176, "logps/chosen": -535.7008666992188, "logps/rejected": -501.1187438964844, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": -0.0005735683371312916, "rewards/margins": 0.00031663902336731553, "rewards/rejected": -0.0008902073604986072, "step": 16 }, { "epoch": 0.011102947179361581, "grad_norm": 5.647231054712397, "learning_rate": 2.2077922077922077e-08, "logits/chosen": -0.7808964252471924, "logits/rejected": -0.7509868144989014, "logps/chosen": -404.5117492675781, "logps/rejected": -420.18280029296875, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.00039049627957865596, "rewards/margins": -0.00038562045665457845, "rewards/rejected": 0.0007761167944408953, "step": 17 }, { "epoch": 0.011756061719324026, "grad_norm": 6.240516998552548, "learning_rate": 2.3376623376623374e-08, "logits/chosen": -0.9235209822654724, "logits/rejected": -0.8612209558486938, "logps/chosen": -483.8492431640625, "logps/rejected": -480.3951721191406, "loss": 0.6934, "rewards/accuracies": 0.625, "rewards/chosen": 0.0007323266472667456, "rewards/margins": 0.0006075193523429334, "rewards/rejected": 0.00012480735313147306, "step": 18 }, { "epoch": 0.012409176259286472, "grad_norm": 6.121429252846795, "learning_rate": 2.4675324675324673e-08, "logits/chosen": -0.8684954643249512, "logits/rejected": -0.7898334264755249, "logps/chosen": -421.7978210449219, "logps/rejected": -404.7839660644531, "loss": 0.6934, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0008550119819119573, "rewards/margins": 0.000968189153354615, "rewards/rejected": -0.0001131771132349968, "step": 19 }, { "epoch": 0.013062290799248918, "grad_norm": 6.339785767547404, "learning_rate": 2.597402597402597e-08, "logits/chosen": -0.8148159980773926, "logits/rejected": -0.8194824457168579, "logps/chosen": -458.73248291015625, "logps/rejected": -466.71539306640625, "loss": 0.6934, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00043350690975785255, "rewards/margins": -0.002935664728283882, "rewards/rejected": 0.002502157585695386, "step": 20 }, { "epoch": 0.013715405339211364, "grad_norm": 5.9420982708921235, "learning_rate": 2.727272727272727e-08, "logits/chosen": -0.8213762640953064, "logits/rejected": -0.7422791719436646, "logps/chosen": -494.7904357910156, "logps/rejected": -440.2748718261719, "loss": 0.6933, "rewards/accuracies": 0.3125, "rewards/chosen": -0.005373182240873575, "rewards/margins": -0.0054640937596559525, "rewards/rejected": 9.091137326322496e-05, "step": 21 }, { "epoch": 0.01436851987917381, "grad_norm": 7.466100405088569, "learning_rate": 2.857142857142857e-08, "logits/chosen": -0.7983863353729248, "logits/rejected": -0.7951775789260864, "logps/chosen": -435.16485595703125, "logps/rejected": -436.73638916015625, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": 0.0032159374095499516, "rewards/margins": 0.0017969273030757904, "rewards/rejected": 0.0014190103393048048, "step": 22 }, { "epoch": 0.015021634419136256, "grad_norm": 8.73962952417513, "learning_rate": 2.987012987012987e-08, "logits/chosen": -0.7669256925582886, "logits/rejected": -0.7493557929992676, "logps/chosen": -468.0224304199219, "logps/rejected": -496.3000793457031, "loss": 0.6927, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0022595857735723257, "rewards/margins": 0.0015146515797823668, "rewards/rejected": 0.0007449342519976199, "step": 23 }, { "epoch": 0.015674748959098702, "grad_norm": 6.272301382957476, "learning_rate": 3.1168831168831165e-08, "logits/chosen": -0.8932624459266663, "logits/rejected": -0.738869845867157, "logps/chosen": -465.3074035644531, "logps/rejected": -401.3009948730469, "loss": 0.6934, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0013363815378397703, "rewards/margins": -0.0026393865700811148, "rewards/rejected": 0.0013030050322413445, "step": 24 }, { "epoch": 0.016327863499061148, "grad_norm": 6.417594194874059, "learning_rate": 3.246753246753247e-08, "logits/chosen": -0.6995882391929626, "logits/rejected": -0.7010672688484192, "logps/chosen": -469.766845703125, "logps/rejected": -522.3395385742188, "loss": 0.6933, "rewards/accuracies": 0.375, "rewards/chosen": -0.0015485978219658136, "rewards/margins": -0.0024081014562398195, "rewards/rejected": 0.0008595038088969886, "step": 25 }, { "epoch": 0.016980978039023594, "grad_norm": 6.367600858757605, "learning_rate": 3.3766233766233764e-08, "logits/chosen": -0.9031739234924316, "logits/rejected": -0.8351707458496094, "logps/chosen": -466.1416320800781, "logps/rejected": -425.1600646972656, "loss": 0.6931, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0006817459943704307, "rewards/margins": -0.0016306330217048526, "rewards/rejected": 0.0023123789578676224, "step": 26 }, { "epoch": 0.01763409257898604, "grad_norm": 6.118336209796513, "learning_rate": 3.506493506493507e-08, "logits/chosen": -0.8085826635360718, "logits/rejected": -0.7773007750511169, "logps/chosen": -501.96881103515625, "logps/rejected": -469.5330810546875, "loss": 0.6937, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0007913708686828613, "rewards/margins": -0.0013373160036280751, "rewards/rejected": 0.0005459451349452138, "step": 27 }, { "epoch": 0.018287207118948486, "grad_norm": 8.745785820519497, "learning_rate": 3.636363636363636e-08, "logits/chosen": -0.7346066832542419, "logits/rejected": -0.7083489894866943, "logps/chosen": -427.6324768066406, "logps/rejected": -452.82135009765625, "loss": 0.6941, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0010156225180253386, "rewards/margins": -0.000818071304820478, "rewards/rejected": 0.0018336940556764603, "step": 28 }, { "epoch": 0.018940321658910932, "grad_norm": 5.868861469611495, "learning_rate": 3.766233766233766e-08, "logits/chosen": -0.8541309833526611, "logits/rejected": -0.7489965558052063, "logps/chosen": -494.1967468261719, "logps/rejected": -451.6412048339844, "loss": 0.6932, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002935814904049039, "rewards/margins": 0.0030139421578496695, "rewards/rejected": -7.812735566403717e-05, "step": 29 }, { "epoch": 0.01959343619887338, "grad_norm": 5.717710270817341, "learning_rate": 3.896103896103896e-08, "logits/chosen": -0.876485288143158, "logits/rejected": -0.8216814398765564, "logps/chosen": -465.61505126953125, "logps/rejected": -459.4190673828125, "loss": 0.6929, "rewards/accuracies": 0.5, "rewards/chosen": 0.0021222876384854317, "rewards/margins": 0.0020624399185180664, "rewards/rejected": 5.984783638268709e-05, "step": 30 }, { "epoch": 0.020246550738835824, "grad_norm": 6.281100624641187, "learning_rate": 4.025974025974026e-08, "logits/chosen": -0.8063470125198364, "logits/rejected": -0.7959135174751282, "logps/chosen": -491.56195068359375, "logps/rejected": -522.5487670898438, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0031092308927327394, "rewards/margins": 0.004373883828520775, "rewards/rejected": -0.001264653168618679, "step": 31 }, { "epoch": 0.02089966527879827, "grad_norm": 8.396501231198874, "learning_rate": 4.155844155844156e-08, "logits/chosen": -0.8137073516845703, "logits/rejected": -0.7688996195793152, "logps/chosen": -487.014892578125, "logps/rejected": -456.2672119140625, "loss": 0.6927, "rewards/accuracies": 0.625, "rewards/chosen": -2.9263464966788888e-05, "rewards/margins": 0.002564668655395508, "rewards/rejected": -0.002593932207673788, "step": 32 }, { "epoch": 0.021552779818760717, "grad_norm": 6.330099918769276, "learning_rate": 4.285714285714285e-08, "logits/chosen": -0.8605762124061584, "logits/rejected": -0.877137303352356, "logps/chosen": -533.037841796875, "logps/rejected": -569.402099609375, "loss": 0.6925, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0005539129488170147, "rewards/margins": 0.003927340265363455, "rewards/rejected": -0.0044812532141804695, "step": 33 }, { "epoch": 0.022205894358723163, "grad_norm": 6.7333044733098255, "learning_rate": 4.4155844155844154e-08, "logits/chosen": -0.8076195120811462, "logits/rejected": -0.8017356395721436, "logps/chosen": -468.20135498046875, "logps/rejected": -545.4854736328125, "loss": 0.693, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0006238793721422553, "rewards/margins": 0.001298372633755207, "rewards/rejected": -0.0019222521223127842, "step": 34 }, { "epoch": 0.02285900889868561, "grad_norm": 6.246072765738932, "learning_rate": 4.545454545454545e-08, "logits/chosen": -0.9315503835678101, "logits/rejected": -0.8312739133834839, "logps/chosen": -506.8136291503906, "logps/rejected": -494.281982421875, "loss": 0.6928, "rewards/accuracies": 0.625, "rewards/chosen": -0.0008896064246073365, "rewards/margins": 0.0015592812560498714, "rewards/rejected": -0.0024488880299031734, "step": 35 }, { "epoch": 0.02351212343864805, "grad_norm": 6.247743209477438, "learning_rate": 4.675324675324675e-08, "logits/chosen": -0.8887131810188293, "logits/rejected": -0.797776460647583, "logps/chosen": -443.1953125, "logps/rejected": -482.0925598144531, "loss": 0.6926, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00042538653360679746, "rewards/margins": 0.0009448549244552851, "rewards/rejected": -0.0013702415162697434, "step": 36 }, { "epoch": 0.024165237978610497, "grad_norm": 6.910073571698978, "learning_rate": 4.805194805194805e-08, "logits/chosen": -0.7385239601135254, "logits/rejected": -0.7428927421569824, "logps/chosen": -448.6010437011719, "logps/rejected": -474.8780517578125, "loss": 0.6936, "rewards/accuracies": 0.5, "rewards/chosen": 0.0011567330220714211, "rewards/margins": 0.0007090972503647208, "rewards/rejected": 0.00044763548066839576, "step": 37 }, { "epoch": 0.024818352518572943, "grad_norm": 6.379108387846267, "learning_rate": 4.9350649350649346e-08, "logits/chosen": -0.8199461698532104, "logits/rejected": -0.8401196002960205, "logps/chosen": -460.55938720703125, "logps/rejected": -528.7261962890625, "loss": 0.693, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0014551591593772173, "rewards/margins": -0.0002124977036146447, "rewards/rejected": -0.00124266161583364, "step": 38 }, { "epoch": 0.02547146705853539, "grad_norm": 8.352936998804818, "learning_rate": 5.064935064935064e-08, "logits/chosen": -0.8554872870445251, "logits/rejected": -0.7549471855163574, "logps/chosen": -521.4730834960938, "logps/rejected": -480.829345703125, "loss": 0.6933, "rewards/accuracies": 0.5, "rewards/chosen": -0.0004938221536576748, "rewards/margins": -0.0012401770800352097, "rewards/rejected": 0.0007463552174158394, "step": 39 }, { "epoch": 0.026124581598497836, "grad_norm": 5.6761908104871175, "learning_rate": 5.194805194805194e-08, "logits/chosen": -0.816625714302063, "logits/rejected": -0.7816446423530579, "logps/chosen": -458.86163330078125, "logps/rejected": -457.4176025390625, "loss": 0.6931, "rewards/accuracies": 0.625, "rewards/chosen": 0.000727543723769486, "rewards/margins": 0.002859849948436022, "rewards/rejected": -0.0021323063410818577, "step": 40 }, { "epoch": 0.02677769613846028, "grad_norm": 6.57427304979193, "learning_rate": 5.324675324675324e-08, "logits/chosen": -0.8133624792098999, "logits/rejected": -0.7977904677391052, "logps/chosen": -475.57403564453125, "logps/rejected": -483.3184509277344, "loss": 0.6933, "rewards/accuracies": 0.375, "rewards/chosen": -0.0008566378382965922, "rewards/margins": -0.0026502918917685747, "rewards/rejected": 0.0017936539370566607, "step": 41 }, { "epoch": 0.027430810678422728, "grad_norm": 6.268682392513088, "learning_rate": 5.454545454545454e-08, "logits/chosen": -0.9565770030021667, "logits/rejected": -0.905295193195343, "logps/chosen": -477.0942687988281, "logps/rejected": -471.25128173828125, "loss": 0.6933, "rewards/accuracies": 0.625, "rewards/chosen": -0.001213481416925788, "rewards/margins": -0.0002797842025756836, "rewards/rejected": -0.0009336970397271216, "step": 42 }, { "epoch": 0.028083925218385174, "grad_norm": 7.089768213142083, "learning_rate": 5.584415584415584e-08, "logits/chosen": -0.7968413829803467, "logits/rejected": -0.7868589162826538, "logps/chosen": -415.7538146972656, "logps/rejected": -465.5323486328125, "loss": 0.6921, "rewards/accuracies": 0.5, "rewards/chosen": 0.0018370484467595816, "rewards/margins": 0.0022343345917761326, "rewards/rejected": -0.0003972864360548556, "step": 43 }, { "epoch": 0.02873703975834762, "grad_norm": 6.217079116264922, "learning_rate": 5.714285714285714e-08, "logits/chosen": -0.8787972927093506, "logits/rejected": -0.8169955015182495, "logps/chosen": -515.1199951171875, "logps/rejected": -519.668701171875, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": -0.001179876271635294, "rewards/margins": -0.00044661754509434104, "rewards/rejected": -0.0007332589011639357, "step": 44 }, { "epoch": 0.029390154298310066, "grad_norm": 6.9137881963714936, "learning_rate": 5.8441558441558434e-08, "logits/chosen": -0.75499027967453, "logits/rejected": -0.6955289244651794, "logps/chosen": -398.29498291015625, "logps/rejected": -411.081787109375, "loss": 0.6928, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0006995415897108614, "rewards/margins": 0.0012732648756355047, "rewards/rejected": -0.001972806639969349, "step": 45 }, { "epoch": 0.030043268838272512, "grad_norm": 6.131893963737681, "learning_rate": 5.974025974025974e-08, "logits/chosen": -0.7929679155349731, "logits/rejected": -0.7688208222389221, "logps/chosen": -497.57989501953125, "logps/rejected": -497.4010925292969, "loss": 0.6938, "rewards/accuracies": 0.5, "rewards/chosen": -0.0028578471392393112, "rewards/margins": -0.0016406627837568521, "rewards/rejected": -0.0012171841226518154, "step": 46 }, { "epoch": 0.030696383378234958, "grad_norm": 6.308846265764008, "learning_rate": 6.103896103896104e-08, "logits/chosen": -0.7683753371238708, "logits/rejected": -0.7528908848762512, "logps/chosen": -499.39813232421875, "logps/rejected": -578.732666015625, "loss": 0.6915, "rewards/accuracies": 0.5625, "rewards/chosen": -0.002009766176342964, "rewards/margins": 0.002183923963457346, "rewards/rejected": -0.00419369013980031, "step": 47 }, { "epoch": 0.031349497918197404, "grad_norm": 6.225507507577421, "learning_rate": 6.233766233766233e-08, "logits/chosen": -0.891581654548645, "logits/rejected": -0.8392075896263123, "logps/chosen": -556.0016479492188, "logps/rejected": -578.8616943359375, "loss": 0.6921, "rewards/accuracies": 0.625, "rewards/chosen": -0.0008191632805392146, "rewards/margins": 0.0043422505259513855, "rewards/rejected": -0.005161413922905922, "step": 48 }, { "epoch": 0.03200261245815985, "grad_norm": 6.497895076049909, "learning_rate": 6.363636363636363e-08, "logits/chosen": -0.8468933701515198, "logits/rejected": -0.818824291229248, "logps/chosen": -434.587890625, "logps/rejected": -489.9275207519531, "loss": 0.6923, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0039938571862876415, "rewards/margins": 0.001086747506633401, "rewards/rejected": -0.005080604460090399, "step": 49 }, { "epoch": 0.032655726998122296, "grad_norm": 6.161089510077531, "learning_rate": 6.493506493506494e-08, "logits/chosen": -0.8131488561630249, "logits/rejected": -0.8177019357681274, "logps/chosen": -430.72418212890625, "logps/rejected": -458.6739501953125, "loss": 0.6931, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0010935901664197445, "rewards/margins": 0.002127976156771183, "rewards/rejected": -0.003221566788852215, "step": 50 }, { "epoch": 0.03330884153808474, "grad_norm": 6.1640122002418325, "learning_rate": 6.623376623376622e-08, "logits/chosen": -0.9181683659553528, "logits/rejected": -0.9139996767044067, "logps/chosen": -558.840087890625, "logps/rejected": -575.5423583984375, "loss": 0.6931, "rewards/accuracies": 0.4375, "rewards/chosen": -0.006191587541252375, "rewards/margins": -0.0004238271212670952, "rewards/rejected": -0.005767759867012501, "step": 51 }, { "epoch": 0.03396195607804719, "grad_norm": 7.623595975761383, "learning_rate": 6.753246753246753e-08, "logits/chosen": -0.9138858318328857, "logits/rejected": -0.8854256868362427, "logps/chosen": -449.04913330078125, "logps/rejected": -479.0080261230469, "loss": 0.6921, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0006973959389142692, "rewards/margins": 0.007185818627476692, "rewards/rejected": -0.007883214391767979, "step": 52 }, { "epoch": 0.034615070618009634, "grad_norm": 5.674823286453492, "learning_rate": 6.883116883116883e-08, "logits/chosen": -0.8322397470474243, "logits/rejected": -0.8036131858825684, "logps/chosen": -499.2374267578125, "logps/rejected": -474.3294677734375, "loss": 0.6917, "rewards/accuracies": 0.65625, "rewards/chosen": -0.002350029768422246, "rewards/margins": 0.002208204008638859, "rewards/rejected": -0.004558234475553036, "step": 53 }, { "epoch": 0.03526818515797208, "grad_norm": 6.418518626617565, "learning_rate": 7.012987012987013e-08, "logits/chosen": -0.9170102477073669, "logits/rejected": -0.9513068199157715, "logps/chosen": -508.000244140625, "logps/rejected": -505.0244140625, "loss": 0.6928, "rewards/accuracies": 0.40625, "rewards/chosen": -0.005700020585209131, "rewards/margins": -0.0018256568582728505, "rewards/rejected": -0.003874363610520959, "step": 54 }, { "epoch": 0.035921299697934526, "grad_norm": 5.982980505552759, "learning_rate": 7.142857142857142e-08, "logits/chosen": -0.8587192296981812, "logits/rejected": -0.8139776587486267, "logps/chosen": -489.4334716796875, "logps/rejected": -458.024658203125, "loss": 0.6923, "rewards/accuracies": 0.5, "rewards/chosen": -0.002181756542995572, "rewards/margins": 0.0015678240451961756, "rewards/rejected": -0.0037495801225304604, "step": 55 }, { "epoch": 0.03657441423789697, "grad_norm": 5.807776924515222, "learning_rate": 7.272727272727273e-08, "logits/chosen": -0.721636176109314, "logits/rejected": -0.6972697973251343, "logps/chosen": -414.83050537109375, "logps/rejected": -432.6552734375, "loss": 0.6926, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00701476726680994, "rewards/margins": -0.00036887641181237996, "rewards/rejected": -0.006645892281085253, "step": 56 }, { "epoch": 0.03722752877785942, "grad_norm": 6.888909832263999, "learning_rate": 7.402597402597403e-08, "logits/chosen": -0.8523138761520386, "logits/rejected": -0.8489971160888672, "logps/chosen": -450.8440246582031, "logps/rejected": -456.98492431640625, "loss": 0.6913, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0033631990663707256, "rewards/margins": -0.00023952008632477373, "rewards/rejected": -0.0031236789654940367, "step": 57 }, { "epoch": 0.037880643317821865, "grad_norm": 5.989946098741673, "learning_rate": 7.532467532467532e-08, "logits/chosen": -0.8812742233276367, "logits/rejected": -0.8634947538375854, "logps/chosen": -503.89544677734375, "logps/rejected": -511.4792175292969, "loss": 0.6928, "rewards/accuracies": 0.53125, "rewards/chosen": -0.008199996314942837, "rewards/margins": -0.0015930174849927425, "rewards/rejected": -0.006606978829950094, "step": 58 }, { "epoch": 0.03853375785778431, "grad_norm": 7.748259119189096, "learning_rate": 7.662337662337662e-08, "logits/chosen": -0.9024884104728699, "logits/rejected": -0.8909275531768799, "logps/chosen": -421.2478332519531, "logps/rejected": -468.47564697265625, "loss": 0.6918, "rewards/accuracies": 0.5, "rewards/chosen": -0.008577623404562473, "rewards/margins": 0.0016973968595266342, "rewards/rejected": -0.010275020264089108, "step": 59 }, { "epoch": 0.03918687239774676, "grad_norm": 6.109377238755764, "learning_rate": 7.792207792207792e-08, "logits/chosen": -0.8703082203865051, "logits/rejected": -0.8328035473823547, "logps/chosen": -481.5205383300781, "logps/rejected": -503.9502258300781, "loss": 0.6912, "rewards/accuracies": 0.5, "rewards/chosen": -0.004758486524224281, "rewards/margins": 0.0038297700230032206, "rewards/rejected": -0.008588257245719433, "step": 60 }, { "epoch": 0.0398399869377092, "grad_norm": 5.995806133025539, "learning_rate": 7.922077922077923e-08, "logits/chosen": -0.7556477785110474, "logits/rejected": -0.6751359701156616, "logps/chosen": -434.9151611328125, "logps/rejected": -478.38226318359375, "loss": 0.6923, "rewards/accuracies": 0.5625, "rewards/chosen": -0.008997991681098938, "rewards/margins": 0.001691188896074891, "rewards/rejected": -0.010689180344343185, "step": 61 }, { "epoch": 0.04049310147767165, "grad_norm": 6.174429823082506, "learning_rate": 8.051948051948052e-08, "logits/chosen": -0.8938629031181335, "logits/rejected": -0.8404133319854736, "logps/chosen": -423.1520690917969, "logps/rejected": -414.3715515136719, "loss": 0.6922, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0060334536246955395, "rewards/margins": -0.0003200914361514151, "rewards/rejected": -0.005713362712413073, "step": 62 }, { "epoch": 0.041146216017634095, "grad_norm": 6.807841842313529, "learning_rate": 8.181818181818182e-08, "logits/chosen": -0.8384872078895569, "logits/rejected": -0.7748404741287231, "logps/chosen": -509.7137145996094, "logps/rejected": -572.6459350585938, "loss": 0.6921, "rewards/accuracies": 0.53125, "rewards/chosen": -0.009534873999655247, "rewards/margins": 0.00488923117518425, "rewards/rejected": -0.014424105174839497, "step": 63 }, { "epoch": 0.04179933055759654, "grad_norm": 5.919031965066986, "learning_rate": 8.311688311688312e-08, "logits/chosen": -0.8469180464744568, "logits/rejected": -0.7904057502746582, "logps/chosen": -456.8668212890625, "logps/rejected": -472.22271728515625, "loss": 0.6921, "rewards/accuracies": 0.53125, "rewards/chosen": -0.007433886174112558, "rewards/margins": 0.0018512008246034384, "rewards/rejected": -0.009285087697207928, "step": 64 }, { "epoch": 0.04245244509755899, "grad_norm": 7.5278828111367, "learning_rate": 8.441558441558441e-08, "logits/chosen": -0.8526628017425537, "logits/rejected": -0.8213850259780884, "logps/chosen": -491.56292724609375, "logps/rejected": -523.5179443359375, "loss": 0.6908, "rewards/accuracies": 0.59375, "rewards/chosen": -0.009033655747771263, "rewards/margins": 0.0045729633420705795, "rewards/rejected": -0.013606620021164417, "step": 65 }, { "epoch": 0.04310555963752143, "grad_norm": 5.958334774704796, "learning_rate": 8.57142857142857e-08, "logits/chosen": -0.788672924041748, "logits/rejected": -0.7575796842575073, "logps/chosen": -404.4598083496094, "logps/rejected": -429.7571716308594, "loss": 0.6917, "rewards/accuracies": 0.4375, "rewards/chosen": -0.009336566552519798, "rewards/margins": 0.002134217880666256, "rewards/rejected": -0.01147078350186348, "step": 66 }, { "epoch": 0.04375867417748388, "grad_norm": 7.088944587354933, "learning_rate": 8.7012987012987e-08, "logits/chosen": -0.7902776002883911, "logits/rejected": -0.7541419267654419, "logps/chosen": -469.0950927734375, "logps/rejected": -450.28570556640625, "loss": 0.6904, "rewards/accuracies": 0.59375, "rewards/chosen": -0.008544647134840488, "rewards/margins": 0.002165439072996378, "rewards/rejected": -0.010710087604820728, "step": 67 }, { "epoch": 0.044411788717446325, "grad_norm": 5.263961432989652, "learning_rate": 8.831168831168831e-08, "logits/chosen": -0.821535050868988, "logits/rejected": -0.7698171138763428, "logps/chosen": -429.2215576171875, "logps/rejected": -410.6090393066406, "loss": 0.6917, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0062387725338339806, "rewards/margins": 0.004181134980171919, "rewards/rejected": -0.010419907979667187, "step": 68 }, { "epoch": 0.04506490325740877, "grad_norm": 7.083198290201972, "learning_rate": 8.96103896103896e-08, "logits/chosen": -0.8009337186813354, "logits/rejected": -0.7727637887001038, "logps/chosen": -487.6383972167969, "logps/rejected": -469.3074035644531, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": -0.010825095698237419, "rewards/margins": 0.004274988081306219, "rewards/rejected": -0.015100083313882351, "step": 69 }, { "epoch": 0.04571801779737122, "grad_norm": 7.4421814888402755, "learning_rate": 9.09090909090909e-08, "logits/chosen": -0.8108646273612976, "logits/rejected": -0.7533100843429565, "logps/chosen": -430.077880859375, "logps/rejected": -439.3136901855469, "loss": 0.6907, "rewards/accuracies": 0.65625, "rewards/chosen": -0.01343483291566372, "rewards/margins": 0.005768372677266598, "rewards/rejected": -0.019203204661607742, "step": 70 }, { "epoch": 0.046371132337333656, "grad_norm": 5.711367581988735, "learning_rate": 9.22077922077922e-08, "logits/chosen": -0.7984856367111206, "logits/rejected": -0.692218542098999, "logps/chosen": -420.7679443359375, "logps/rejected": -392.79638671875, "loss": 0.6916, "rewards/accuracies": 0.40625, "rewards/chosen": -0.01482747495174408, "rewards/margins": -0.0020556068047881126, "rewards/rejected": -0.012771867215633392, "step": 71 }, { "epoch": 0.0470242468772961, "grad_norm": 6.490728908525714, "learning_rate": 9.35064935064935e-08, "logits/chosen": -0.6769509315490723, "logits/rejected": -0.669062077999115, "logps/chosen": -467.250244140625, "logps/rejected": -452.87518310546875, "loss": 0.691, "rewards/accuracies": 0.65625, "rewards/chosen": -0.010883898474276066, "rewards/margins": 0.005101327784359455, "rewards/rejected": -0.01598522625863552, "step": 72 }, { "epoch": 0.04767736141725855, "grad_norm": 5.9084258412842425, "learning_rate": 9.48051948051948e-08, "logits/chosen": -0.800752580165863, "logits/rejected": -0.809019148349762, "logps/chosen": -480.45501708984375, "logps/rejected": -450.3203430175781, "loss": 0.6931, "rewards/accuracies": 0.375, "rewards/chosen": -0.020324386656284332, "rewards/margins": -0.002547673648223281, "rewards/rejected": -0.017776712775230408, "step": 73 }, { "epoch": 0.048330475957220995, "grad_norm": 6.771433114214737, "learning_rate": 9.61038961038961e-08, "logits/chosen": -0.8106687664985657, "logits/rejected": -0.7991058230400085, "logps/chosen": -474.6903991699219, "logps/rejected": -460.9072265625, "loss": 0.6902, "rewards/accuracies": 0.6875, "rewards/chosen": -0.017290135845541954, "rewards/margins": 0.004097265657037497, "rewards/rejected": -0.02138740010559559, "step": 74 }, { "epoch": 0.04898359049718344, "grad_norm": 6.111034389669093, "learning_rate": 9.74025974025974e-08, "logits/chosen": -0.972756028175354, "logits/rejected": -0.9051934480667114, "logps/chosen": -499.28411865234375, "logps/rejected": -504.16778564453125, "loss": 0.6904, "rewards/accuracies": 0.8125, "rewards/chosen": -0.011824184097349644, "rewards/margins": 0.01180923543870449, "rewards/rejected": -0.02363341674208641, "step": 75 }, { "epoch": 0.04963670503714589, "grad_norm": 6.052193857616701, "learning_rate": 9.870129870129869e-08, "logits/chosen": -0.8308959603309631, "logits/rejected": -0.7871338129043579, "logps/chosen": -451.3121032714844, "logps/rejected": -426.82366943359375, "loss": 0.6898, "rewards/accuracies": 0.625, "rewards/chosen": -0.015030915848910809, "rewards/margins": 0.0026831624563783407, "rewards/rejected": -0.01771407760679722, "step": 76 }, { "epoch": 0.05028981957710833, "grad_norm": 6.393654989952147, "learning_rate": 1e-07, "logits/chosen": -0.8764944672584534, "logits/rejected": -0.823025643825531, "logps/chosen": -431.4548034667969, "logps/rejected": -456.610107421875, "loss": 0.69, "rewards/accuracies": 0.625, "rewards/chosen": -0.01578567549586296, "rewards/margins": 0.006993000861257315, "rewards/rejected": -0.022778674960136414, "step": 77 }, { "epoch": 0.05094293411707078, "grad_norm": 6.160999400670431, "learning_rate": 1.0129870129870129e-07, "logits/chosen": -0.8555769920349121, "logits/rejected": -0.8144134283065796, "logps/chosen": -466.14141845703125, "logps/rejected": -429.2846374511719, "loss": 0.692, "rewards/accuracies": 0.59375, "rewards/chosen": -0.019040152430534363, "rewards/margins": 0.004302749410271645, "rewards/rejected": -0.023342899978160858, "step": 78 }, { "epoch": 0.051596048657033225, "grad_norm": 7.056169927053459, "learning_rate": 1.0259740259740259e-07, "logits/chosen": -0.9062113761901855, "logits/rejected": -0.8697599172592163, "logps/chosen": -453.70489501953125, "logps/rejected": -428.64678955078125, "loss": 0.6906, "rewards/accuracies": 0.6875, "rewards/chosen": -0.017968041822314262, "rewards/margins": 0.005362831987440586, "rewards/rejected": -0.023330872878432274, "step": 79 }, { "epoch": 0.05224916319699567, "grad_norm": 6.855093023185875, "learning_rate": 1.0389610389610388e-07, "logits/chosen": -0.8560048341751099, "logits/rejected": -0.8402966260910034, "logps/chosen": -530.1753540039062, "logps/rejected": -501.68438720703125, "loss": 0.6894, "rewards/accuracies": 0.59375, "rewards/chosen": -0.021542314440011978, "rewards/margins": 0.006968836300075054, "rewards/rejected": -0.028511153534054756, "step": 80 }, { "epoch": 0.05290227773695812, "grad_norm": 7.095472440756087, "learning_rate": 1.051948051948052e-07, "logits/chosen": -0.7790235877037048, "logits/rejected": -0.7337735295295715, "logps/chosen": -510.4595947265625, "logps/rejected": -553.8370361328125, "loss": 0.6896, "rewards/accuracies": 0.59375, "rewards/chosen": -0.03253030404448509, "rewards/margins": 0.009851142764091492, "rewards/rejected": -0.042381446808576584, "step": 81 }, { "epoch": 0.05355539227692056, "grad_norm": 5.876457899804008, "learning_rate": 1.0649350649350648e-07, "logits/chosen": -0.8381537199020386, "logits/rejected": -0.8183386921882629, "logps/chosen": -477.2951965332031, "logps/rejected": -468.66448974609375, "loss": 0.6922, "rewards/accuracies": 0.53125, "rewards/chosen": -0.022215649485588074, "rewards/margins": 0.0009337332448922098, "rewards/rejected": -0.023149382323026657, "step": 82 }, { "epoch": 0.05420850681688301, "grad_norm": 8.317291551451623, "learning_rate": 1.0779220779220779e-07, "logits/chosen": -0.8544670343399048, "logits/rejected": -0.8863725066184998, "logps/chosen": -600.6954345703125, "logps/rejected": -707.2957153320312, "loss": 0.6885, "rewards/accuracies": 0.59375, "rewards/chosen": -0.039892133325338364, "rewards/margins": 0.01902475208044052, "rewards/rejected": -0.05891688913106918, "step": 83 }, { "epoch": 0.054861621356845455, "grad_norm": 6.21282343438225, "learning_rate": 1.0909090909090908e-07, "logits/chosen": -0.8715451955795288, "logits/rejected": -0.9091987609863281, "logps/chosen": -466.81402587890625, "logps/rejected": -482.4119873046875, "loss": 0.6894, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02122860960662365, "rewards/margins": 0.0019442938501015306, "rewards/rejected": -0.023172900080680847, "step": 84 }, { "epoch": 0.0555147358968079, "grad_norm": 5.796572771273212, "learning_rate": 1.1038961038961039e-07, "logits/chosen": -0.8712518215179443, "logits/rejected": -0.8297672271728516, "logps/chosen": -540.7307739257812, "logps/rejected": -499.5742492675781, "loss": 0.6913, "rewards/accuracies": 0.625, "rewards/chosen": -0.025148704648017883, "rewards/margins": 0.0031996634788811207, "rewards/rejected": -0.028348371386528015, "step": 85 }, { "epoch": 0.05616785043677035, "grad_norm": 6.308588424445465, "learning_rate": 1.1168831168831168e-07, "logits/chosen": -0.8423592448234558, "logits/rejected": -0.8116931319236755, "logps/chosen": -547.5648193359375, "logps/rejected": -553.7283325195312, "loss": 0.6886, "rewards/accuracies": 0.59375, "rewards/chosen": -0.028540167957544327, "rewards/margins": 0.01041246484965086, "rewards/rejected": -0.03895263373851776, "step": 86 }, { "epoch": 0.05682096497673279, "grad_norm": 6.03429981202279, "learning_rate": 1.1298701298701299e-07, "logits/chosen": -0.8049710392951965, "logits/rejected": -0.7904379367828369, "logps/chosen": -479.36773681640625, "logps/rejected": -507.9740295410156, "loss": 0.6888, "rewards/accuracies": 0.6875, "rewards/chosen": -0.03078952059149742, "rewards/margins": 0.009615332819521427, "rewards/rejected": -0.040404852479696274, "step": 87 }, { "epoch": 0.05747407951669524, "grad_norm": 6.144437437040281, "learning_rate": 1.1428571428571427e-07, "logits/chosen": -0.7853479385375977, "logits/rejected": -0.7315254211425781, "logps/chosen": -455.5406188964844, "logps/rejected": -487.3783264160156, "loss": 0.6904, "rewards/accuracies": 0.65625, "rewards/chosen": -0.027616048231720924, "rewards/margins": 0.006320532876998186, "rewards/rejected": -0.033936578780412674, "step": 88 }, { "epoch": 0.058127194056657686, "grad_norm": 5.933687262032515, "learning_rate": 1.1558441558441558e-07, "logits/chosen": -0.7603040933609009, "logits/rejected": -0.7568442821502686, "logps/chosen": -436.42315673828125, "logps/rejected": -479.44122314453125, "loss": 0.6905, "rewards/accuracies": 0.59375, "rewards/chosen": -0.028719400987029076, "rewards/margins": 0.008343620225787163, "rewards/rejected": -0.03706301748752594, "step": 89 }, { "epoch": 0.05878030859662013, "grad_norm": 6.521345864177329, "learning_rate": 1.1688311688311687e-07, "logits/chosen": -0.7553395628929138, "logits/rejected": -0.7436259388923645, "logps/chosen": -538.7774047851562, "logps/rejected": -584.1131591796875, "loss": 0.6879, "rewards/accuracies": 0.53125, "rewards/chosen": -0.03026486374437809, "rewards/margins": 0.010531079024076462, "rewards/rejected": -0.0407959409058094, "step": 90 }, { "epoch": 0.05943342313658258, "grad_norm": 6.090184904322082, "learning_rate": 1.1818181818181818e-07, "logits/chosen": -0.9382575154304504, "logits/rejected": -0.9285158514976501, "logps/chosen": -448.5179443359375, "logps/rejected": -442.5186462402344, "loss": 0.6886, "rewards/accuracies": 0.53125, "rewards/chosen": -0.03046746551990509, "rewards/margins": 0.004350304137915373, "rewards/rejected": -0.03481777012348175, "step": 91 }, { "epoch": 0.060086537676545024, "grad_norm": 6.195989030580817, "learning_rate": 1.1948051948051947e-07, "logits/chosen": -0.9594197273254395, "logits/rejected": -0.9092954397201538, "logps/chosen": -488.979736328125, "logps/rejected": -459.5779113769531, "loss": 0.6891, "rewards/accuracies": 0.46875, "rewards/chosen": -0.031389351934194565, "rewards/margins": 0.0023029684089124203, "rewards/rejected": -0.03369232267141342, "step": 92 }, { "epoch": 0.06073965221650747, "grad_norm": 6.679170721604982, "learning_rate": 1.207792207792208e-07, "logits/chosen": -0.8515009880065918, "logits/rejected": -0.8676334619522095, "logps/chosen": -440.25531005859375, "logps/rejected": -465.7403564453125, "loss": 0.6878, "rewards/accuracies": 0.65625, "rewards/chosen": -0.033117592334747314, "rewards/margins": 0.022125840187072754, "rewards/rejected": -0.05524343252182007, "step": 93 }, { "epoch": 0.061392766756469916, "grad_norm": 7.826182713581807, "learning_rate": 1.2207792207792208e-07, "logits/chosen": -0.8225115537643433, "logits/rejected": -0.8626826405525208, "logps/chosen": -454.4405822753906, "logps/rejected": -567.5311279296875, "loss": 0.6824, "rewards/accuracies": 0.625, "rewards/chosen": -0.030026568099856377, "rewards/margins": 0.03317389637231827, "rewards/rejected": -0.0632004663348198, "step": 94 }, { "epoch": 0.06204588129643236, "grad_norm": 6.19437116817884, "learning_rate": 1.2337662337662337e-07, "logits/chosen": -0.92811518907547, "logits/rejected": -0.9229671359062195, "logps/chosen": -429.3043212890625, "logps/rejected": -448.91278076171875, "loss": 0.6874, "rewards/accuracies": 0.75, "rewards/chosen": -0.03139014542102814, "rewards/margins": 0.01759587787091732, "rewards/rejected": -0.04898602515459061, "step": 95 }, { "epoch": 0.06269899583639481, "grad_norm": 6.410822303042916, "learning_rate": 1.2467532467532466e-07, "logits/chosen": -0.7579972147941589, "logits/rejected": -0.749883234500885, "logps/chosen": -426.28485107421875, "logps/rejected": -437.2420654296875, "loss": 0.6868, "rewards/accuracies": 0.84375, "rewards/chosen": -0.028881024569272995, "rewards/margins": 0.018690448254346848, "rewards/rejected": -0.04757147282361984, "step": 96 }, { "epoch": 0.06335211037635725, "grad_norm": 6.669657257596277, "learning_rate": 1.2597402597402597e-07, "logits/chosen": -0.8033837080001831, "logits/rejected": -0.7326114177703857, "logps/chosen": -448.3200378417969, "logps/rejected": -457.5045471191406, "loss": 0.685, "rewards/accuracies": 0.65625, "rewards/chosen": -0.035421933978796005, "rewards/margins": 0.008486203849315643, "rewards/rejected": -0.04390813410282135, "step": 97 }, { "epoch": 0.0640052249163197, "grad_norm": 6.214717449042044, "learning_rate": 1.2727272727272726e-07, "logits/chosen": -0.8324880003929138, "logits/rejected": -0.8319780230522156, "logps/chosen": -511.32891845703125, "logps/rejected": -537.2152099609375, "loss": 0.6885, "rewards/accuracies": 0.5625, "rewards/chosen": -0.047659579664468765, "rewards/margins": 0.014243985526263714, "rewards/rejected": -0.06190356984734535, "step": 98 }, { "epoch": 0.06465833945628215, "grad_norm": 6.031504782803402, "learning_rate": 1.2857142857142858e-07, "logits/chosen": -0.8415837287902832, "logits/rejected": -0.817263126373291, "logps/chosen": -524.89501953125, "logps/rejected": -573.4385375976562, "loss": 0.6886, "rewards/accuracies": 0.71875, "rewards/chosen": -0.04896525666117668, "rewards/margins": 0.025869663804769516, "rewards/rejected": -0.0748349204659462, "step": 99 }, { "epoch": 0.06531145399624459, "grad_norm": 6.485913528950639, "learning_rate": 1.2987012987012987e-07, "logits/chosen": -0.7721492052078247, "logits/rejected": -0.6845570802688599, "logps/chosen": -489.182861328125, "logps/rejected": -462.3575439453125, "loss": 0.686, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05609467625617981, "rewards/margins": 0.005238112062215805, "rewards/rejected": -0.06133279576897621, "step": 100 }, { "epoch": 0.06531145399624459, "eval_logits/chosen": -0.7427287697792053, "eval_logits/rejected": -0.7034040093421936, "eval_logps/chosen": -478.8180847167969, "eval_logps/rejected": -471.3314514160156, "eval_loss": 0.6856335997581482, "eval_rewards/accuracies": 0.6480000019073486, "eval_rewards/chosen": -0.04910915717482567, "eval_rewards/margins": 0.012519298121333122, "eval_rewards/rejected": -0.06162845715880394, "eval_runtime": 615.2872, "eval_samples_per_second": 6.501, "eval_steps_per_second": 0.406, "step": 100 }, { "epoch": 0.06596456853620704, "grad_norm": 6.288509399205162, "learning_rate": 1.3116883116883116e-07, "logits/chosen": -0.8475708961486816, "logits/rejected": -0.8312665224075317, "logps/chosen": -409.4945068359375, "logps/rejected": -514.4505004882812, "loss": 0.6861, "rewards/accuracies": 0.65625, "rewards/chosen": -0.036619801074266434, "rewards/margins": 0.03393262252211571, "rewards/rejected": -0.07055243104696274, "step": 101 }, { "epoch": 0.06661768307616948, "grad_norm": 8.43910713373062, "learning_rate": 1.3246753246753245e-07, "logits/chosen": -0.8536262512207031, "logits/rejected": -0.7814661264419556, "logps/chosen": -474.07562255859375, "logps/rejected": -504.9327697753906, "loss": 0.6803, "rewards/accuracies": 0.75, "rewards/chosen": -0.041596654802560806, "rewards/margins": 0.02779841423034668, "rewards/rejected": -0.06939506530761719, "step": 102 }, { "epoch": 0.06727079761613193, "grad_norm": 5.9455668904408405, "learning_rate": 1.3376623376623374e-07, "logits/chosen": -0.8607446551322937, "logits/rejected": -0.8579328656196594, "logps/chosen": -480.89239501953125, "logps/rejected": -477.357177734375, "loss": 0.6889, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04734374210238457, "rewards/margins": 0.0077428012154996395, "rewards/rejected": -0.05508654564619064, "step": 103 }, { "epoch": 0.06792391215609438, "grad_norm": 7.084091083186374, "learning_rate": 1.3506493506493506e-07, "logits/chosen": -0.7221732139587402, "logits/rejected": -0.7565256953239441, "logps/chosen": -453.64556884765625, "logps/rejected": -535.10986328125, "loss": 0.6872, "rewards/accuracies": 0.78125, "rewards/chosen": -0.05151586979627609, "rewards/margins": 0.03345106542110443, "rewards/rejected": -0.08496693521738052, "step": 104 }, { "epoch": 0.06857702669605682, "grad_norm": 7.518260078445051, "learning_rate": 1.3636363636363635e-07, "logits/chosen": -0.8677546381950378, "logits/rejected": -0.8008454442024231, "logps/chosen": -466.8918151855469, "logps/rejected": -481.9544982910156, "loss": 0.6826, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0516490712761879, "rewards/margins": 0.025956520810723305, "rewards/rejected": -0.07760559022426605, "step": 105 }, { "epoch": 0.06923014123601927, "grad_norm": 6.820450121720659, "learning_rate": 1.3766233766233766e-07, "logits/chosen": -0.7937129735946655, "logits/rejected": -0.7826783657073975, "logps/chosen": -448.1839599609375, "logps/rejected": -473.87188720703125, "loss": 0.6822, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0625004917383194, "rewards/margins": 0.016829611733555794, "rewards/rejected": -0.07933010160923004, "step": 106 }, { "epoch": 0.06988325577598171, "grad_norm": 7.2075078702746715, "learning_rate": 1.3896103896103895e-07, "logits/chosen": -0.9469939470291138, "logits/rejected": -0.9710195064544678, "logps/chosen": -528.0358276367188, "logps/rejected": -546.5836791992188, "loss": 0.6826, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06540645658969879, "rewards/margins": 0.015278171747922897, "rewards/rejected": -0.08068463206291199, "step": 107 }, { "epoch": 0.07053637031594416, "grad_norm": 5.859716807084932, "learning_rate": 1.4025974025974027e-07, "logits/chosen": -0.9260960817337036, "logits/rejected": -0.9055756330490112, "logps/chosen": -437.0494079589844, "logps/rejected": -446.2071533203125, "loss": 0.6855, "rewards/accuracies": 0.625, "rewards/chosen": -0.059963203966617584, "rewards/margins": 0.01329735480248928, "rewards/rejected": -0.07326056808233261, "step": 108 }, { "epoch": 0.0711894848559066, "grad_norm": 6.594713572620182, "learning_rate": 1.4155844155844153e-07, "logits/chosen": -0.8733989000320435, "logits/rejected": -0.8238499164581299, "logps/chosen": -490.5877380371094, "logps/rejected": -517.9608764648438, "loss": 0.6851, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07540058344602585, "rewards/margins": 0.01694507710635662, "rewards/rejected": -0.09234566241502762, "step": 109 }, { "epoch": 0.07184259939586905, "grad_norm": 5.977398778954903, "learning_rate": 1.4285714285714285e-07, "logits/chosen": -0.6950225830078125, "logits/rejected": -0.7042300701141357, "logps/chosen": -360.7035217285156, "logps/rejected": -361.4801330566406, "loss": 0.6849, "rewards/accuracies": 0.40625, "rewards/chosen": -0.05551227554678917, "rewards/margins": 0.004719756543636322, "rewards/rejected": -0.06023203581571579, "step": 110 }, { "epoch": 0.0724957139358315, "grad_norm": 6.145204957324782, "learning_rate": 1.4415584415584414e-07, "logits/chosen": -0.7611751556396484, "logits/rejected": -0.7577171325683594, "logps/chosen": -493.44537353515625, "logps/rejected": -515.5286254882812, "loss": 0.6834, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08799681067466736, "rewards/margins": 0.019383134320378304, "rewards/rejected": -0.10737993568181992, "step": 111 }, { "epoch": 0.07314882847579394, "grad_norm": 6.213137266301709, "learning_rate": 1.4545454545454545e-07, "logits/chosen": -0.8211960792541504, "logits/rejected": -0.7990384697914124, "logps/chosen": -427.08642578125, "logps/rejected": -450.64056396484375, "loss": 0.6845, "rewards/accuracies": 0.53125, "rewards/chosen": -0.0742255300283432, "rewards/margins": 0.0058108242228627205, "rewards/rejected": -0.08003635704517365, "step": 112 }, { "epoch": 0.07380194301575639, "grad_norm": 6.315726030814639, "learning_rate": 1.4675324675324674e-07, "logits/chosen": -0.7984048128128052, "logits/rejected": -0.7764061689376831, "logps/chosen": -471.4085693359375, "logps/rejected": -501.5404052734375, "loss": 0.6777, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07434587180614471, "rewards/margins": 0.03265540301799774, "rewards/rejected": -0.10700127482414246, "step": 113 }, { "epoch": 0.07445505755571884, "grad_norm": 7.263965754945047, "learning_rate": 1.4805194805194806e-07, "logits/chosen": -0.7634120583534241, "logits/rejected": -0.7683815360069275, "logps/chosen": -467.3013916015625, "logps/rejected": -447.38470458984375, "loss": 0.6804, "rewards/accuracies": 0.71875, "rewards/chosen": -0.06479562073945999, "rewards/margins": 0.022856786847114563, "rewards/rejected": -0.08765240758657455, "step": 114 }, { "epoch": 0.07510817209568128, "grad_norm": 6.8685173621443125, "learning_rate": 1.4935064935064935e-07, "logits/chosen": -0.7747349143028259, "logits/rejected": -0.7697080373764038, "logps/chosen": -427.6845703125, "logps/rejected": -428.98291015625, "loss": 0.6787, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07621235400438309, "rewards/margins": 0.010828008875250816, "rewards/rejected": -0.08704036474227905, "step": 115 }, { "epoch": 0.07576128663564373, "grad_norm": 6.505431738842133, "learning_rate": 1.5064935064935064e-07, "logits/chosen": -0.9600522518157959, "logits/rejected": -0.9311190843582153, "logps/chosen": -488.33428955078125, "logps/rejected": -485.6419677734375, "loss": 0.68, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06629984080791473, "rewards/margins": 0.011177876964211464, "rewards/rejected": -0.07747771590948105, "step": 116 }, { "epoch": 0.07641440117560618, "grad_norm": 7.44390451084911, "learning_rate": 1.5194805194805193e-07, "logits/chosen": -0.7398961782455444, "logits/rejected": -0.7255181670188904, "logps/chosen": -511.2425231933594, "logps/rejected": -504.2010803222656, "loss": 0.6798, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07773499935865402, "rewards/margins": 0.02082427777349949, "rewards/rejected": -0.09855926781892776, "step": 117 }, { "epoch": 0.07706751571556862, "grad_norm": 5.877894495278698, "learning_rate": 1.5324675324675324e-07, "logits/chosen": -0.8150400519371033, "logits/rejected": -0.7478416562080383, "logps/chosen": -445.16583251953125, "logps/rejected": -471.5647888183594, "loss": 0.6848, "rewards/accuracies": 0.75, "rewards/chosen": -0.08694851398468018, "rewards/margins": 0.02347085252404213, "rewards/rejected": -0.110419362783432, "step": 118 }, { "epoch": 0.07772063025553107, "grad_norm": 6.184245731480456, "learning_rate": 1.5454545454545453e-07, "logits/chosen": -0.9232463836669922, "logits/rejected": -0.8822212219238281, "logps/chosen": -441.2488098144531, "logps/rejected": -428.74822998046875, "loss": 0.6788, "rewards/accuracies": 0.625, "rewards/chosen": -0.07863913476467133, "rewards/margins": 0.019341323524713516, "rewards/rejected": -0.09798046201467514, "step": 119 }, { "epoch": 0.07837374479549351, "grad_norm": 6.059601097022327, "learning_rate": 1.5584415584415585e-07, "logits/chosen": -0.7953625917434692, "logits/rejected": -0.7896191477775574, "logps/chosen": -454.513671875, "logps/rejected": -499.6773681640625, "loss": 0.6858, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09819990396499634, "rewards/margins": 0.030399596318602562, "rewards/rejected": -0.12859950959682465, "step": 120 }, { "epoch": 0.07902685933545596, "grad_norm": 6.332138116216839, "learning_rate": 1.5714285714285714e-07, "logits/chosen": -0.8561302423477173, "logits/rejected": -0.8522703647613525, "logps/chosen": -545.7410888671875, "logps/rejected": -648.7020874023438, "loss": 0.6739, "rewards/accuracies": 0.78125, "rewards/chosen": -0.13791513442993164, "rewards/margins": 0.099464550614357, "rewards/rejected": -0.23737969994544983, "step": 121 }, { "epoch": 0.0796799738754184, "grad_norm": 6.533955823119497, "learning_rate": 1.5844155844155846e-07, "logits/chosen": -0.9354885220527649, "logits/rejected": -0.8845950961112976, "logps/chosen": -550.8383178710938, "logps/rejected": -526.287109375, "loss": 0.6829, "rewards/accuracies": 0.71875, "rewards/chosen": -0.08527510613203049, "rewards/margins": 0.020781541243195534, "rewards/rejected": -0.10605664551258087, "step": 122 }, { "epoch": 0.08033308841538085, "grad_norm": 6.228795518907741, "learning_rate": 1.5974025974025972e-07, "logits/chosen": -0.8380718231201172, "logits/rejected": -0.7975863218307495, "logps/chosen": -441.62408447265625, "logps/rejected": -465.06524658203125, "loss": 0.6811, "rewards/accuracies": 0.625, "rewards/chosen": -0.09198759496212006, "rewards/margins": 0.05546523258090019, "rewards/rejected": -0.14745283126831055, "step": 123 }, { "epoch": 0.0809862029553433, "grad_norm": 6.98012118568781, "learning_rate": 1.6103896103896104e-07, "logits/chosen": -0.9168799519538879, "logits/rejected": -0.8905295729637146, "logps/chosen": -487.2784118652344, "logps/rejected": -439.59735107421875, "loss": 0.6846, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07767526805400848, "rewards/margins": 0.01843424327671528, "rewards/rejected": -0.09610950946807861, "step": 124 }, { "epoch": 0.08163931749530574, "grad_norm": 7.3021683779348265, "learning_rate": 1.6233766233766232e-07, "logits/chosen": -0.873420000076294, "logits/rejected": -0.8396754264831543, "logps/chosen": -549.7445678710938, "logps/rejected": -550.0377197265625, "loss": 0.6809, "rewards/accuracies": 0.625, "rewards/chosen": -0.13183246552944183, "rewards/margins": 0.015030565671622753, "rewards/rejected": -0.14686302840709686, "step": 125 }, { "epoch": 0.08229243203526819, "grad_norm": 6.911212156499355, "learning_rate": 1.6363636363636364e-07, "logits/chosen": -0.8185966610908508, "logits/rejected": -0.8198251724243164, "logps/chosen": -501.91595458984375, "logps/rejected": -536.2969970703125, "loss": 0.6708, "rewards/accuracies": 0.75, "rewards/chosen": -0.1114685982465744, "rewards/margins": 0.05052608996629715, "rewards/rejected": -0.16199469566345215, "step": 126 }, { "epoch": 0.08294554657523064, "grad_norm": 7.712937728537032, "learning_rate": 1.6493506493506493e-07, "logits/chosen": -0.8163201808929443, "logits/rejected": -0.7817766666412354, "logps/chosen": -549.0859985351562, "logps/rejected": -532.6451416015625, "loss": 0.671, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1389622688293457, "rewards/margins": 0.05282050371170044, "rewards/rejected": -0.19178277254104614, "step": 127 }, { "epoch": 0.08359866111519308, "grad_norm": 6.718382214474356, "learning_rate": 1.6623376623376625e-07, "logits/chosen": -0.9383991360664368, "logits/rejected": -0.9336289167404175, "logps/chosen": -490.7161865234375, "logps/rejected": -474.29144287109375, "loss": 0.6741, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09263370931148529, "rewards/margins": 0.04221198707818985, "rewards/rejected": -0.13484567403793335, "step": 128 }, { "epoch": 0.08425177565515553, "grad_norm": 5.961800672060285, "learning_rate": 1.6753246753246754e-07, "logits/chosen": -0.878481388092041, "logits/rejected": -0.7739174365997314, "logps/chosen": -474.29010009765625, "logps/rejected": -467.60345458984375, "loss": 0.6778, "rewards/accuracies": 0.625, "rewards/chosen": -0.15530624985694885, "rewards/margins": 0.038683634251356125, "rewards/rejected": -0.19398987293243408, "step": 129 }, { "epoch": 0.08490489019511797, "grad_norm": 6.6363324619995385, "learning_rate": 1.6883116883116883e-07, "logits/chosen": -0.9586436748504639, "logits/rejected": -0.9272456169128418, "logps/chosen": -548.6116943359375, "logps/rejected": -552.0598754882812, "loss": 0.6822, "rewards/accuracies": 0.625, "rewards/chosen": -0.12150692194700241, "rewards/margins": 0.045128632336854935, "rewards/rejected": -0.16663555800914764, "step": 130 }, { "epoch": 0.08555800473508042, "grad_norm": 7.156351873248016, "learning_rate": 1.7012987012987012e-07, "logits/chosen": -0.8477173447608948, "logits/rejected": -0.8483027219772339, "logps/chosen": -464.6786193847656, "logps/rejected": -552.1746826171875, "loss": 0.6671, "rewards/accuracies": 0.75, "rewards/chosen": -0.13094085454940796, "rewards/margins": 0.07437079399824142, "rewards/rejected": -0.2053116410970688, "step": 131 }, { "epoch": 0.08621111927504287, "grad_norm": 11.567562751248682, "learning_rate": 1.714285714285714e-07, "logits/chosen": -0.810928463935852, "logits/rejected": -0.8170727491378784, "logps/chosen": -473.8848571777344, "logps/rejected": -483.2086486816406, "loss": 0.6811, "rewards/accuracies": 0.75, "rewards/chosen": -0.11197689175605774, "rewards/margins": 0.042106710374355316, "rewards/rejected": -0.15408360958099365, "step": 132 }, { "epoch": 0.08686423381500531, "grad_norm": 6.666536354703348, "learning_rate": 1.7272727272727272e-07, "logits/chosen": -0.8156629204750061, "logits/rejected": -0.7771432399749756, "logps/chosen": -588.0108032226562, "logps/rejected": -608.7498168945312, "loss": 0.6779, "rewards/accuracies": 0.71875, "rewards/chosen": -0.20428195595741272, "rewards/margins": 0.08440540730953217, "rewards/rejected": -0.2886873781681061, "step": 133 }, { "epoch": 0.08751734835496776, "grad_norm": 7.1435262883808734, "learning_rate": 1.74025974025974e-07, "logits/chosen": -0.774854302406311, "logits/rejected": -0.7982902526855469, "logps/chosen": -470.2498474121094, "logps/rejected": -583.7744750976562, "loss": 0.6698, "rewards/accuracies": 0.625, "rewards/chosen": -0.1737588495016098, "rewards/margins": 0.1177835762500763, "rewards/rejected": -0.2915424108505249, "step": 134 }, { "epoch": 0.0881704628949302, "grad_norm": 6.2313339803498655, "learning_rate": 1.7532467532467533e-07, "logits/chosen": -0.8765559196472168, "logits/rejected": -0.8805602192878723, "logps/chosen": -469.5799255371094, "logps/rejected": -518.7642822265625, "loss": 0.6744, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15586817264556885, "rewards/margins": 0.038987912237644196, "rewards/rejected": -0.19485607743263245, "step": 135 }, { "epoch": 0.08882357743489265, "grad_norm": 6.745544786449858, "learning_rate": 1.7662337662337662e-07, "logits/chosen": -0.8797418475151062, "logits/rejected": -0.8522154688835144, "logps/chosen": -551.00146484375, "logps/rejected": -525.474609375, "loss": 0.6662, "rewards/accuracies": 0.71875, "rewards/chosen": -0.19286242127418518, "rewards/margins": 0.05776097998023033, "rewards/rejected": -0.2506234049797058, "step": 136 }, { "epoch": 0.0894766919748551, "grad_norm": 7.32189969799847, "learning_rate": 1.779220779220779e-07, "logits/chosen": -0.9289051294326782, "logits/rejected": -0.9343584179878235, "logps/chosen": -466.8741455078125, "logps/rejected": -475.151611328125, "loss": 0.6611, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1371385008096695, "rewards/margins": 0.011942176148295403, "rewards/rejected": -0.14908067882061005, "step": 137 }, { "epoch": 0.09012980651481754, "grad_norm": 6.847931231483095, "learning_rate": 1.792207792207792e-07, "logits/chosen": -0.8206534385681152, "logits/rejected": -0.8144592642784119, "logps/chosen": -501.97467041015625, "logps/rejected": -535.951904296875, "loss": 0.6776, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1563834846019745, "rewards/margins": 0.054855406284332275, "rewards/rejected": -0.21123890578746796, "step": 138 }, { "epoch": 0.09078292105477999, "grad_norm": 6.944669833446691, "learning_rate": 1.805194805194805e-07, "logits/chosen": -0.8529220819473267, "logits/rejected": -0.8208853006362915, "logps/chosen": -459.8351135253906, "logps/rejected": -492.44573974609375, "loss": 0.6773, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1605282872915268, "rewards/margins": 0.03590104728937149, "rewards/rejected": -0.19642934203147888, "step": 139 }, { "epoch": 0.09143603559474243, "grad_norm": 6.928164394224459, "learning_rate": 1.818181818181818e-07, "logits/chosen": -0.7550444006919861, "logits/rejected": -0.7985954880714417, "logps/chosen": -370.5205078125, "logps/rejected": -476.3122253417969, "loss": 0.6676, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14772818982601166, "rewards/margins": 0.13047771155834198, "rewards/rejected": -0.27820590138435364, "step": 140 }, { "epoch": 0.09208915013470488, "grad_norm": 6.968051505403654, "learning_rate": 1.8311688311688312e-07, "logits/chosen": -0.8749439716339111, "logits/rejected": -0.8448376655578613, "logps/chosen": -494.04486083984375, "logps/rejected": -511.3379211425781, "loss": 0.6682, "rewards/accuracies": 0.625, "rewards/chosen": -0.2082904875278473, "rewards/margins": 0.04293321818113327, "rewards/rejected": -0.25122371315956116, "step": 141 }, { "epoch": 0.09274226467466731, "grad_norm": 6.287379820512906, "learning_rate": 1.844155844155844e-07, "logits/chosen": -0.8604624271392822, "logits/rejected": -0.8241639137268066, "logps/chosen": -480.1129455566406, "logps/rejected": -433.39208984375, "loss": 0.6757, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15264149010181427, "rewards/margins": 0.027708852663636208, "rewards/rejected": -0.18035033345222473, "step": 142 }, { "epoch": 0.09339537921462976, "grad_norm": 6.399592718147689, "learning_rate": 1.8571428571428572e-07, "logits/chosen": -0.9117648601531982, "logits/rejected": -0.8506286144256592, "logps/chosen": -496.1771240234375, "logps/rejected": -476.2972106933594, "loss": 0.6791, "rewards/accuracies": 0.65625, "rewards/chosen": -0.19126328825950623, "rewards/margins": 0.022945519536733627, "rewards/rejected": -0.21420881152153015, "step": 143 }, { "epoch": 0.0940484937545922, "grad_norm": 6.323941767214625, "learning_rate": 1.87012987012987e-07, "logits/chosen": -0.9264488816261292, "logits/rejected": -0.9081891775131226, "logps/chosen": -481.20404052734375, "logps/rejected": -480.6315612792969, "loss": 0.6782, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1617911458015442, "rewards/margins": 0.03854794800281525, "rewards/rejected": -0.20033907890319824, "step": 144 }, { "epoch": 0.09470160829455465, "grad_norm": 6.647653521405033, "learning_rate": 1.883116883116883e-07, "logits/chosen": -0.8386011123657227, "logits/rejected": -0.7837764024734497, "logps/chosen": -458.1123352050781, "logps/rejected": -543.1805419921875, "loss": 0.6695, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21621732413768768, "rewards/margins": 0.12856005132198334, "rewards/rejected": -0.34477734565734863, "step": 145 }, { "epoch": 0.0953547228345171, "grad_norm": 7.22549122734511, "learning_rate": 1.896103896103896e-07, "logits/chosen": -0.7894729375839233, "logits/rejected": -0.7863112092018127, "logps/chosen": -535.9451904296875, "logps/rejected": -595.9136352539062, "loss": 0.6751, "rewards/accuracies": 0.625, "rewards/chosen": -0.25997599959373474, "rewards/margins": 0.13046929240226746, "rewards/rejected": -0.3904453217983246, "step": 146 }, { "epoch": 0.09600783737447954, "grad_norm": 6.515339257767948, "learning_rate": 1.909090909090909e-07, "logits/chosen": -0.8731563091278076, "logits/rejected": -0.8720945119857788, "logps/chosen": -479.39776611328125, "logps/rejected": -538.8330078125, "loss": 0.6701, "rewards/accuracies": 0.59375, "rewards/chosen": -0.22556887567043304, "rewards/margins": 0.12516018748283386, "rewards/rejected": -0.3507290482521057, "step": 147 }, { "epoch": 0.09666095191444199, "grad_norm": 6.422766757760342, "learning_rate": 1.922077922077922e-07, "logits/chosen": -0.9519265294075012, "logits/rejected": -0.9368531107902527, "logps/chosen": -473.68560791015625, "logps/rejected": -474.37939453125, "loss": 0.6683, "rewards/accuracies": 0.5625, "rewards/chosen": -0.18442487716674805, "rewards/margins": 0.029139067977666855, "rewards/rejected": -0.213563933968544, "step": 148 }, { "epoch": 0.09731406645440444, "grad_norm": 6.65435406028437, "learning_rate": 1.9350649350649352e-07, "logits/chosen": -0.9168094396591187, "logits/rejected": -0.9032158851623535, "logps/chosen": -461.6015930175781, "logps/rejected": -425.8824462890625, "loss": 0.6689, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2183716893196106, "rewards/margins": 0.02544441632926464, "rewards/rejected": -0.24381610751152039, "step": 149 }, { "epoch": 0.09796718099436688, "grad_norm": 6.9916618084619175, "learning_rate": 1.948051948051948e-07, "logits/chosen": -0.7993723154067993, "logits/rejected": -0.7403357625007629, "logps/chosen": -549.0083618164062, "logps/rejected": -495.74847412109375, "loss": 0.6565, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2393510341644287, "rewards/margins": 0.03217237442731857, "rewards/rejected": -0.2715234160423279, "step": 150 }, { "epoch": 0.09862029553432933, "grad_norm": 6.002844245053828, "learning_rate": 1.961038961038961e-07, "logits/chosen": -0.8811283707618713, "logits/rejected": -0.8406999111175537, "logps/chosen": -448.83868408203125, "logps/rejected": -474.09320068359375, "loss": 0.667, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17506016790866852, "rewards/margins": 0.10420884937047958, "rewards/rejected": -0.2792690396308899, "step": 151 }, { "epoch": 0.09927341007429177, "grad_norm": 6.898188356647436, "learning_rate": 1.9740259740259739e-07, "logits/chosen": -0.9522019028663635, "logits/rejected": -0.9420698881149292, "logps/chosen": -476.7374267578125, "logps/rejected": -501.6094055175781, "loss": 0.6578, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22090810537338257, "rewards/margins": 0.062128640711307526, "rewards/rejected": -0.2830367684364319, "step": 152 }, { "epoch": 0.09992652461425422, "grad_norm": 6.4452295203644825, "learning_rate": 1.987012987012987e-07, "logits/chosen": -0.9114395380020142, "logits/rejected": -0.897411584854126, "logps/chosen": -495.1845397949219, "logps/rejected": -469.0250244140625, "loss": 0.6644, "rewards/accuracies": 0.625, "rewards/chosen": -0.21842332184314728, "rewards/margins": 0.025180380791425705, "rewards/rejected": -0.24360370635986328, "step": 153 }, { "epoch": 0.10057963915421667, "grad_norm": 6.438642037152613, "learning_rate": 2e-07, "logits/chosen": -0.9732076525688171, "logits/rejected": -0.8783437013626099, "logps/chosen": -459.40673828125, "logps/rejected": -439.1183776855469, "loss": 0.6663, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2149794101715088, "rewards/margins": 0.020638851448893547, "rewards/rejected": -0.23561826348304749, "step": 154 }, { "epoch": 0.10123275369417911, "grad_norm": 6.922197606721808, "learning_rate": 1.9999973974344256e-07, "logits/chosen": -1.0275825262069702, "logits/rejected": -0.9289765357971191, "logps/chosen": -447.41766357421875, "logps/rejected": -410.023193359375, "loss": 0.666, "rewards/accuracies": 0.625, "rewards/chosen": -0.2406703233718872, "rewards/margins": 0.03814300149679184, "rewards/rejected": -0.27881333231925964, "step": 155 }, { "epoch": 0.10188586823414156, "grad_norm": 8.364313956052694, "learning_rate": 1.999989589751249e-07, "logits/chosen": -0.8802437782287598, "logits/rejected": -0.9175126552581787, "logps/chosen": -527.3651123046875, "logps/rejected": -564.7503051757812, "loss": 0.6565, "rewards/accuracies": 0.65625, "rewards/chosen": -0.27885937690734863, "rewards/margins": 0.0818130373954773, "rewards/rejected": -0.3606724143028259, "step": 156 }, { "epoch": 0.102538982774104, "grad_norm": 6.154889997810939, "learning_rate": 1.9999765769911105e-07, "logits/chosen": -0.8315755724906921, "logits/rejected": -0.7917363047599792, "logps/chosen": -484.8338623046875, "logps/rejected": -492.9305419921875, "loss": 0.6668, "rewards/accuracies": 0.75, "rewards/chosen": -0.1891915202140808, "rewards/margins": 0.07525278627872467, "rewards/rejected": -0.26444435119628906, "step": 157 }, { "epoch": 0.10319209731406645, "grad_norm": 6.516656144007839, "learning_rate": 1.999958359221743e-07, "logits/chosen": -0.9213133454322815, "logits/rejected": -0.8424961566925049, "logps/chosen": -497.5096435546875, "logps/rejected": -483.623046875, "loss": 0.6643, "rewards/accuracies": 0.71875, "rewards/chosen": -0.23822657763957977, "rewards/margins": 0.054590702056884766, "rewards/rejected": -0.29281729459762573, "step": 158 }, { "epoch": 0.1038452118540289, "grad_norm": 6.620879007235329, "learning_rate": 1.999934936537972e-07, "logits/chosen": -0.8575921058654785, "logits/rejected": -0.8345927000045776, "logps/chosen": -440.8963623046875, "logps/rejected": -478.8257141113281, "loss": 0.6524, "rewards/accuracies": 0.625, "rewards/chosen": -0.2483789175748825, "rewards/margins": 0.10234285145998001, "rewards/rejected": -0.3507218062877655, "step": 159 }, { "epoch": 0.10449832639399134, "grad_norm": 6.828503715764836, "learning_rate": 1.9999063090617166e-07, "logits/chosen": -0.9059373140335083, "logits/rejected": -0.8238606452941895, "logps/chosen": -526.1168212890625, "logps/rejected": -510.9229431152344, "loss": 0.6583, "rewards/accuracies": 0.75, "rewards/chosen": -0.28075307607650757, "rewards/margins": 0.07372645288705826, "rewards/rejected": -0.3544795513153076, "step": 160 }, { "epoch": 0.10515144093395379, "grad_norm": 6.0655028367067505, "learning_rate": 1.9998724769419858e-07, "logits/chosen": -0.8036881685256958, "logits/rejected": -0.7686895132064819, "logps/chosen": -383.0535583496094, "logps/rejected": -403.23736572265625, "loss": 0.6746, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18963344395160675, "rewards/margins": 0.05757593736052513, "rewards/rejected": -0.24720938503742218, "step": 161 }, { "epoch": 0.10580455547391623, "grad_norm": 6.590865574876252, "learning_rate": 1.9998334403548806e-07, "logits/chosen": -0.9176549315452576, "logits/rejected": -0.9008191823959351, "logps/chosen": -531.5628662109375, "logps/rejected": -509.3006591796875, "loss": 0.676, "rewards/accuracies": 0.65625, "rewards/chosen": -0.27991557121276855, "rewards/margins": 0.058448441326618195, "rewards/rejected": -0.33836400508880615, "step": 162 }, { "epoch": 0.10645767001387868, "grad_norm": 6.467086480102644, "learning_rate": 1.9997891995035913e-07, "logits/chosen": -0.9845995903015137, "logits/rejected": -0.9540070295333862, "logps/chosen": -553.7200927734375, "logps/rejected": -549.807861328125, "loss": 0.6718, "rewards/accuracies": 0.6875, "rewards/chosen": -0.34165340662002563, "rewards/margins": 0.04177336394786835, "rewards/rejected": -0.38342681527137756, "step": 163 }, { "epoch": 0.10711078455384113, "grad_norm": 6.633851760456449, "learning_rate": 1.9997397546183974e-07, "logits/chosen": -0.8908398151397705, "logits/rejected": -0.9239029884338379, "logps/chosen": -435.40771484375, "logps/rejected": -506.06903076171875, "loss": 0.655, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2753165364265442, "rewards/margins": 0.14071771502494812, "rewards/rejected": -0.4160342216491699, "step": 164 }, { "epoch": 0.10776389909380357, "grad_norm": 7.504573304749105, "learning_rate": 1.999685105956666e-07, "logits/chosen": -0.84445720911026, "logits/rejected": -0.846612811088562, "logps/chosen": -481.5384521484375, "logps/rejected": -507.06134033203125, "loss": 0.6437, "rewards/accuracies": 0.625, "rewards/chosen": -0.28293365240097046, "rewards/margins": 0.06457722187042236, "rewards/rejected": -0.34751084446907043, "step": 165 }, { "epoch": 0.10841701363376602, "grad_norm": 6.844588401003989, "learning_rate": 1.9996252538028508e-07, "logits/chosen": -0.9970215559005737, "logits/rejected": -0.9549089670181274, "logps/chosen": -523.70068359375, "logps/rejected": -515.5821533203125, "loss": 0.6466, "rewards/accuracies": 0.75, "rewards/chosen": -0.3379030227661133, "rewards/margins": 0.1277962177991867, "rewards/rejected": -0.4656992256641388, "step": 166 }, { "epoch": 0.10907012817372846, "grad_norm": 7.236130567280733, "learning_rate": 1.9995601984684897e-07, "logits/chosen": -0.7774499654769897, "logits/rejected": -0.7958889007568359, "logps/chosen": -579.2680053710938, "logps/rejected": -638.60791015625, "loss": 0.6503, "rewards/accuracies": 0.84375, "rewards/chosen": -0.44350510835647583, "rewards/margins": 0.1982521116733551, "rewards/rejected": -0.6417572498321533, "step": 167 }, { "epoch": 0.10972324271369091, "grad_norm": 6.547100915291269, "learning_rate": 1.9994899402922046e-07, "logits/chosen": -0.7743135690689087, "logits/rejected": -0.7389811277389526, "logps/chosen": -477.3546142578125, "logps/rejected": -516.999267578125, "loss": 0.6593, "rewards/accuracies": 0.6875, "rewards/chosen": -0.29847046732902527, "rewards/margins": 0.18295258283615112, "rewards/rejected": -0.4814230799674988, "step": 168 }, { "epoch": 0.11037635725365336, "grad_norm": 6.580578918239811, "learning_rate": 1.999414479639698e-07, "logits/chosen": -0.8631864786148071, "logits/rejected": -0.8823621273040771, "logps/chosen": -490.77557373046875, "logps/rejected": -499.3733825683594, "loss": 0.6744, "rewards/accuracies": 0.65625, "rewards/chosen": -0.35784608125686646, "rewards/margins": 0.03360201418399811, "rewards/rejected": -0.39144808053970337, "step": 169 }, { "epoch": 0.1110294717936158, "grad_norm": 7.304635089452459, "learning_rate": 1.9993338169037532e-07, "logits/chosen": -0.9329928755760193, "logits/rejected": -0.8873375654220581, "logps/chosen": -536.2473754882812, "logps/rejected": -565.2854614257812, "loss": 0.6412, "rewards/accuracies": 0.78125, "rewards/chosen": -0.35816970467567444, "rewards/margins": 0.16970431804656982, "rewards/rejected": -0.5278739929199219, "step": 170 }, { "epoch": 0.11168258633357825, "grad_norm": 8.526831736459082, "learning_rate": 1.99924795250423e-07, "logits/chosen": -0.9018800854682922, "logits/rejected": -0.9441125392913818, "logps/chosen": -514.7568359375, "logps/rejected": -537.2463989257812, "loss": 0.6555, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3121597468852997, "rewards/margins": 0.09106716513633728, "rewards/rejected": -0.4032268822193146, "step": 171 }, { "epoch": 0.1123357008735407, "grad_norm": 6.974764550329646, "learning_rate": 1.9991568868880638e-07, "logits/chosen": -0.7577574849128723, "logits/rejected": -0.7482568025588989, "logps/chosen": -465.4934997558594, "logps/rejected": -487.9968566894531, "loss": 0.6518, "rewards/accuracies": 0.75, "rewards/chosen": -0.3646446764469147, "rewards/margins": 0.11736685037612915, "rewards/rejected": -0.48201149702072144, "step": 172 }, { "epoch": 0.11298881541350314, "grad_norm": 6.435484683449247, "learning_rate": 1.999060620529263e-07, "logits/chosen": -0.8632010221481323, "logits/rejected": -0.8566238880157471, "logps/chosen": -494.66363525390625, "logps/rejected": -519.395751953125, "loss": 0.6526, "rewards/accuracies": 0.78125, "rewards/chosen": -0.22226670384407043, "rewards/margins": 0.15008774399757385, "rewards/rejected": -0.3723544478416443, "step": 173 }, { "epoch": 0.11364192995346559, "grad_norm": 7.420727411793039, "learning_rate": 1.998959153928907e-07, "logits/chosen": -0.9504894018173218, "logits/rejected": -0.9857321977615356, "logps/chosen": -527.0250244140625, "logps/rejected": -545.4429931640625, "loss": 0.6424, "rewards/accuracies": 0.75, "rewards/chosen": -0.3586081862449646, "rewards/margins": 0.08520010113716125, "rewards/rejected": -0.44380828738212585, "step": 174 }, { "epoch": 0.11429504449342803, "grad_norm": 6.600657284454907, "learning_rate": 1.9988524876151422e-07, "logits/chosen": -0.7927142381668091, "logits/rejected": -0.7309151887893677, "logps/chosen": -497.8948059082031, "logps/rejected": -530.8662719726562, "loss": 0.6508, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4121289849281311, "rewards/margins": 0.22072520852088928, "rewards/rejected": -0.6328542828559875, "step": 175 }, { "epoch": 0.11494815903339048, "grad_norm": 6.711546585878465, "learning_rate": 1.9987406221431812e-07, "logits/chosen": -0.9204037189483643, "logits/rejected": -0.8354212641716003, "logps/chosen": -486.84637451171875, "logps/rejected": -469.3777770996094, "loss": 0.6305, "rewards/accuracies": 0.84375, "rewards/chosen": -0.30236339569091797, "rewards/margins": 0.1800605207681656, "rewards/rejected": -0.4824238419532776, "step": 176 }, { "epoch": 0.11560127357335293, "grad_norm": 7.413380504551747, "learning_rate": 1.9986235580952986e-07, "logits/chosen": -0.898276686668396, "logits/rejected": -0.8423072695732117, "logps/chosen": -421.8368835449219, "logps/rejected": -463.8457946777344, "loss": 0.6556, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2767115533351898, "rewards/margins": 0.15376242995262146, "rewards/rejected": -0.4304739832878113, "step": 177 }, { "epoch": 0.11625438811331537, "grad_norm": 7.4603000934120685, "learning_rate": 1.9985012960808275e-07, "logits/chosen": -0.9808669686317444, "logits/rejected": -0.9150485396385193, "logps/chosen": -546.5240478515625, "logps/rejected": -494.89044189453125, "loss": 0.6512, "rewards/accuracies": 0.6875, "rewards/chosen": -0.39803603291511536, "rewards/margins": 0.059254515916109085, "rewards/rejected": -0.45729053020477295, "step": 178 }, { "epoch": 0.11690750265327782, "grad_norm": 7.101437920971849, "learning_rate": 1.998373836736158e-07, "logits/chosen": -0.9734604954719543, "logits/rejected": -0.8278497457504272, "logps/chosen": -579.0867919921875, "logps/rejected": -503.48492431640625, "loss": 0.6381, "rewards/accuracies": 0.5625, "rewards/chosen": -0.42628052830696106, "rewards/margins": 0.023855991661548615, "rewards/rejected": -0.4501365125179291, "step": 179 }, { "epoch": 0.11756061719324026, "grad_norm": 6.538781837196094, "learning_rate": 1.998241180724733e-07, "logits/chosen": -0.9165725708007812, "logits/rejected": -0.9196174144744873, "logps/chosen": -464.7592468261719, "logps/rejected": -482.8262023925781, "loss": 0.6509, "rewards/accuracies": 0.59375, "rewards/chosen": -0.41940075159072876, "rewards/margins": 0.05880502983927727, "rewards/rejected": -0.47820577025413513, "step": 180 }, { "epoch": 0.11821373173320271, "grad_norm": 7.321345164948039, "learning_rate": 1.998103328737044e-07, "logits/chosen": -1.006074070930481, "logits/rejected": -1.001975417137146, "logps/chosen": -491.1534423828125, "logps/rejected": -515.4642333984375, "loss": 0.6435, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3396851420402527, "rewards/margins": 0.13319769501686096, "rewards/rejected": -0.47288286685943604, "step": 181 }, { "epoch": 0.11886684627316516, "grad_norm": 6.894930940879784, "learning_rate": 1.997960281490629e-07, "logits/chosen": -0.927212119102478, "logits/rejected": -0.9049844145774841, "logps/chosen": -600.8464965820312, "logps/rejected": -554.48095703125, "loss": 0.6648, "rewards/accuracies": 0.625, "rewards/chosen": -0.5532928109169006, "rewards/margins": 0.03803117573261261, "rewards/rejected": -0.591323971748352, "step": 182 }, { "epoch": 0.1195199608131276, "grad_norm": 7.83193928010655, "learning_rate": 1.9978120397300673e-07, "logits/chosen": -0.8268590569496155, "logits/rejected": -0.7920058965682983, "logps/chosen": -452.5147705078125, "logps/rejected": -533.310302734375, "loss": 0.6315, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4391025900840759, "rewards/margins": 0.2984185218811035, "rewards/rejected": -0.7375210523605347, "step": 183 }, { "epoch": 0.12017307535309005, "grad_norm": 6.386679940574844, "learning_rate": 1.9976586042269772e-07, "logits/chosen": -0.8321893215179443, "logits/rejected": -0.8634829521179199, "logps/chosen": -488.01654052734375, "logps/rejected": -579.4048461914062, "loss": 0.6259, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4608284533023834, "rewards/margins": 0.266685426235199, "rewards/rejected": -0.72751384973526, "step": 184 }, { "epoch": 0.1208261898930525, "grad_norm": 6.8983232744264305, "learning_rate": 1.9974999757800103e-07, "logits/chosen": -0.832870602607727, "logits/rejected": -0.8473939895629883, "logps/chosen": -465.09686279296875, "logps/rejected": -476.3861083984375, "loss": 0.6348, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4560048282146454, "rewards/margins": 0.16139870882034302, "rewards/rejected": -0.6174035668373108, "step": 185 }, { "epoch": 0.12147930443301494, "grad_norm": 7.754190109178935, "learning_rate": 1.9973361552148487e-07, "logits/chosen": -0.8424513339996338, "logits/rejected": -0.8219509124755859, "logps/chosen": -574.000732421875, "logps/rejected": -605.0245361328125, "loss": 0.6487, "rewards/accuracies": 0.75, "rewards/chosen": -0.5188160538673401, "rewards/margins": 0.3015327453613281, "rewards/rejected": -0.8203487992286682, "step": 186 }, { "epoch": 0.12213241897297739, "grad_norm": 6.591297590145272, "learning_rate": 1.9971671433841998e-07, "logits/chosen": -0.8903267979621887, "logits/rejected": -0.8687557578086853, "logps/chosen": -519.8242797851562, "logps/rejected": -529.4027099609375, "loss": 0.6555, "rewards/accuracies": 0.71875, "rewards/chosen": -0.463649719953537, "rewards/margins": 0.062138475477695465, "rewards/rejected": -0.5257881879806519, "step": 187 }, { "epoch": 0.12278553351293983, "grad_norm": 7.4689116101629125, "learning_rate": 1.996992941167792e-07, "logits/chosen": -1.0059764385223389, "logits/rejected": -0.9330079555511475, "logps/chosen": -653.8500366210938, "logps/rejected": -690.8359375, "loss": 0.656, "rewards/accuracies": 0.59375, "rewards/chosen": -0.6045035719871521, "rewards/margins": 0.2457866370677948, "rewards/rejected": -0.8502901792526245, "step": 188 }, { "epoch": 0.12343864805290228, "grad_norm": 7.523661722899911, "learning_rate": 1.996813549472371e-07, "logits/chosen": -0.8645133376121521, "logits/rejected": -0.8016350269317627, "logps/chosen": -536.940673828125, "logps/rejected": -569.5060424804688, "loss": 0.6702, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5659014582633972, "rewards/margins": 0.19686836004257202, "rewards/rejected": -0.7627697587013245, "step": 189 }, { "epoch": 0.12409176259286472, "grad_norm": 7.435260811262935, "learning_rate": 1.9966289692316943e-07, "logits/chosen": -0.9768977165222168, "logits/rejected": -0.9664483666419983, "logps/chosen": -512.6085205078125, "logps/rejected": -521.680908203125, "loss": 0.6227, "rewards/accuracies": 0.875, "rewards/chosen": -0.48955708742141724, "rewards/margins": 0.16032075881958008, "rewards/rejected": -0.6498778462409973, "step": 190 }, { "epoch": 0.12474487713282717, "grad_norm": 7.410166333031568, "learning_rate": 1.996439201406526e-07, "logits/chosen": -0.8727903366088867, "logits/rejected": -0.8746020793914795, "logps/chosen": -460.9349060058594, "logps/rejected": -550.4550170898438, "loss": 0.6256, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4731426239013672, "rewards/margins": 0.357341468334198, "rewards/rejected": -0.8304840326309204, "step": 191 }, { "epoch": 0.12539799167278962, "grad_norm": 7.451321792890638, "learning_rate": 1.9962442469846325e-07, "logits/chosen": -0.9719246625900269, "logits/rejected": -0.9369036555290222, "logps/chosen": -501.4394226074219, "logps/rejected": -502.3212585449219, "loss": 0.6383, "rewards/accuracies": 0.6875, "rewards/chosen": -0.44892147183418274, "rewards/margins": 0.11068614572286606, "rewards/rejected": -0.5596076250076294, "step": 192 }, { "epoch": 0.12605110621275206, "grad_norm": 7.35526823945006, "learning_rate": 1.9960441069807775e-07, "logits/chosen": -0.9953307509422302, "logits/rejected": -1.020664930343628, "logps/chosen": -521.7427368164062, "logps/rejected": -562.8399658203125, "loss": 0.6475, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5778794288635254, "rewards/margins": 0.2361249178647995, "rewards/rejected": -0.8140044212341309, "step": 193 }, { "epoch": 0.1267042207527145, "grad_norm": 7.423354130295339, "learning_rate": 1.9958387824367153e-07, "logits/chosen": -0.8011785745620728, "logits/rejected": -0.7897564172744751, "logps/chosen": -497.1465148925781, "logps/rejected": -523.15576171875, "loss": 0.6356, "rewards/accuracies": 0.625, "rewards/chosen": -0.5150357484817505, "rewards/margins": 0.06541642546653748, "rewards/rejected": -0.5804521441459656, "step": 194 }, { "epoch": 0.12735733529267695, "grad_norm": 6.975653189759925, "learning_rate": 1.9956282744211878e-07, "logits/chosen": -0.9849708676338196, "logits/rejected": -0.9643858671188354, "logps/chosen": -563.0205078125, "logps/rejected": -580.1819458007812, "loss": 0.6415, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6102763414382935, "rewards/margins": 0.12003323435783386, "rewards/rejected": -0.7303095459938049, "step": 195 }, { "epoch": 0.1280104498326394, "grad_norm": 7.2065492895384295, "learning_rate": 1.9954125840299163e-07, "logits/chosen": -0.9601786732673645, "logits/rejected": -0.8700001835823059, "logps/chosen": -471.837158203125, "logps/rejected": -481.9418029785156, "loss": 0.63, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4637281894683838, "rewards/margins": 0.13474319875240326, "rewards/rejected": -0.5984713435173035, "step": 196 }, { "epoch": 0.12866356437260185, "grad_norm": 6.808584697465035, "learning_rate": 1.9951917123855978e-07, "logits/chosen": -0.8570945262908936, "logits/rejected": -0.8283834457397461, "logps/chosen": -463.7060546875, "logps/rejected": -492.38006591796875, "loss": 0.6419, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5440891981124878, "rewards/margins": 0.13518810272216797, "rewards/rejected": -0.6792773008346558, "step": 197 }, { "epoch": 0.1293166789125643, "grad_norm": 7.7662023295805405, "learning_rate": 1.994965660637898e-07, "logits/chosen": -0.9607435464859009, "logits/rejected": -0.9405413866043091, "logps/chosen": -539.56884765625, "logps/rejected": -557.1348266601562, "loss": 0.614, "rewards/accuracies": 0.875, "rewards/chosen": -0.5751670598983765, "rewards/margins": 0.279240220785141, "rewards/rejected": -0.8544072508811951, "step": 198 }, { "epoch": 0.12996979345252674, "grad_norm": 6.6784323628411775, "learning_rate": 1.994734429963446e-07, "logits/chosen": -0.8653057813644409, "logits/rejected": -0.8396562337875366, "logps/chosen": -538.5821533203125, "logps/rejected": -567.431396484375, "loss": 0.6326, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6097425818443298, "rewards/margins": 0.1930355578660965, "rewards/rejected": -0.8027780652046204, "step": 199 }, { "epoch": 0.13062290799248918, "grad_norm": 6.693517839607438, "learning_rate": 1.994498021565828e-07, "logits/chosen": -0.9042263031005859, "logits/rejected": -0.9179930686950684, "logps/chosen": -510.3880615234375, "logps/rejected": -595.0519409179688, "loss": 0.6218, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5225197076797485, "rewards/margins": 0.37714314460754395, "rewards/rejected": -0.8996628522872925, "step": 200 }, { "epoch": 0.13062290799248918, "eval_logits/chosen": -0.8125271797180176, "eval_logits/rejected": -0.7770608067512512, "eval_logps/chosen": -535.1920166015625, "eval_logps/rejected": -542.3652954101562, "eval_loss": 0.6276674866676331, "eval_rewards/accuracies": 0.6959999799728394, "eval_rewards/chosen": -0.6128481030464172, "eval_rewards/margins": 0.15911929309368134, "eval_rewards/rejected": -0.7719674110412598, "eval_runtime": 619.4957, "eval_samples_per_second": 6.457, "eval_steps_per_second": 0.404, "step": 200 }, { "epoch": 0.13127602253245163, "grad_norm": 7.287720671466593, "learning_rate": 1.9942564366755805e-07, "logits/chosen": -0.8729226589202881, "logits/rejected": -0.9026110768318176, "logps/chosen": -488.0004577636719, "logps/rejected": -517.5531616210938, "loss": 0.6221, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5494257807731628, "rewards/margins": 0.14743518829345703, "rewards/rejected": -0.6968609690666199, "step": 201 }, { "epoch": 0.13192913707241408, "grad_norm": 6.635612912694771, "learning_rate": 1.9940096765501845e-07, "logits/chosen": -0.8704729676246643, "logits/rejected": -0.8529733419418335, "logps/chosen": -469.6875305175781, "logps/rejected": -517.5676879882812, "loss": 0.6488, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5424898862838745, "rewards/margins": 0.19880425930023193, "rewards/rejected": -0.7412941455841064, "step": 202 }, { "epoch": 0.13258225161237652, "grad_norm": 6.565903567595678, "learning_rate": 1.993757742474059e-07, "logits/chosen": -0.8626876473426819, "logits/rejected": -0.8197176456451416, "logps/chosen": -515.7911376953125, "logps/rejected": -539.673583984375, "loss": 0.6154, "rewards/accuracies": 0.75, "rewards/chosen": -0.5579182505607605, "rewards/margins": 0.15692903101444244, "rewards/rejected": -0.7148473262786865, "step": 203 }, { "epoch": 0.13323536615233897, "grad_norm": 6.898446368005499, "learning_rate": 1.993500635758554e-07, "logits/chosen": -0.8769382834434509, "logits/rejected": -0.8424308896064758, "logps/chosen": -547.3653564453125, "logps/rejected": -543.1265869140625, "loss": 0.6247, "rewards/accuracies": 0.75, "rewards/chosen": -0.6659564971923828, "rewards/margins": 0.1936778426170349, "rewards/rejected": -0.8596343994140625, "step": 204 }, { "epoch": 0.13388848069230141, "grad_norm": 7.35385978193956, "learning_rate": 1.9932383577419428e-07, "logits/chosen": -0.8527657985687256, "logits/rejected": -0.8732788562774658, "logps/chosen": -515.5074462890625, "logps/rejected": -542.03515625, "loss": 0.6355, "rewards/accuracies": 0.75, "rewards/chosen": -0.5365366339683533, "rewards/margins": 0.19824588298797607, "rewards/rejected": -0.7347825169563293, "step": 205 }, { "epoch": 0.13454159523226386, "grad_norm": 7.030648984280733, "learning_rate": 1.992970909789418e-07, "logits/chosen": -0.957828164100647, "logits/rejected": -0.8857869505882263, "logps/chosen": -509.6930236816406, "logps/rejected": -477.85186767578125, "loss": 0.6093, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5899068713188171, "rewards/margins": 0.13009968400001526, "rewards/rejected": -0.7200065851211548, "step": 206 }, { "epoch": 0.1351947097722263, "grad_norm": 6.950088867136282, "learning_rate": 1.9926982932930807e-07, "logits/chosen": -1.0200141668319702, "logits/rejected": -1.0256128311157227, "logps/chosen": -581.5166625976562, "logps/rejected": -643.7260131835938, "loss": 0.6331, "rewards/accuracies": 0.75, "rewards/chosen": -0.6091457605361938, "rewards/margins": 0.21974962949752808, "rewards/rejected": -0.8288955092430115, "step": 207 }, { "epoch": 0.13584782431218875, "grad_norm": 7.122577238645886, "learning_rate": 1.9924205096719357e-07, "logits/chosen": -0.8400067090988159, "logits/rejected": -0.8682948350906372, "logps/chosen": -515.2028198242188, "logps/rejected": -556.894775390625, "loss": 0.6278, "rewards/accuracies": 0.75, "rewards/chosen": -0.6332363486289978, "rewards/margins": 0.19957150518894196, "rewards/rejected": -0.8328077793121338, "step": 208 }, { "epoch": 0.1365009388521512, "grad_norm": 6.576309079334382, "learning_rate": 1.992137560371883e-07, "logits/chosen": -0.9659208655357361, "logits/rejected": -0.9228329062461853, "logps/chosen": -523.8739624023438, "logps/rejected": -537.5508422851562, "loss": 0.6493, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7064003944396973, "rewards/margins": 0.04580266401171684, "rewards/rejected": -0.7522029280662537, "step": 209 }, { "epoch": 0.13715405339211365, "grad_norm": 6.973323990656964, "learning_rate": 1.991849446865711e-07, "logits/chosen": -0.8906298875808716, "logits/rejected": -0.8565115332603455, "logps/chosen": -530.02197265625, "logps/rejected": -524.1921997070312, "loss": 0.6055, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6586757898330688, "rewards/margins": 0.16047915816307068, "rewards/rejected": -0.8191549181938171, "step": 210 }, { "epoch": 0.1378071679320761, "grad_norm": 6.816468269528872, "learning_rate": 1.991556170653088e-07, "logits/chosen": -0.8264785408973694, "logits/rejected": -0.7743552923202515, "logps/chosen": -635.3679809570312, "logps/rejected": -661.7918090820312, "loss": 0.5945, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8539211750030518, "rewards/margins": 0.3885781168937683, "rewards/rejected": -1.2424993515014648, "step": 211 }, { "epoch": 0.13846028247203854, "grad_norm": 6.929590404216663, "learning_rate": 1.9912577332605557e-07, "logits/chosen": -1.066705346107483, "logits/rejected": -1.0079761743545532, "logps/chosen": -534.2091064453125, "logps/rejected": -533.3729858398438, "loss": 0.6477, "rewards/accuracies": 0.625, "rewards/chosen": -0.62717205286026, "rewards/margins": 0.10539636760950089, "rewards/rejected": -0.7325683832168579, "step": 212 }, { "epoch": 0.13911339701200098, "grad_norm": 7.150556554578528, "learning_rate": 1.990954136241519e-07, "logits/chosen": -0.8896300196647644, "logits/rejected": -0.860119104385376, "logps/chosen": -561.0973510742188, "logps/rejected": -588.75146484375, "loss": 0.5929, "rewards/accuracies": 0.75, "rewards/chosen": -0.7131221890449524, "rewards/margins": 0.23361392319202423, "rewards/rejected": -0.9467360377311707, "step": 213 }, { "epoch": 0.13976651155196343, "grad_norm": 7.459196518999915, "learning_rate": 1.9906453811762414e-07, "logits/chosen": -0.9893519878387451, "logits/rejected": -0.972107470035553, "logps/chosen": -550.1785278320312, "logps/rejected": -562.7999877929688, "loss": 0.6273, "rewards/accuracies": 0.65625, "rewards/chosen": -0.7028112411499023, "rewards/margins": 0.19713109731674194, "rewards/rejected": -0.8999423384666443, "step": 214 }, { "epoch": 0.14041962609192588, "grad_norm": 7.006968091209495, "learning_rate": 1.9903314696718323e-07, "logits/chosen": -0.9318097233772278, "logits/rejected": -0.9505141973495483, "logps/chosen": -491.9817810058594, "logps/rejected": -514.4917602539062, "loss": 0.6164, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6546629667282104, "rewards/margins": 0.14145053923130035, "rewards/rejected": -0.7961135506629944, "step": 215 }, { "epoch": 0.14107274063188832, "grad_norm": 6.927856564940898, "learning_rate": 1.990012403362243e-07, "logits/chosen": -0.8761850595474243, "logits/rejected": -0.8285849094390869, "logps/chosen": -573.4921264648438, "logps/rejected": -563.6383056640625, "loss": 0.6095, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8476728200912476, "rewards/margins": 0.14320024847984314, "rewards/rejected": -0.9908730387687683, "step": 216 }, { "epoch": 0.14172585517185077, "grad_norm": 8.058961160191211, "learning_rate": 1.9896881839082554e-07, "logits/chosen": -0.9572507739067078, "logits/rejected": -0.8944281339645386, "logps/chosen": -646.2454223632812, "logps/rejected": -608.398681640625, "loss": 0.5914, "rewards/accuracies": 0.625, "rewards/chosen": -0.9121346473693848, "rewards/margins": 0.20039290189743042, "rewards/rejected": -1.1125273704528809, "step": 217 }, { "epoch": 0.1423789697118132, "grad_norm": 7.091499089560837, "learning_rate": 1.9893588129974738e-07, "logits/chosen": -1.0472147464752197, "logits/rejected": -1.029089093208313, "logps/chosen": -622.0521240234375, "logps/rejected": -670.965576171875, "loss": 0.604, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8638104796409607, "rewards/margins": 0.325113445520401, "rewards/rejected": -1.1889238357543945, "step": 218 }, { "epoch": 0.14303208425177566, "grad_norm": 7.315033939603737, "learning_rate": 1.9890242923443176e-07, "logits/chosen": -0.9592210650444031, "logits/rejected": -0.922424852848053, "logps/chosen": -566.4813232421875, "logps/rejected": -592.270751953125, "loss": 0.5941, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8809952139854431, "rewards/margins": 0.32125452160835266, "rewards/rejected": -1.2022497653961182, "step": 219 }, { "epoch": 0.1436851987917381, "grad_norm": 7.526883226909424, "learning_rate": 1.9886846236900102e-07, "logits/chosen": -0.9030224680900574, "logits/rejected": -0.8189893364906311, "logps/chosen": -543.3269653320312, "logps/rejected": -526.5144653320312, "loss": 0.6133, "rewards/accuracies": 0.65625, "rewards/chosen": -0.9240608811378479, "rewards/margins": 0.1997796893119812, "rewards/rejected": -1.123840570449829, "step": 220 }, { "epoch": 0.14433831333170055, "grad_norm": 7.141612907726172, "learning_rate": 1.9883398088025718e-07, "logits/chosen": -0.9949901103973389, "logits/rejected": -0.9826943278312683, "logps/chosen": -547.9266357421875, "logps/rejected": -558.5771484375, "loss": 0.6015, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8585708141326904, "rewards/margins": 0.33034569025039673, "rewards/rejected": -1.188916563987732, "step": 221 }, { "epoch": 0.144991427871663, "grad_norm": 8.244844325347787, "learning_rate": 1.987989849476809e-07, "logits/chosen": -0.9119670391082764, "logits/rejected": -0.8896617293357849, "logps/chosen": -560.7728271484375, "logps/rejected": -550.8117065429688, "loss": 0.6158, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9433919191360474, "rewards/margins": 0.2703930735588074, "rewards/rejected": -1.2137850522994995, "step": 222 }, { "epoch": 0.14564454241162544, "grad_norm": 8.841979767709514, "learning_rate": 1.9876347475343058e-07, "logits/chosen": -1.0196629762649536, "logits/rejected": -1.0518569946289062, "logps/chosen": -581.1697387695312, "logps/rejected": -607.8892211914062, "loss": 0.5911, "rewards/accuracies": 0.59375, "rewards/chosen": -1.0797791481018066, "rewards/margins": 0.3377801179885864, "rewards/rejected": -1.4175591468811035, "step": 223 }, { "epoch": 0.1462976569515879, "grad_norm": 7.111657119116627, "learning_rate": 1.9872745048234148e-07, "logits/chosen": -0.8659726977348328, "logits/rejected": -0.8837684392929077, "logps/chosen": -644.4042358398438, "logps/rejected": -690.21435546875, "loss": 0.5954, "rewards/accuracies": 0.75, "rewards/chosen": -1.214494228363037, "rewards/margins": 0.4262455105781555, "rewards/rejected": -1.6407395601272583, "step": 224 }, { "epoch": 0.14695077149155034, "grad_norm": 10.605087808773934, "learning_rate": 1.9869091232192463e-07, "logits/chosen": -0.9252822995185852, "logits/rejected": -0.9206048250198364, "logps/chosen": -576.3617553710938, "logps/rejected": -601.0115356445312, "loss": 0.6156, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0873240232467651, "rewards/margins": 0.27771976590156555, "rewards/rejected": -1.3650437593460083, "step": 225 }, { "epoch": 0.14760388603151278, "grad_norm": 9.084674235016786, "learning_rate": 1.9865386046236595e-07, "logits/chosen": -0.968360424041748, "logits/rejected": -0.9349038600921631, "logps/chosen": -588.273681640625, "logps/rejected": -619.4586791992188, "loss": 0.6567, "rewards/accuracies": 0.71875, "rewards/chosen": -1.0742963552474976, "rewards/margins": 0.31582218408584595, "rewards/rejected": -1.3901184797286987, "step": 226 }, { "epoch": 0.14825700057147523, "grad_norm": 7.5679079102766975, "learning_rate": 1.9861629509652522e-07, "logits/chosen": -0.897240161895752, "logits/rejected": -0.8546115159988403, "logps/chosen": -556.501708984375, "logps/rejected": -554.8712158203125, "loss": 0.6248, "rewards/accuracies": 0.65625, "rewards/chosen": -1.0640370845794678, "rewards/margins": 0.20565065741539001, "rewards/rejected": -1.2696877717971802, "step": 227 }, { "epoch": 0.14891011511143767, "grad_norm": 7.498591308619216, "learning_rate": 1.985782164199351e-07, "logits/chosen": -0.9917216300964355, "logits/rejected": -0.9834437370300293, "logps/chosen": -541.1337890625, "logps/rejected": -572.7382202148438, "loss": 0.6082, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9065245985984802, "rewards/margins": 0.3573816418647766, "rewards/rejected": -1.2639062404632568, "step": 228 }, { "epoch": 0.14956322965140012, "grad_norm": 8.030664967261915, "learning_rate": 1.9853962463080012e-07, "logits/chosen": -1.0200045108795166, "logits/rejected": -0.9756948351860046, "logps/chosen": -598.68896484375, "logps/rejected": -618.638916015625, "loss": 0.6075, "rewards/accuracies": 0.71875, "rewards/chosen": -1.1298401355743408, "rewards/margins": 0.4494876265525818, "rewards/rejected": -1.5793277025222778, "step": 229 }, { "epoch": 0.15021634419136257, "grad_norm": 8.61826945907326, "learning_rate": 1.9850051992999558e-07, "logits/chosen": -0.9880169630050659, "logits/rejected": -0.9926092624664307, "logps/chosen": -639.80810546875, "logps/rejected": -766.6765747070312, "loss": 0.5953, "rewards/accuracies": 0.75, "rewards/chosen": -1.3067864179611206, "rewards/margins": 0.631999135017395, "rewards/rejected": -1.9387855529785156, "step": 230 }, { "epoch": 0.150869458731325, "grad_norm": 7.108840046005145, "learning_rate": 1.9846090252106657e-07, "logits/chosen": -0.9846518635749817, "logits/rejected": -1.0040161609649658, "logps/chosen": -643.1053466796875, "logps/rejected": -744.1202392578125, "loss": 0.5933, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3677211999893188, "rewards/margins": 0.5599484443664551, "rewards/rejected": -1.9276697635650635, "step": 231 }, { "epoch": 0.15152257327128746, "grad_norm": 7.40371296978151, "learning_rate": 1.9842077261022688e-07, "logits/chosen": -0.8514267802238464, "logits/rejected": -0.9118195176124573, "logps/chosen": -563.9459838867188, "logps/rejected": -678.0374145507812, "loss": 0.6064, "rewards/accuracies": 0.625, "rewards/chosen": -1.1446633338928223, "rewards/margins": 0.520073652267456, "rewards/rejected": -1.6647369861602783, "step": 232 }, { "epoch": 0.1521756878112499, "grad_norm": 7.490174879543239, "learning_rate": 1.9838013040635805e-07, "logits/chosen": -0.9286482334136963, "logits/rejected": -0.831351637840271, "logps/chosen": -589.9814453125, "logps/rejected": -574.0662231445312, "loss": 0.5966, "rewards/accuracies": 0.75, "rewards/chosen": -1.133133053779602, "rewards/margins": 0.36746925115585327, "rewards/rejected": -1.5006022453308105, "step": 233 }, { "epoch": 0.15282880235121235, "grad_norm": 7.601651007846018, "learning_rate": 1.9833897612100798e-07, "logits/chosen": -0.9623463749885559, "logits/rejected": -0.9710357785224915, "logps/chosen": -609.2854614257812, "logps/rejected": -691.9542236328125, "loss": 0.5545, "rewards/accuracies": 0.75, "rewards/chosen": -1.1702455282211304, "rewards/margins": 0.5435682535171509, "rewards/rejected": -1.7138137817382812, "step": 234 }, { "epoch": 0.1534819168911748, "grad_norm": 7.172747159619703, "learning_rate": 1.982973099683902e-07, "logits/chosen": -1.0496864318847656, "logits/rejected": -1.0496934652328491, "logps/chosen": -536.92333984375, "logps/rejected": -569.0079345703125, "loss": 0.5691, "rewards/accuracies": 0.71875, "rewards/chosen": -1.185002088546753, "rewards/margins": 0.29621896147727966, "rewards/rejected": -1.4812211990356445, "step": 235 }, { "epoch": 0.15413503143113724, "grad_norm": 7.8826748034616845, "learning_rate": 1.982551321653824e-07, "logits/chosen": -1.0674985647201538, "logits/rejected": -1.0483006238937378, "logps/chosen": -655.8088989257812, "logps/rejected": -660.1513671875, "loss": 0.6004, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3943867683410645, "rewards/margins": 0.18238916993141174, "rewards/rejected": -1.5767759084701538, "step": 236 }, { "epoch": 0.1547881459710997, "grad_norm": 8.758377848112476, "learning_rate": 1.982124429315257e-07, "logits/chosen": -0.9774172306060791, "logits/rejected": -0.9349012970924377, "logps/chosen": -590.2391357421875, "logps/rejected": -650.866943359375, "loss": 0.5894, "rewards/accuracies": 0.71875, "rewards/chosen": -1.2001911401748657, "rewards/margins": 0.6267807483673096, "rewards/rejected": -1.8269720077514648, "step": 237 }, { "epoch": 0.15544126051106213, "grad_norm": 6.943165336635335, "learning_rate": 1.9816924248902302e-07, "logits/chosen": -0.9072751998901367, "logits/rejected": -0.8869557976722717, "logps/chosen": -595.5549926757812, "logps/rejected": -593.1593017578125, "loss": 0.6123, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2035523653030396, "rewards/margins": 0.13962620496749878, "rewards/rejected": -1.343178629875183, "step": 238 }, { "epoch": 0.15609437505102458, "grad_norm": 8.698887817875539, "learning_rate": 1.9812553106273846e-07, "logits/chosen": -0.9801173806190491, "logits/rejected": -0.9282537698745728, "logps/chosen": -623.5844116210938, "logps/rejected": -658.0555419921875, "loss": 0.5646, "rewards/accuracies": 0.8125, "rewards/chosen": -1.3153058290481567, "rewards/margins": 0.47327354550361633, "rewards/rejected": -1.7885793447494507, "step": 239 }, { "epoch": 0.15674748959098703, "grad_norm": 7.1608426664869755, "learning_rate": 1.9808130888019568e-07, "logits/chosen": -0.9460001587867737, "logits/rejected": -0.9269174337387085, "logps/chosen": -674.88134765625, "logps/rejected": -838.9345092773438, "loss": 0.5693, "rewards/accuracies": 0.78125, "rewards/chosen": -1.4076124429702759, "rewards/margins": 0.9736231565475464, "rewards/rejected": -2.3812355995178223, "step": 240 }, { "epoch": 0.15740060413094947, "grad_norm": 8.96596264367979, "learning_rate": 1.9803657617157689e-07, "logits/chosen": -0.9591872692108154, "logits/rejected": -0.9243131875991821, "logps/chosen": -737.2781372070312, "logps/rejected": -727.5070190429688, "loss": 0.6155, "rewards/accuracies": 0.625, "rewards/chosen": -1.726030707359314, "rewards/margins": 0.2121991366147995, "rewards/rejected": -1.9382299184799194, "step": 241 }, { "epoch": 0.15805371867091192, "grad_norm": 9.023241556895329, "learning_rate": 1.979913331697218e-07, "logits/chosen": -1.0456653833389282, "logits/rejected": -0.9968949556350708, "logps/chosen": -675.23974609375, "logps/rejected": -667.6704711914062, "loss": 0.5662, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3876211643218994, "rewards/margins": 0.3616403341293335, "rewards/rejected": -1.749261498451233, "step": 242 }, { "epoch": 0.15870683321087437, "grad_norm": 7.43494665793943, "learning_rate": 1.9794558011012607e-07, "logits/chosen": -0.8788321614265442, "logits/rejected": -0.8721754550933838, "logps/chosen": -537.51904296875, "logps/rejected": -576.8412475585938, "loss": 0.5945, "rewards/accuracies": 0.6875, "rewards/chosen": -1.1825263500213623, "rewards/margins": 0.27157512307167053, "rewards/rejected": -1.4541014432907104, "step": 243 }, { "epoch": 0.1593599477508368, "grad_norm": 7.341841973123469, "learning_rate": 1.9789931723094044e-07, "logits/chosen": -1.0291013717651367, "logits/rejected": -0.9565322399139404, "logps/chosen": -612.6153564453125, "logps/rejected": -646.62646484375, "loss": 0.58, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5041933059692383, "rewards/margins": 0.3266003727912903, "rewards/rejected": -1.8307936191558838, "step": 244 }, { "epoch": 0.16001306229079926, "grad_norm": 6.979064593900016, "learning_rate": 1.9785254477296926e-07, "logits/chosen": -0.9677096009254456, "logits/rejected": -0.9133827686309814, "logps/chosen": -586.70849609375, "logps/rejected": -601.3690185546875, "loss": 0.5816, "rewards/accuracies": 0.71875, "rewards/chosen": -1.4074910879135132, "rewards/margins": 0.4098448157310486, "rewards/rejected": -1.817335844039917, "step": 245 }, { "epoch": 0.1606661768307617, "grad_norm": 7.344334329816908, "learning_rate": 1.978052629796693e-07, "logits/chosen": -0.9641053080558777, "logits/rejected": -0.9103111028671265, "logps/chosen": -622.2855834960938, "logps/rejected": -630.8062133789062, "loss": 0.6178, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5755188465118408, "rewards/margins": 0.21382541954517365, "rewards/rejected": -1.7893444299697876, "step": 246 }, { "epoch": 0.16131929137072415, "grad_norm": 7.615942815150963, "learning_rate": 1.9775747209714844e-07, "logits/chosen": -0.9888575077056885, "logits/rejected": -0.9431308507919312, "logps/chosen": -653.2548217773438, "logps/rejected": -701.7452392578125, "loss": 0.5962, "rewards/accuracies": 0.6875, "rewards/chosen": -1.7049720287322998, "rewards/margins": 0.26299968361854553, "rewards/rejected": -1.9679718017578125, "step": 247 }, { "epoch": 0.1619724059106866, "grad_norm": 7.121943628659046, "learning_rate": 1.9770917237416458e-07, "logits/chosen": -0.943242073059082, "logits/rejected": -0.956061601638794, "logps/chosen": -577.7714233398438, "logps/rejected": -563.3513793945312, "loss": 0.6032, "rewards/accuracies": 0.65625, "rewards/chosen": -1.5149903297424316, "rewards/margins": 0.162530779838562, "rewards/rejected": -1.6775211095809937, "step": 248 }, { "epoch": 0.16262552045064904, "grad_norm": 7.460563232555389, "learning_rate": 1.9766036406212402e-07, "logits/chosen": -0.8586480617523193, "logits/rejected": -0.7898170948028564, "logps/chosen": -577.0723876953125, "logps/rejected": -589.3902587890625, "loss": 0.5677, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3534388542175293, "rewards/margins": 0.28514593839645386, "rewards/rejected": -1.638584852218628, "step": 249 }, { "epoch": 0.1632786349906115, "grad_norm": 8.27330500287747, "learning_rate": 1.9761104741508055e-07, "logits/chosen": -0.859459400177002, "logits/rejected": -0.8584420680999756, "logps/chosen": -660.7594604492188, "logps/rejected": -681.0894165039062, "loss": 0.604, "rewards/accuracies": 0.65625, "rewards/chosen": -1.583940029144287, "rewards/margins": 0.39344677329063416, "rewards/rejected": -1.9773868322372437, "step": 250 }, { "epoch": 0.16393174953057393, "grad_norm": 7.994745849819422, "learning_rate": 1.9756122268973368e-07, "logits/chosen": -1.1016628742218018, "logits/rejected": -1.0982387065887451, "logps/chosen": -659.254638671875, "logps/rejected": -722.5975952148438, "loss": 0.5555, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6833679676055908, "rewards/margins": 0.5387370586395264, "rewards/rejected": -2.2221052646636963, "step": 251 }, { "epoch": 0.16458486407053638, "grad_norm": 8.509706028882437, "learning_rate": 1.9751089014542767e-07, "logits/chosen": -1.010880947113037, "logits/rejected": -1.0128508806228638, "logps/chosen": -672.9129638671875, "logps/rejected": -704.9036254882812, "loss": 0.6262, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5938973426818848, "rewards/margins": 0.4070626199245453, "rewards/rejected": -2.000959873199463, "step": 252 }, { "epoch": 0.16523797861049883, "grad_norm": 8.068629943543698, "learning_rate": 1.9746005004415002e-07, "logits/chosen": -0.955176055431366, "logits/rejected": -0.9755375981330872, "logps/chosen": -651.1551513671875, "logps/rejected": -893.3212280273438, "loss": 0.5303, "rewards/accuracies": 0.78125, "rewards/chosen": -1.529944658279419, "rewards/margins": 1.257524013519287, "rewards/rejected": -2.787468910217285, "step": 253 }, { "epoch": 0.16589109315046127, "grad_norm": 10.311573848019473, "learning_rate": 1.9740870265053011e-07, "logits/chosen": -0.9392127990722656, "logits/rejected": -0.9196099042892456, "logps/chosen": -640.4437866210938, "logps/rejected": -758.6360473632812, "loss": 0.597, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6619254350662231, "rewards/margins": 0.6193193793296814, "rewards/rejected": -2.2812447547912598, "step": 254 }, { "epoch": 0.16654420769042372, "grad_norm": 7.676564230554778, "learning_rate": 1.9735684823183786e-07, "logits/chosen": -1.03840970993042, "logits/rejected": -0.9937460422515869, "logps/chosen": -608.6917724609375, "logps/rejected": -635.8554077148438, "loss": 0.5846, "rewards/accuracies": 0.71875, "rewards/chosen": -1.6807466745376587, "rewards/margins": 0.34590983390808105, "rewards/rejected": -2.02665638923645, "step": 255 }, { "epoch": 0.16719732223038616, "grad_norm": 7.630789747584126, "learning_rate": 1.9730448705798237e-07, "logits/chosen": -0.9532262086868286, "logits/rejected": -0.8471428155899048, "logps/chosen": -715.9351806640625, "logps/rejected": -676.1060180664062, "loss": 0.5836, "rewards/accuracies": 0.71875, "rewards/chosen": -1.81856369972229, "rewards/margins": 0.22772595286369324, "rewards/rejected": -2.0462896823883057, "step": 256 }, { "epoch": 0.1678504367703486, "grad_norm": 7.996225765356348, "learning_rate": 1.9725161940151037e-07, "logits/chosen": -1.0379149913787842, "logits/rejected": -0.9661443829536438, "logps/chosen": -589.5501098632812, "logps/rejected": -574.7515869140625, "loss": 0.5762, "rewards/accuracies": 0.78125, "rewards/chosen": -1.6499119997024536, "rewards/margins": 0.21118004620075226, "rewards/rejected": -1.8610920906066895, "step": 257 }, { "epoch": 0.16850355131031106, "grad_norm": 7.457105463708072, "learning_rate": 1.9719824553760493e-07, "logits/chosen": -0.976207435131073, "logits/rejected": -0.939882218837738, "logps/chosen": -630.826171875, "logps/rejected": -711.6494750976562, "loss": 0.5239, "rewards/accuracies": 0.75, "rewards/chosen": -1.785365104675293, "rewards/margins": 0.5453168153762817, "rewards/rejected": -2.330681800842285, "step": 258 }, { "epoch": 0.1691566658502735, "grad_norm": 7.10936396957504, "learning_rate": 1.9714436574408404e-07, "logits/chosen": -1.0234023332595825, "logits/rejected": -1.0151737928390503, "logps/chosen": -654.0241088867188, "logps/rejected": -693.206298828125, "loss": 0.5556, "rewards/accuracies": 0.84375, "rewards/chosen": -1.698794960975647, "rewards/margins": 0.47481226921081543, "rewards/rejected": -2.173607349395752, "step": 259 }, { "epoch": 0.16980978039023595, "grad_norm": 12.120889100254757, "learning_rate": 1.970899803013991e-07, "logits/chosen": -0.9613388776779175, "logits/rejected": -0.9475601315498352, "logps/chosen": -721.533203125, "logps/rejected": -764.3331909179688, "loss": 0.6242, "rewards/accuracies": 0.65625, "rewards/chosen": -2.249241352081299, "rewards/margins": 0.5473915338516235, "rewards/rejected": -2.796632766723633, "step": 260 }, { "epoch": 0.1704628949301984, "grad_norm": 8.375507399375959, "learning_rate": 1.9703508949263343e-07, "logits/chosen": -0.9917237758636475, "logits/rejected": -0.974717915058136, "logps/chosen": -657.542724609375, "logps/rejected": -657.2991333007812, "loss": 0.5991, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8568816184997559, "rewards/margins": 0.41493868827819824, "rewards/rejected": -2.271820068359375, "step": 261 }, { "epoch": 0.17111600947016084, "grad_norm": 8.620447456913936, "learning_rate": 1.9697969360350095e-07, "logits/chosen": -0.9772311449050903, "logits/rejected": -0.9717585444450378, "logps/chosen": -563.0853271484375, "logps/rejected": -628.078125, "loss": 0.5931, "rewards/accuracies": 0.59375, "rewards/chosen": -1.7189618349075317, "rewards/margins": 0.3262416422367096, "rewards/rejected": -2.045203447341919, "step": 262 }, { "epoch": 0.1717691240101233, "grad_norm": 8.622090097281658, "learning_rate": 1.9692379292234446e-07, "logits/chosen": -1.1292492151260376, "logits/rejected": -1.065726399421692, "logps/chosen": -713.7160034179688, "logps/rejected": -700.1644287109375, "loss": 0.5657, "rewards/accuracies": 0.71875, "rewards/chosen": -1.874625563621521, "rewards/margins": 0.375280499458313, "rewards/rejected": -2.249905824661255, "step": 263 }, { "epoch": 0.17242223855008573, "grad_norm": 8.32964680659536, "learning_rate": 1.9686738774013438e-07, "logits/chosen": -0.9963126182556152, "logits/rejected": -0.9847568273544312, "logps/chosen": -616.8109130859375, "logps/rejected": -623.2798461914062, "loss": 0.5323, "rewards/accuracies": 0.625, "rewards/chosen": -1.5587760210037231, "rewards/margins": 0.21900975704193115, "rewards/rejected": -1.7777857780456543, "step": 264 }, { "epoch": 0.17307535309004818, "grad_norm": 6.771586579633424, "learning_rate": 1.9681047835046707e-07, "logits/chosen": -1.026088833808899, "logits/rejected": -0.977704644203186, "logps/chosen": -680.363525390625, "logps/rejected": -708.6707153320312, "loss": 0.5194, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8157883882522583, "rewards/margins": 0.44769570231437683, "rewards/rejected": -2.263484001159668, "step": 265 }, { "epoch": 0.17372846763001062, "grad_norm": 7.5583512580487495, "learning_rate": 1.9675306504956338e-07, "logits/chosen": -0.9970998167991638, "logits/rejected": -0.9781684279441833, "logps/chosen": -659.970947265625, "logps/rejected": -856.7073974609375, "loss": 0.5688, "rewards/accuracies": 0.8125, "rewards/chosen": -1.790527582168579, "rewards/margins": 1.0892386436462402, "rewards/rejected": -2.8797662258148193, "step": 266 }, { "epoch": 0.17438158216997307, "grad_norm": 9.147249940817659, "learning_rate": 1.9669514813626704e-07, "logits/chosen": -0.8966047167778015, "logits/rejected": -0.8831186890602112, "logps/chosen": -675.2528686523438, "logps/rejected": -670.5457763671875, "loss": 0.5648, "rewards/accuracies": 0.78125, "rewards/chosen": -1.9768502712249756, "rewards/margins": 0.2957232892513275, "rewards/rejected": -2.272573471069336, "step": 267 }, { "epoch": 0.17503469670993552, "grad_norm": 7.702705924911228, "learning_rate": 1.9663672791204323e-07, "logits/chosen": -1.0310410261154175, "logits/rejected": -0.9511358141899109, "logps/chosen": -653.04296875, "logps/rejected": -671.1371459960938, "loss": 0.5649, "rewards/accuracies": 0.625, "rewards/chosen": -1.9696815013885498, "rewards/margins": 0.4211324155330658, "rewards/rejected": -2.3908138275146484, "step": 268 }, { "epoch": 0.17568781124989796, "grad_norm": 7.604828455223188, "learning_rate": 1.9657780468097683e-07, "logits/chosen": -0.9046274423599243, "logits/rejected": -0.8803516030311584, "logps/chosen": -581.923828125, "logps/rejected": -623.5220947265625, "loss": 0.5438, "rewards/accuracies": 0.78125, "rewards/chosen": -1.5920655727386475, "rewards/margins": 0.4273078441619873, "rewards/rejected": -2.0193734169006348, "step": 269 }, { "epoch": 0.1763409257898604, "grad_norm": 9.076618915096422, "learning_rate": 1.96518378749771e-07, "logits/chosen": -0.9741644859313965, "logits/rejected": -0.9750419855117798, "logps/chosen": -733.3799438476562, "logps/rejected": -751.2376708984375, "loss": 0.6095, "rewards/accuracies": 0.6875, "rewards/chosen": -2.264136552810669, "rewards/margins": 0.37653613090515137, "rewards/rejected": -2.6406726837158203, "step": 270 }, { "epoch": 0.17699404032982285, "grad_norm": 9.37938779675179, "learning_rate": 1.964584504277455e-07, "logits/chosen": -1.0458801984786987, "logits/rejected": -0.9988090395927429, "logps/chosen": -671.9390258789062, "logps/rejected": -693.88818359375, "loss": 0.5901, "rewards/accuracies": 0.75, "rewards/chosen": -1.9275598526000977, "rewards/margins": 0.33395007252693176, "rewards/rejected": -2.261509895324707, "step": 271 }, { "epoch": 0.1776471548697853, "grad_norm": 7.552555225756131, "learning_rate": 1.9639802002683514e-07, "logits/chosen": -0.946459949016571, "logits/rejected": -0.892015278339386, "logps/chosen": -660.0249633789062, "logps/rejected": -649.5670166015625, "loss": 0.5484, "rewards/accuracies": 0.75, "rewards/chosen": -1.8262689113616943, "rewards/margins": 0.5144949555397034, "rewards/rejected": -2.340763568878174, "step": 272 }, { "epoch": 0.17830026940974775, "grad_norm": 8.342988224658326, "learning_rate": 1.9633708786158803e-07, "logits/chosen": -0.973565936088562, "logits/rejected": -0.9508368968963623, "logps/chosen": -704.4890747070312, "logps/rejected": -710.4368896484375, "loss": 0.5307, "rewards/accuracies": 0.8125, "rewards/chosen": -2.024822950363159, "rewards/margins": 0.5465009212493896, "rewards/rejected": -2.571324110031128, "step": 273 }, { "epoch": 0.1789533839497102, "grad_norm": 8.120756554883108, "learning_rate": 1.962756542491641e-07, "logits/chosen": -0.8403066396713257, "logits/rejected": -0.8492639660835266, "logps/chosen": -621.307373046875, "logps/rejected": -661.5020141601562, "loss": 0.5477, "rewards/accuracies": 0.75, "rewards/chosen": -1.9450585842132568, "rewards/margins": 0.508897066116333, "rewards/rejected": -2.4539551734924316, "step": 274 }, { "epoch": 0.17960649848967264, "grad_norm": 8.2187266045248, "learning_rate": 1.962137195093334e-07, "logits/chosen": -0.9613364934921265, "logits/rejected": -0.938071608543396, "logps/chosen": -661.04052734375, "logps/rejected": -709.01171875, "loss": 0.57, "rewards/accuracies": 0.75, "rewards/chosen": -2.1429219245910645, "rewards/margins": 0.5459993481636047, "rewards/rejected": -2.6889214515686035, "step": 275 }, { "epoch": 0.18025961302963509, "grad_norm": 7.73778880473398, "learning_rate": 1.9615128396447432e-07, "logits/chosen": -0.9699392318725586, "logits/rejected": -0.9538300633430481, "logps/chosen": -690.2393798828125, "logps/rejected": -724.1671142578125, "loss": 0.5256, "rewards/accuracies": 0.6875, "rewards/chosen": -2.015857696533203, "rewards/margins": 0.4910595118999481, "rewards/rejected": -2.5069172382354736, "step": 276 }, { "epoch": 0.18091272756959753, "grad_norm": 7.911509339012901, "learning_rate": 1.9608834793957208e-07, "logits/chosen": -0.9603309631347656, "logits/rejected": -0.9850507378578186, "logps/chosen": -765.2130126953125, "logps/rejected": -842.7460327148438, "loss": 0.5647, "rewards/accuracies": 0.84375, "rewards/chosen": -2.3112220764160156, "rewards/margins": 0.5749973654747009, "rewards/rejected": -2.8862195014953613, "step": 277 }, { "epoch": 0.18156584210955998, "grad_norm": 8.967748933935127, "learning_rate": 1.9602491176221695e-07, "logits/chosen": -0.9512305855751038, "logits/rejected": -0.9300289154052734, "logps/chosen": -670.3013305664062, "logps/rejected": -727.5516357421875, "loss": 0.5245, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9914320707321167, "rewards/margins": 0.5332896113395691, "rewards/rejected": -2.524721622467041, "step": 278 }, { "epoch": 0.18221895664952242, "grad_norm": 8.734148197725288, "learning_rate": 1.9596097576260253e-07, "logits/chosen": -0.9387493133544922, "logits/rejected": -0.9077507853507996, "logps/chosen": -765.1206665039062, "logps/rejected": -880.9041137695312, "loss": 0.5773, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2102487087249756, "rewards/margins": 1.0287326574325562, "rewards/rejected": -3.238981246948242, "step": 279 }, { "epoch": 0.18287207118948487, "grad_norm": 8.15049441196506, "learning_rate": 1.9589654027352411e-07, "logits/chosen": -0.999761700630188, "logits/rejected": -0.9837722778320312, "logps/chosen": -709.5980224609375, "logps/rejected": -741.883544921875, "loss": 0.5495, "rewards/accuracies": 0.75, "rewards/chosen": -2.2854127883911133, "rewards/margins": 0.4614034593105316, "rewards/rejected": -2.746816396713257, "step": 280 }, { "epoch": 0.18352518572944732, "grad_norm": 10.6404743180184, "learning_rate": 1.9583160563037687e-07, "logits/chosen": -0.7266749143600464, "logits/rejected": -0.6888471841812134, "logps/chosen": -661.6748657226562, "logps/rejected": -709.5576171875, "loss": 0.5863, "rewards/accuracies": 0.78125, "rewards/chosen": -2.1588385105133057, "rewards/margins": 0.5338287353515625, "rewards/rejected": -2.692667245864868, "step": 281 }, { "epoch": 0.18417830026940976, "grad_norm": 9.00089638171536, "learning_rate": 1.957661721711541e-07, "logits/chosen": -0.8879116773605347, "logits/rejected": -0.8716956377029419, "logps/chosen": -693.7220458984375, "logps/rejected": -783.8416748046875, "loss": 0.6046, "rewards/accuracies": 0.625, "rewards/chosen": -2.3302977085113525, "rewards/margins": 0.6564316749572754, "rewards/rejected": -2.986729145050049, "step": 282 }, { "epoch": 0.18483141480937218, "grad_norm": 8.397472630305604, "learning_rate": 1.9570024023644555e-07, "logits/chosen": -0.9611495137214661, "logits/rejected": -0.9361308813095093, "logps/chosen": -715.2149047851562, "logps/rejected": -756.7767333984375, "loss": 0.5468, "rewards/accuracies": 0.84375, "rewards/chosen": -2.1140494346618652, "rewards/margins": 0.8066607117652893, "rewards/rejected": -2.9207100868225098, "step": 283 }, { "epoch": 0.18548452934933463, "grad_norm": 8.115486395877507, "learning_rate": 1.9563381016943562e-07, "logits/chosen": -0.9859124422073364, "logits/rejected": -0.9830190539360046, "logps/chosen": -775.6683959960938, "logps/rejected": -819.667724609375, "loss": 0.5704, "rewards/accuracies": 0.71875, "rewards/chosen": -2.2429869174957275, "rewards/margins": 0.4830154776573181, "rewards/rejected": -2.7260022163391113, "step": 284 }, { "epoch": 0.18613764388929707, "grad_norm": 9.071986066041832, "learning_rate": 1.9556688231590148e-07, "logits/chosen": -0.8561621904373169, "logits/rejected": -0.8536893129348755, "logps/chosen": -665.8601684570312, "logps/rejected": -814.3583374023438, "loss": 0.5968, "rewards/accuracies": 0.71875, "rewards/chosen": -2.13525652885437, "rewards/margins": 0.8098075985908508, "rewards/rejected": -2.945064067840576, "step": 285 }, { "epoch": 0.18679075842925952, "grad_norm": 9.258588546101418, "learning_rate": 1.9549945702421142e-07, "logits/chosen": -0.8817156553268433, "logits/rejected": -0.8830188512802124, "logps/chosen": -679.46044921875, "logps/rejected": -752.0320434570312, "loss": 0.537, "rewards/accuracies": 0.90625, "rewards/chosen": -2.2376911640167236, "rewards/margins": 0.5545761585235596, "rewards/rejected": -2.792267322540283, "step": 286 }, { "epoch": 0.18744387296922196, "grad_norm": 8.056969647323232, "learning_rate": 1.9543153464532288e-07, "logits/chosen": -0.7749789357185364, "logits/rejected": -0.794037401676178, "logps/chosen": -639.8425903320312, "logps/rejected": -680.3607788085938, "loss": 0.5139, "rewards/accuracies": 0.75, "rewards/chosen": -1.8591387271881104, "rewards/margins": 0.4303709864616394, "rewards/rejected": -2.2895097732543945, "step": 287 }, { "epoch": 0.1880969875091844, "grad_norm": 7.563536225052495, "learning_rate": 1.9536311553278083e-07, "logits/chosen": -0.9034614562988281, "logits/rejected": -0.8050072193145752, "logps/chosen": -599.6620483398438, "logps/rejected": -571.1824951171875, "loss": 0.5816, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0411486625671387, "rewards/margins": 0.15406599640846252, "rewards/rejected": -2.1952145099639893, "step": 288 }, { "epoch": 0.18875010204914686, "grad_norm": 8.136613787281412, "learning_rate": 1.9529420004271567e-07, "logits/chosen": -0.886310338973999, "logits/rejected": -0.9234850406646729, "logps/chosen": -753.2724609375, "logps/rejected": -823.5413208007812, "loss": 0.5271, "rewards/accuracies": 0.6875, "rewards/chosen": -2.702235221862793, "rewards/margins": 0.4669128954410553, "rewards/rejected": -3.1691482067108154, "step": 289 }, { "epoch": 0.1894032165891093, "grad_norm": 8.155433466668587, "learning_rate": 1.952247885338415e-07, "logits/chosen": -0.8842246532440186, "logits/rejected": -0.9555231332778931, "logps/chosen": -596.1199340820312, "logps/rejected": -699.538330078125, "loss": 0.5528, "rewards/accuracies": 0.8125, "rewards/chosen": -2.0811939239501953, "rewards/margins": 0.7108913660049438, "rewards/rejected": -2.7920849323272705, "step": 290 }, { "epoch": 0.19005633112907175, "grad_norm": 11.089638112757227, "learning_rate": 1.9515488136745445e-07, "logits/chosen": -0.9472813010215759, "logits/rejected": -0.8430065512657166, "logps/chosen": -725.784912109375, "logps/rejected": -711.8215942382812, "loss": 0.5915, "rewards/accuracies": 0.46875, "rewards/chosen": -2.388014793395996, "rewards/margins": 0.0675196647644043, "rewards/rejected": -2.4555346965789795, "step": 291 }, { "epoch": 0.1907094456690342, "grad_norm": 7.820461974964933, "learning_rate": 1.9508447890743046e-07, "logits/chosen": -0.9438467621803284, "logits/rejected": -0.9221634864807129, "logps/chosen": -731.6463012695312, "logps/rejected": -786.1243896484375, "loss": 0.508, "rewards/accuracies": 0.875, "rewards/chosen": -2.3206136226654053, "rewards/margins": 0.6845057010650635, "rewards/rejected": -3.0051193237304688, "step": 292 }, { "epoch": 0.19136256020899664, "grad_norm": 8.708040081311037, "learning_rate": 1.9501358152022349e-07, "logits/chosen": -0.926476240158081, "logits/rejected": -0.9502461552619934, "logps/chosen": -659.2603149414062, "logps/rejected": -810.6748046875, "loss": 0.5761, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1894261837005615, "rewards/margins": 0.9734176397323608, "rewards/rejected": -3.162843704223633, "step": 293 }, { "epoch": 0.1920156747489591, "grad_norm": 7.970091802437382, "learning_rate": 1.949421895748638e-07, "logits/chosen": -0.9131462574005127, "logits/rejected": -0.9216738939285278, "logps/chosen": -757.0573120117188, "logps/rejected": -837.2061767578125, "loss": 0.561, "rewards/accuracies": 0.8125, "rewards/chosen": -2.317478656768799, "rewards/margins": 0.5989239811897278, "rewards/rejected": -2.916402578353882, "step": 294 }, { "epoch": 0.19266878928892153, "grad_norm": 10.606953825436584, "learning_rate": 1.9487030344295584e-07, "logits/chosen": -0.9458974003791809, "logits/rejected": -0.911933183670044, "logps/chosen": -795.9569091796875, "logps/rejected": -945.5296020507812, "loss": 0.5062, "rewards/accuracies": 0.71875, "rewards/chosen": -2.4541730880737305, "rewards/margins": 0.7940478324890137, "rewards/rejected": -3.248220920562744, "step": 295 }, { "epoch": 0.19332190382888398, "grad_norm": 7.8610409749854675, "learning_rate": 1.947979234986763e-07, "logits/chosen": -1.084223985671997, "logits/rejected": -1.0780235528945923, "logps/chosen": -672.0103759765625, "logps/rejected": -736.9426879882812, "loss": 0.4731, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2389864921569824, "rewards/margins": 0.7914223670959473, "rewards/rejected": -3.0304088592529297, "step": 296 }, { "epoch": 0.19397501836884642, "grad_norm": 7.916708965496982, "learning_rate": 1.9472505011877235e-07, "logits/chosen": -0.839238703250885, "logits/rejected": -0.8083611130714417, "logps/chosen": -606.4061279296875, "logps/rejected": -644.0174560546875, "loss": 0.5493, "rewards/accuracies": 0.78125, "rewards/chosen": -1.8088489770889282, "rewards/margins": 0.4753211736679077, "rewards/rejected": -2.284170150756836, "step": 297 }, { "epoch": 0.19462813290880887, "grad_norm": 7.45863666364653, "learning_rate": 1.9465168368255945e-07, "logits/chosen": -0.9710556268692017, "logits/rejected": -0.9412792921066284, "logps/chosen": -696.9168090820312, "logps/rejected": -764.7929077148438, "loss": 0.5276, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5390238761901855, "rewards/margins": 0.6176880598068237, "rewards/rejected": -3.1567115783691406, "step": 298 }, { "epoch": 0.19528124744877132, "grad_norm": 8.63941851768535, "learning_rate": 1.9457782457191949e-07, "logits/chosen": -0.9380285739898682, "logits/rejected": -0.9279670119285583, "logps/chosen": -657.2073364257812, "logps/rejected": -776.177490234375, "loss": 0.5322, "rewards/accuracies": 0.8125, "rewards/chosen": -2.1041078567504883, "rewards/margins": 0.8005395531654358, "rewards/rejected": -2.9046473503112793, "step": 299 }, { "epoch": 0.19593436198873376, "grad_norm": 10.905449832753915, "learning_rate": 1.9450347317129891e-07, "logits/chosen": -0.8918706178665161, "logits/rejected": -0.8934108018875122, "logps/chosen": -648.9880981445312, "logps/rejected": -658.7398681640625, "loss": 0.5705, "rewards/accuracies": 0.59375, "rewards/chosen": -2.2292122840881348, "rewards/margins": 0.24789035320281982, "rewards/rejected": -2.477102518081665, "step": 300 }, { "epoch": 0.19593436198873376, "eval_logits/chosen": -0.8229660391807556, "eval_logits/rejected": -0.7893882989883423, "eval_logps/chosen": -721.2881469726562, "eval_logps/rejected": -765.6893920898438, "eval_loss": 0.5544829368591309, "eval_rewards/accuracies": 0.7269999980926514, "eval_rewards/chosen": -2.473808765411377, "eval_rewards/margins": 0.5313993096351624, "eval_rewards/rejected": -3.0052082538604736, "eval_runtime": 618.0515, "eval_samples_per_second": 6.472, "eval_steps_per_second": 0.404, "step": 300 }, { "epoch": 0.1965874765286962, "grad_norm": 7.659026133135656, "learning_rate": 1.9442862986770643e-07, "logits/chosen": -0.9449422359466553, "logits/rejected": -0.8925285935401917, "logps/chosen": -697.9733276367188, "logps/rejected": -778.6115112304688, "loss": 0.508, "rewards/accuracies": 0.8125, "rewards/chosen": -2.2272863388061523, "rewards/margins": 0.7337895631790161, "rewards/rejected": -2.961075782775879, "step": 301 }, { "epoch": 0.19724059106865865, "grad_norm": 10.725925196101898, "learning_rate": 1.943532950507113e-07, "logits/chosen": -0.9608031511306763, "logits/rejected": -0.9174195528030396, "logps/chosen": -821.3086547851562, "logps/rejected": -875.5884399414062, "loss": 0.5612, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8072926998138428, "rewards/margins": 0.6858442425727844, "rewards/rejected": -3.4931368827819824, "step": 302 }, { "epoch": 0.1978937056086211, "grad_norm": 8.209549535542726, "learning_rate": 1.9427746911244113e-07, "logits/chosen": -1.0311881303787231, "logits/rejected": -0.9527757167816162, "logps/chosen": -755.90478515625, "logps/rejected": -760.92529296875, "loss": 0.5497, "rewards/accuracies": 0.59375, "rewards/chosen": -2.6526520252227783, "rewards/margins": 0.47115930914878845, "rewards/rejected": -3.1238112449645996, "step": 303 }, { "epoch": 0.19854682014858355, "grad_norm": 8.394826636042648, "learning_rate": 1.942011524475798e-07, "logits/chosen": -0.8262045383453369, "logits/rejected": -0.894850492477417, "logps/chosen": -703.4110107421875, "logps/rejected": -817.5936279296875, "loss": 0.4896, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3225224018096924, "rewards/margins": 0.6646079421043396, "rewards/rejected": -2.9871304035186768, "step": 304 }, { "epoch": 0.199199934688546, "grad_norm": 8.107673849474413, "learning_rate": 1.9412434545336566e-07, "logits/chosen": -0.9343682527542114, "logits/rejected": -0.9112167954444885, "logps/chosen": -664.5975952148438, "logps/rejected": -693.5718383789062, "loss": 0.579, "rewards/accuracies": 0.625, "rewards/chosen": -2.497659683227539, "rewards/margins": 0.25632160902023315, "rewards/rejected": -2.753981828689575, "step": 305 }, { "epoch": 0.19985304922850844, "grad_norm": 9.022343657029202, "learning_rate": 1.9404704852958912e-07, "logits/chosen": -0.9269659519195557, "logits/rejected": -0.8933895826339722, "logps/chosen": -639.3394165039062, "logps/rejected": -709.7356567382812, "loss": 0.5264, "rewards/accuracies": 0.78125, "rewards/chosen": -2.0649757385253906, "rewards/margins": 0.6906264424324036, "rewards/rejected": -2.7556021213531494, "step": 306 }, { "epoch": 0.20050616376847089, "grad_norm": 7.583513266351151, "learning_rate": 1.9396926207859085e-07, "logits/chosen": -0.7031735181808472, "logits/rejected": -0.6803351640701294, "logps/chosen": -688.1170043945312, "logps/rejected": -824.4000854492188, "loss": 0.5313, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5198776721954346, "rewards/margins": 0.7780323624610901, "rewards/rejected": -3.29790997505188, "step": 307 }, { "epoch": 0.20115927830843333, "grad_norm": 8.379739148957032, "learning_rate": 1.9389098650525947e-07, "logits/chosen": -0.8177533149719238, "logits/rejected": -0.8568516969680786, "logps/chosen": -676.4760131835938, "logps/rejected": -744.55126953125, "loss": 0.509, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2387030124664307, "rewards/margins": 0.5821399092674255, "rewards/rejected": -2.820842742919922, "step": 308 }, { "epoch": 0.20181239284839578, "grad_norm": 8.173710195620405, "learning_rate": 1.9381222221702967e-07, "logits/chosen": -0.9144182205200195, "logits/rejected": -0.9687687158584595, "logps/chosen": -734.7119750976562, "logps/rejected": -1004.3690795898438, "loss": 0.5303, "rewards/accuracies": 0.71875, "rewards/chosen": -2.554551362991333, "rewards/margins": 1.2628813982009888, "rewards/rejected": -3.8174331188201904, "step": 309 }, { "epoch": 0.20246550738835822, "grad_norm": 7.9552125591432485, "learning_rate": 1.9373296962387984e-07, "logits/chosen": -0.9191279411315918, "logits/rejected": -0.8958581686019897, "logps/chosen": -601.743896484375, "logps/rejected": -661.82421875, "loss": 0.5474, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0206687450408936, "rewards/margins": 0.6330419182777405, "rewards/rejected": -2.65371036529541, "step": 310 }, { "epoch": 0.20311862192832067, "grad_norm": 7.980624051483625, "learning_rate": 1.9365322913833015e-07, "logits/chosen": -0.866454005241394, "logits/rejected": -0.7789896726608276, "logps/chosen": -754.0518188476562, "logps/rejected": -765.2423706054688, "loss": 0.5546, "rewards/accuracies": 0.65625, "rewards/chosen": -2.443909168243408, "rewards/margins": 0.3114027678966522, "rewards/rejected": -2.755311965942383, "step": 311 }, { "epoch": 0.20377173646828312, "grad_norm": 7.33860819487254, "learning_rate": 1.935730011754403e-07, "logits/chosen": -0.8991135358810425, "logits/rejected": -0.9318748712539673, "logps/chosen": -720.3719482421875, "logps/rejected": -895.6969604492188, "loss": 0.5255, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2463953495025635, "rewards/margins": 1.159752607345581, "rewards/rejected": -3.4061479568481445, "step": 312 }, { "epoch": 0.20442485100824556, "grad_norm": 9.418850525971324, "learning_rate": 1.9349228615280734e-07, "logits/chosen": -0.8775783777236938, "logits/rejected": -0.8939123153686523, "logps/chosen": -679.7825927734375, "logps/rejected": -848.8208618164062, "loss": 0.4997, "rewards/accuracies": 0.78125, "rewards/chosen": -2.3244364261627197, "rewards/margins": 1.083613634109497, "rewards/rejected": -3.4080498218536377, "step": 313 }, { "epoch": 0.205077965548208, "grad_norm": 8.778963808667314, "learning_rate": 1.9341108449056358e-07, "logits/chosen": -0.960693359375, "logits/rejected": -1.0465654134750366, "logps/chosen": -718.642333984375, "logps/rejected": -837.5458984375, "loss": 0.5544, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5699141025543213, "rewards/margins": 0.5482870936393738, "rewards/rejected": -3.118201494216919, "step": 314 }, { "epoch": 0.20573108008817045, "grad_norm": 8.973296115737988, "learning_rate": 1.9332939661137425e-07, "logits/chosen": -0.9772869348526001, "logits/rejected": -0.9910680055618286, "logps/chosen": -700.7576904296875, "logps/rejected": -708.429931640625, "loss": 0.5846, "rewards/accuracies": 0.65625, "rewards/chosen": -2.4407153129577637, "rewards/margins": 0.2659532427787781, "rewards/rejected": -2.7066686153411865, "step": 315 }, { "epoch": 0.2063841946281329, "grad_norm": 11.761985597145776, "learning_rate": 1.9324722294043556e-07, "logits/chosen": -0.9107195138931274, "logits/rejected": -0.8959240317344666, "logps/chosen": -743.5145263671875, "logps/rejected": -857.8228149414062, "loss": 0.5304, "rewards/accuracies": 0.78125, "rewards/chosen": -2.518425941467285, "rewards/margins": 0.9631866812705994, "rewards/rejected": -3.48161244392395, "step": 316 }, { "epoch": 0.20703730916809535, "grad_norm": 9.491575896290264, "learning_rate": 1.931645639054722e-07, "logits/chosen": -0.9179579019546509, "logits/rejected": -0.9592142105102539, "logps/chosen": -705.681396484375, "logps/rejected": -892.9383544921875, "loss": 0.5062, "rewards/accuracies": 0.75, "rewards/chosen": -2.5186941623687744, "rewards/margins": 1.0660593509674072, "rewards/rejected": -3.5847535133361816, "step": 317 }, { "epoch": 0.2076904237080578, "grad_norm": 8.630378132342063, "learning_rate": 1.930814199367353e-07, "logits/chosen": -0.8748406171798706, "logits/rejected": -0.9312347769737244, "logps/chosen": -766.6427001953125, "logps/rejected": -819.1407470703125, "loss": 0.5132, "rewards/accuracies": 0.875, "rewards/chosen": -2.654548406600952, "rewards/margins": 0.5859209299087524, "rewards/rejected": -3.240469455718994, "step": 318 }, { "epoch": 0.20834353824802024, "grad_norm": 9.313105997470586, "learning_rate": 1.9299779146700008e-07, "logits/chosen": -0.8464125990867615, "logits/rejected": -0.8674778938293457, "logps/chosen": -749.940185546875, "logps/rejected": -874.630859375, "loss": 0.5174, "rewards/accuracies": 0.84375, "rewards/chosen": -2.640587091445923, "rewards/margins": 0.7609758973121643, "rewards/rejected": -3.4015626907348633, "step": 319 }, { "epoch": 0.20899665278798268, "grad_norm": 8.761272107103547, "learning_rate": 1.9291367893156374e-07, "logits/chosen": -0.8507086634635925, "logits/rejected": -0.8460374474525452, "logps/chosen": -706.5238647460938, "logps/rejected": -763.0086669921875, "loss": 0.5502, "rewards/accuracies": 0.65625, "rewards/chosen": -2.7435646057128906, "rewards/margins": 0.4100922644138336, "rewards/rejected": -3.1536569595336914, "step": 320 }, { "epoch": 0.20964976732794513, "grad_norm": 8.645077439540554, "learning_rate": 1.9282908276824305e-07, "logits/chosen": -0.8159580230712891, "logits/rejected": -0.8971022367477417, "logps/chosen": -718.7769165039062, "logps/rejected": -896.5993041992188, "loss": 0.5023, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4595208168029785, "rewards/margins": 1.1103695631027222, "rewards/rejected": -3.5698904991149902, "step": 321 }, { "epoch": 0.21030288186790758, "grad_norm": 8.9608952505274, "learning_rate": 1.927440034173721e-07, "logits/chosen": -0.9348753690719604, "logits/rejected": -0.9571008682250977, "logps/chosen": -768.90673828125, "logps/rejected": -839.931640625, "loss": 0.5535, "rewards/accuracies": 0.65625, "rewards/chosen": -2.864488124847412, "rewards/margins": 0.6067023873329163, "rewards/rejected": -3.4711906909942627, "step": 322 }, { "epoch": 0.21095599640787002, "grad_norm": 8.764928992953658, "learning_rate": 1.9265844132180014e-07, "logits/chosen": -0.937222957611084, "logits/rejected": -0.9439467191696167, "logps/chosen": -740.7075805664062, "logps/rejected": -851.681396484375, "loss": 0.5409, "rewards/accuracies": 0.71875, "rewards/chosen": -2.908351182937622, "rewards/margins": 0.8476966023445129, "rewards/rejected": -3.7560479640960693, "step": 323 }, { "epoch": 0.21160911094783247, "grad_norm": 11.203248328192403, "learning_rate": 1.9257239692688904e-07, "logits/chosen": -0.9657084345817566, "logits/rejected": -0.8316485285758972, "logps/chosen": -710.1865234375, "logps/rejected": -722.1703491210938, "loss": 0.5703, "rewards/accuracies": 0.625, "rewards/chosen": -2.541426420211792, "rewards/margins": 0.4015898108482361, "rewards/rejected": -2.943016290664673, "step": 324 }, { "epoch": 0.21226222548779491, "grad_norm": 10.154285686767231, "learning_rate": 1.9248587068051117e-07, "logits/chosen": -0.9456658959388733, "logits/rejected": -0.9287790060043335, "logps/chosen": -759.39697265625, "logps/rejected": -901.678955078125, "loss": 0.5445, "rewards/accuracies": 0.71875, "rewards/chosen": -2.964029550552368, "rewards/margins": 0.8283199071884155, "rewards/rejected": -3.7923495769500732, "step": 325 }, { "epoch": 0.21291534002775736, "grad_norm": 10.085226190171486, "learning_rate": 1.92398863033047e-07, "logits/chosen": -0.9480457901954651, "logits/rejected": -0.8808225989341736, "logps/chosen": -737.1611938476562, "logps/rejected": -832.3869018554688, "loss": 0.5297, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4359898567199707, "rewards/margins": 1.118359088897705, "rewards/rejected": -3.5543487071990967, "step": 326 }, { "epoch": 0.2135684545677198, "grad_norm": 8.037243703725244, "learning_rate": 1.9231137443738273e-07, "logits/chosen": -0.8650938868522644, "logits/rejected": -0.7863737940788269, "logps/chosen": -667.995361328125, "logps/rejected": -802.8931274414062, "loss": 0.4848, "rewards/accuracies": 0.78125, "rewards/chosen": -2.464874029159546, "rewards/margins": 0.8920389413833618, "rewards/rejected": -3.3569130897521973, "step": 327 }, { "epoch": 0.21422156910768225, "grad_norm": 9.782455330939415, "learning_rate": 1.92223405348908e-07, "logits/chosen": -0.9213351011276245, "logits/rejected": -0.861817479133606, "logps/chosen": -762.0364990234375, "logps/rejected": -799.6807250976562, "loss": 0.5027, "rewards/accuracies": 0.84375, "rewards/chosen": -2.5765061378479004, "rewards/margins": 0.7302671074867249, "rewards/rejected": -3.3067734241485596, "step": 328 }, { "epoch": 0.2148746836476447, "grad_norm": 8.275520691185628, "learning_rate": 1.9213495622551346e-07, "logits/chosen": -0.8799265027046204, "logits/rejected": -0.9568032622337341, "logps/chosen": -696.4993286132812, "logps/rejected": -950.8281860351562, "loss": 0.526, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4880318641662598, "rewards/margins": 1.6394169330596924, "rewards/rejected": -4.127448558807373, "step": 329 }, { "epoch": 0.21552779818760714, "grad_norm": 9.733494867655624, "learning_rate": 1.9204602752758836e-07, "logits/chosen": -0.8873504996299744, "logits/rejected": -0.8919836282730103, "logps/chosen": -682.7972412109375, "logps/rejected": -753.762451171875, "loss": 0.5733, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7004520893096924, "rewards/margins": 0.7935549020767212, "rewards/rejected": -3.494007110595703, "step": 330 }, { "epoch": 0.2161809127275696, "grad_norm": 11.487422940603599, "learning_rate": 1.9195661971801823e-07, "logits/chosen": -0.8627737760543823, "logits/rejected": -0.8270461559295654, "logps/chosen": -752.065673828125, "logps/rejected": -787.5745239257812, "loss": 0.5432, "rewards/accuracies": 0.8125, "rewards/chosen": -2.530182361602783, "rewards/margins": 0.5825455188751221, "rewards/rejected": -3.1127278804779053, "step": 331 }, { "epoch": 0.21683402726753204, "grad_norm": 9.536090726405337, "learning_rate": 1.9186673326218252e-07, "logits/chosen": -0.7737371325492859, "logits/rejected": -0.8088192343711853, "logps/chosen": -740.6680297851562, "logps/rejected": -831.1947631835938, "loss": 0.5135, "rewards/accuracies": 0.84375, "rewards/chosen": -2.6271743774414062, "rewards/margins": 0.6504449844360352, "rewards/rejected": -3.2776193618774414, "step": 332 }, { "epoch": 0.21748714180749448, "grad_norm": 10.23713461294934, "learning_rate": 1.9177636862795192e-07, "logits/chosen": -0.8282246589660645, "logits/rejected": -0.8120365142822266, "logps/chosen": -678.1328735351562, "logps/rejected": -745.9762573242188, "loss": 0.5841, "rewards/accuracies": 0.71875, "rewards/chosen": -2.664412498474121, "rewards/margins": 0.6008880734443665, "rewards/rejected": -3.2653002738952637, "step": 333 }, { "epoch": 0.21814025634745693, "grad_norm": 7.80838685357101, "learning_rate": 1.9168552628568628e-07, "logits/chosen": -0.917161226272583, "logits/rejected": -1.018967628479004, "logps/chosen": -695.7123413085938, "logps/rejected": -829.2813720703125, "loss": 0.5086, "rewards/accuracies": 0.875, "rewards/chosen": -2.2889459133148193, "rewards/margins": 0.8900274634361267, "rewards/rejected": -3.17897367477417, "step": 334 }, { "epoch": 0.21879337088741938, "grad_norm": 8.958596685509493, "learning_rate": 1.9159420670823185e-07, "logits/chosen": -0.9185332655906677, "logits/rejected": -0.870037317276001, "logps/chosen": -707.0963745117188, "logps/rejected": -741.2311401367188, "loss": 0.5556, "rewards/accuracies": 0.625, "rewards/chosen": -2.3948493003845215, "rewards/margins": 0.5061476230621338, "rewards/rejected": -2.9009971618652344, "step": 335 }, { "epoch": 0.21944648542738182, "grad_norm": 11.406917533980884, "learning_rate": 1.9150241037091908e-07, "logits/chosen": -0.9214222431182861, "logits/rejected": -0.9454975724220276, "logps/chosen": -763.0989379882812, "logps/rejected": -757.4050903320312, "loss": 0.5357, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7994232177734375, "rewards/margins": 0.5790168642997742, "rewards/rejected": -3.3784401416778564, "step": 336 }, { "epoch": 0.22009959996734427, "grad_norm": 8.348133500220372, "learning_rate": 1.9141013775155985e-07, "logits/chosen": -0.8210875391960144, "logits/rejected": -0.79030442237854, "logps/chosen": -674.8720092773438, "logps/rejected": -720.6124877929688, "loss": 0.4822, "rewards/accuracies": 0.75, "rewards/chosen": -2.2248129844665527, "rewards/margins": 0.6432361006736755, "rewards/rejected": -2.868049144744873, "step": 337 }, { "epoch": 0.2207527145073067, "grad_norm": 10.69630326702095, "learning_rate": 1.913173893304453e-07, "logits/chosen": -0.9009968638420105, "logits/rejected": -0.9283880591392517, "logps/chosen": -710.2175903320312, "logps/rejected": -817.3650512695312, "loss": 0.5991, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6748363971710205, "rewards/margins": 0.6548750400543213, "rewards/rejected": -3.329711675643921, "step": 338 }, { "epoch": 0.22140582904726916, "grad_norm": 8.33216110402213, "learning_rate": 1.9122416559034314e-07, "logits/chosen": -0.8454635143280029, "logits/rejected": -0.7918909788131714, "logps/chosen": -822.1680297851562, "logps/rejected": -859.948486328125, "loss": 0.5271, "rewards/accuracies": 0.75, "rewards/chosen": -2.614701271057129, "rewards/margins": 0.7649893164634705, "rewards/rejected": -3.379690647125244, "step": 339 }, { "epoch": 0.2220589435872316, "grad_norm": 9.685522044762283, "learning_rate": 1.9113046701649514e-07, "logits/chosen": -0.9304038882255554, "logits/rejected": -0.8563180565834045, "logps/chosen": -647.0829467773438, "logps/rejected": -720.6229248046875, "loss": 0.5638, "rewards/accuracies": 0.625, "rewards/chosen": -2.3711228370666504, "rewards/margins": 0.4756318926811218, "rewards/rejected": -2.846754789352417, "step": 340 }, { "epoch": 0.22271205812719405, "grad_norm": 12.691659802253687, "learning_rate": 1.9103629409661467e-07, "logits/chosen": -0.8918319940567017, "logits/rejected": -0.8558528423309326, "logps/chosen": -666.69140625, "logps/rejected": -701.9205932617188, "loss": 0.586, "rewards/accuracies": 0.59375, "rewards/chosen": -2.631117343902588, "rewards/margins": 0.31648769974708557, "rewards/rejected": -2.9476051330566406, "step": 341 }, { "epoch": 0.2233651726671565, "grad_norm": 9.139752244778244, "learning_rate": 1.9094164732088412e-07, "logits/chosen": -0.9762035608291626, "logits/rejected": -0.9089761972427368, "logps/chosen": -749.697998046875, "logps/rejected": -739.471435546875, "loss": 0.5427, "rewards/accuracies": 0.6875, "rewards/chosen": -2.6595542430877686, "rewards/margins": 0.30329763889312744, "rewards/rejected": -2.9628520011901855, "step": 342 }, { "epoch": 0.22401828720711894, "grad_norm": 8.791854812952089, "learning_rate": 1.9084652718195236e-07, "logits/chosen": -0.8667416572570801, "logits/rejected": -0.7667337656021118, "logps/chosen": -681.466796875, "logps/rejected": -695.0787353515625, "loss": 0.4833, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6133289337158203, "rewards/margins": 0.5205738544464111, "rewards/rejected": -3.1339030265808105, "step": 343 }, { "epoch": 0.2246714017470814, "grad_norm": 8.391392138914927, "learning_rate": 1.9075093417493222e-07, "logits/chosen": -1.021234154701233, "logits/rejected": -0.9969886541366577, "logps/chosen": -772.0455932617188, "logps/rejected": -815.7417602539062, "loss": 0.5024, "rewards/accuracies": 0.6875, "rewards/chosen": -2.6911168098449707, "rewards/margins": 0.5029557943344116, "rewards/rejected": -3.1940724849700928, "step": 344 }, { "epoch": 0.22532451628704384, "grad_norm": 11.090830095089824, "learning_rate": 1.9065486879739783e-07, "logits/chosen": -0.868331789970398, "logits/rejected": -0.8692890405654907, "logps/chosen": -716.2054443359375, "logps/rejected": -811.1107788085938, "loss": 0.4973, "rewards/accuracies": 0.8125, "rewards/chosen": -2.5343575477600098, "rewards/margins": 0.8151875138282776, "rewards/rejected": -3.3495450019836426, "step": 345 }, { "epoch": 0.22597763082700628, "grad_norm": 7.7925319190317675, "learning_rate": 1.9055833154938206e-07, "logits/chosen": -0.8892742991447449, "logits/rejected": -0.8983861207962036, "logps/chosen": -748.330078125, "logps/rejected": -880.4389038085938, "loss": 0.4856, "rewards/accuracies": 0.8125, "rewards/chosen": -2.730029344558716, "rewards/margins": 0.9022680521011353, "rewards/rejected": -3.6322972774505615, "step": 346 }, { "epoch": 0.22663074536696873, "grad_norm": 8.532957227412508, "learning_rate": 1.9046132293337398e-07, "logits/chosen": -0.8384896516799927, "logits/rejected": -0.8548451066017151, "logps/chosen": -680.845458984375, "logps/rejected": -710.210693359375, "loss": 0.5661, "rewards/accuracies": 0.6875, "rewards/chosen": -2.533750295639038, "rewards/margins": 0.35651496052742004, "rewards/rejected": -2.890265464782715, "step": 347 }, { "epoch": 0.22728385990693117, "grad_norm": 8.837287762308879, "learning_rate": 1.903638434543161e-07, "logits/chosen": -0.9365876317024231, "logits/rejected": -0.9628958702087402, "logps/chosen": -703.1912841796875, "logps/rejected": -868.1168823242188, "loss": 0.5385, "rewards/accuracies": 0.78125, "rewards/chosen": -2.4941153526306152, "rewards/margins": 0.9199743866920471, "rewards/rejected": -3.4140899181365967, "step": 348 }, { "epoch": 0.22793697444689362, "grad_norm": 8.36148014201041, "learning_rate": 1.9026589361960198e-07, "logits/chosen": -0.8639967441558838, "logits/rejected": -0.8728487491607666, "logps/chosen": -726.3617553710938, "logps/rejected": -799.6815185546875, "loss": 0.535, "rewards/accuracies": 0.6875, "rewards/chosen": -2.505831241607666, "rewards/margins": 0.5206820964813232, "rewards/rejected": -3.0265133380889893, "step": 349 }, { "epoch": 0.22859008898685607, "grad_norm": 9.530271630865817, "learning_rate": 1.9016747393907327e-07, "logits/chosen": -0.9139630794525146, "logits/rejected": -0.8439926505088806, "logps/chosen": -815.9520263671875, "logps/rejected": -833.1660766601562, "loss": 0.5739, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1119563579559326, "rewards/margins": 0.37380701303482056, "rewards/rejected": -3.4857633113861084, "step": 350 }, { "epoch": 0.2292432035268185, "grad_norm": 8.867463894546207, "learning_rate": 1.9006858492501734e-07, "logits/chosen": -0.8531547784805298, "logits/rejected": -0.8365933895111084, "logps/chosen": -704.0611572265625, "logps/rejected": -903.056640625, "loss": 0.5359, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5138795375823975, "rewards/margins": 1.4063800573349, "rewards/rejected": -3.920259475708008, "step": 351 }, { "epoch": 0.22989631806678096, "grad_norm": 11.470432910725284, "learning_rate": 1.8996922709216454e-07, "logits/chosen": -0.894100546836853, "logits/rejected": -0.8663679957389832, "logps/chosen": -734.0579833984375, "logps/rejected": -780.0435791015625, "loss": 0.5576, "rewards/accuracies": 0.625, "rewards/chosen": -2.629918336868286, "rewards/margins": 0.5574440956115723, "rewards/rejected": -3.1873621940612793, "step": 352 }, { "epoch": 0.2305494326067434, "grad_norm": 8.028334619645092, "learning_rate": 1.8986940095768532e-07, "logits/chosen": -0.898781955242157, "logits/rejected": -0.8125208616256714, "logps/chosen": -815.3742065429688, "logps/rejected": -846.159912109375, "loss": 0.5197, "rewards/accuracies": 0.78125, "rewards/chosen": -2.95741868019104, "rewards/margins": 0.7997546792030334, "rewards/rejected": -3.7571730613708496, "step": 353 }, { "epoch": 0.23120254714670585, "grad_norm": 9.751558463586461, "learning_rate": 1.8976910704118788e-07, "logits/chosen": -0.9632163047790527, "logits/rejected": -0.9397919178009033, "logps/chosen": -815.559326171875, "logps/rejected": -870.219482421875, "loss": 0.5069, "rewards/accuracies": 0.8125, "rewards/chosen": -2.571608066558838, "rewards/margins": 0.7947617173194885, "rewards/rejected": -3.3663697242736816, "step": 354 }, { "epoch": 0.2318556616866683, "grad_norm": 8.9770404991692, "learning_rate": 1.8966834586471517e-07, "logits/chosen": -0.7659550309181213, "logits/rejected": -0.8094583749771118, "logps/chosen": -731.0473022460938, "logps/rejected": -909.0474853515625, "loss": 0.4928, "rewards/accuracies": 0.84375, "rewards/chosen": -2.2166824340820312, "rewards/margins": 1.0738444328308105, "rewards/rejected": -3.290526866912842, "step": 355 }, { "epoch": 0.23250877622663074, "grad_norm": 13.094673327461953, "learning_rate": 1.8956711795274234e-07, "logits/chosen": -0.915695071220398, "logits/rejected": -0.9428746700286865, "logps/chosen": -727.38671875, "logps/rejected": -821.8265991210938, "loss": 0.5928, "rewards/accuracies": 0.625, "rewards/chosen": -2.389467239379883, "rewards/margins": 0.8192275762557983, "rewards/rejected": -3.2086949348449707, "step": 356 }, { "epoch": 0.2331618907665932, "grad_norm": 9.652480627651235, "learning_rate": 1.8946542383217393e-07, "logits/chosen": -1.0150421857833862, "logits/rejected": -0.9018900990486145, "logps/chosen": -819.73486328125, "logps/rejected": -817.6781616210938, "loss": 0.5024, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8414320945739746, "rewards/margins": 0.6176068782806396, "rewards/rejected": -3.459038734436035, "step": 357 }, { "epoch": 0.23381500530655563, "grad_norm": 9.434572929754303, "learning_rate": 1.8936326403234123e-07, "logits/chosen": -0.8316928148269653, "logits/rejected": -0.8276723623275757, "logps/chosen": -629.0164794921875, "logps/rejected": -735.0913696289062, "loss": 0.549, "rewards/accuracies": 0.625, "rewards/chosen": -2.401994228363037, "rewards/margins": 0.6702228784561157, "rewards/rejected": -3.072216749191284, "step": 358 }, { "epoch": 0.23446811984651808, "grad_norm": 8.192667820567904, "learning_rate": 1.892606390849993e-07, "logits/chosen": -0.9244518280029297, "logits/rejected": -0.8568234443664551, "logps/chosen": -805.7052612304688, "logps/rejected": -841.6261596679688, "loss": 0.496, "rewards/accuracies": 0.90625, "rewards/chosen": -2.8729684352874756, "rewards/margins": 1.1182036399841309, "rewards/rejected": -3.9911718368530273, "step": 359 }, { "epoch": 0.23512123438648053, "grad_norm": 7.785077716699336, "learning_rate": 1.8915754952432455e-07, "logits/chosen": -0.9250266551971436, "logits/rejected": -0.8511897325515747, "logps/chosen": -683.8753662109375, "logps/rejected": -751.814453125, "loss": 0.5046, "rewards/accuracies": 0.875, "rewards/chosen": -2.457275390625, "rewards/margins": 0.8861963748931885, "rewards/rejected": -3.3434715270996094, "step": 360 }, { "epoch": 0.23577434892644297, "grad_norm": 7.374397652768221, "learning_rate": 1.8905399588691163e-07, "logits/chosen": -0.8517379760742188, "logits/rejected": -0.7823293209075928, "logps/chosen": -737.6988525390625, "logps/rejected": -842.8553466796875, "loss": 0.4827, "rewards/accuracies": 0.8125, "rewards/chosen": -2.6914682388305664, "rewards/margins": 1.1641268730163574, "rewards/rejected": -3.855595111846924, "step": 361 }, { "epoch": 0.23642746346640542, "grad_norm": 10.541536769336025, "learning_rate": 1.8894997871177077e-07, "logits/chosen": -1.0059581995010376, "logits/rejected": -0.885239839553833, "logps/chosen": -715.8147583007812, "logps/rejected": -688.3126220703125, "loss": 0.5834, "rewards/accuracies": 0.71875, "rewards/chosen": -2.433546781539917, "rewards/margins": 0.3138864040374756, "rewards/rejected": -2.7474331855773926, "step": 362 }, { "epoch": 0.23708057800636786, "grad_norm": 10.21089003459886, "learning_rate": 1.8884549854032504e-07, "logits/chosen": -0.8944604396820068, "logits/rejected": -0.7838302850723267, "logps/chosen": -701.7962646484375, "logps/rejected": -715.971923828125, "loss": 0.5636, "rewards/accuracies": 0.75, "rewards/chosen": -2.525576591491699, "rewards/margins": 0.3668100833892822, "rewards/rejected": -2.8923864364624023, "step": 363 }, { "epoch": 0.2377336925463303, "grad_norm": 11.101824662582676, "learning_rate": 1.8874055591640742e-07, "logits/chosen": -0.8422518372535706, "logits/rejected": -0.8626506328582764, "logps/chosen": -752.20556640625, "logps/rejected": -995.0734252929688, "loss": 0.5241, "rewards/accuracies": 0.75, "rewards/chosen": -2.914336919784546, "rewards/margins": 1.4878953695297241, "rewards/rejected": -4.4022321701049805, "step": 364 }, { "epoch": 0.23838680708629276, "grad_norm": 8.509188829026256, "learning_rate": 1.8863515138625802e-07, "logits/chosen": -0.9586243629455566, "logits/rejected": -0.937591552734375, "logps/chosen": -756.9347534179688, "logps/rejected": -820.6338500976562, "loss": 0.5407, "rewards/accuracies": 0.75, "rewards/chosen": -2.6895179748535156, "rewards/margins": 0.6859859228134155, "rewards/rejected": -3.3755040168762207, "step": 365 }, { "epoch": 0.2390399216262552, "grad_norm": 16.91175185837713, "learning_rate": 1.885292854985213e-07, "logits/chosen": -0.875924825668335, "logits/rejected": -0.881722629070282, "logps/chosen": -728.5872802734375, "logps/rejected": -868.9918823242188, "loss": 0.5496, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5441956520080566, "rewards/margins": 0.7560574412345886, "rewards/rejected": -3.300252914428711, "step": 366 }, { "epoch": 0.23969303616621765, "grad_norm": 7.315683900969026, "learning_rate": 1.8842295880424304e-07, "logits/chosen": -0.8934835195541382, "logits/rejected": -0.8142973184585571, "logps/chosen": -755.492919921875, "logps/rejected": -836.1302490234375, "loss": 0.4819, "rewards/accuracies": 0.8125, "rewards/chosen": -2.8435540199279785, "rewards/margins": 0.8605523705482483, "rewards/rejected": -3.704106330871582, "step": 367 }, { "epoch": 0.2403461507061801, "grad_norm": 11.255355527223116, "learning_rate": 1.8831617185686762e-07, "logits/chosen": -0.8581317663192749, "logits/rejected": -0.8509604930877686, "logps/chosen": -827.1642456054688, "logps/rejected": -935.1969604492188, "loss": 0.4666, "rewards/accuracies": 0.84375, "rewards/chosen": -2.654917001724243, "rewards/margins": 1.1863821744918823, "rewards/rejected": -3.841298818588257, "step": 368 }, { "epoch": 0.24099926524614254, "grad_norm": 8.041901256168211, "learning_rate": 1.8820892521223515e-07, "logits/chosen": -0.8912599086761475, "logits/rejected": -0.8826814889907837, "logps/chosen": -760.4750366210938, "logps/rejected": -869.6399536132812, "loss": 0.459, "rewards/accuracies": 0.875, "rewards/chosen": -2.5947632789611816, "rewards/margins": 1.0988965034484863, "rewards/rejected": -3.6936593055725098, "step": 369 }, { "epoch": 0.241652379786105, "grad_norm": 9.210568805727371, "learning_rate": 1.8810121942857845e-07, "logits/chosen": -0.8365862965583801, "logits/rejected": -0.8083174824714661, "logps/chosen": -714.6243896484375, "logps/rejected": -734.9310302734375, "loss": 0.5766, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7714502811431885, "rewards/margins": 0.40789082646369934, "rewards/rejected": -3.1793413162231445, "step": 370 }, { "epoch": 0.24230549432606743, "grad_norm": 8.423599734298417, "learning_rate": 1.8799305506652025e-07, "logits/chosen": -0.994741678237915, "logits/rejected": -0.9545141458511353, "logps/chosen": -809.184326171875, "logps/rejected": -890.893310546875, "loss": 0.5425, "rewards/accuracies": 0.75, "rewards/chosen": -3.0367493629455566, "rewards/margins": 1.0203640460968018, "rewards/rejected": -4.057113170623779, "step": 371 }, { "epoch": 0.24295860886602988, "grad_norm": 9.652327213203613, "learning_rate": 1.8788443268907024e-07, "logits/chosen": -0.8552968502044678, "logits/rejected": -0.8310869932174683, "logps/chosen": -720.93017578125, "logps/rejected": -783.8108520507812, "loss": 0.5013, "rewards/accuracies": 0.875, "rewards/chosen": -2.675302505493164, "rewards/margins": 0.6832669973373413, "rewards/rejected": -3.358569860458374, "step": 372 }, { "epoch": 0.24361172340599233, "grad_norm": 11.703544457496719, "learning_rate": 1.8777535286162217e-07, "logits/chosen": -0.881072461605072, "logits/rejected": -0.874359130859375, "logps/chosen": -771.0457763671875, "logps/rejected": -828.0839233398438, "loss": 0.5119, "rewards/accuracies": 0.75, "rewards/chosen": -2.8744983673095703, "rewards/margins": 0.5727369785308838, "rewards/rejected": -3.447235345840454, "step": 373 }, { "epoch": 0.24426483794595477, "grad_norm": 11.546985055661386, "learning_rate": 1.8766581615195078e-07, "logits/chosen": -0.9114188551902771, "logits/rejected": -0.8961868286132812, "logps/chosen": -736.6749267578125, "logps/rejected": -839.9960327148438, "loss": 0.5171, "rewards/accuracies": 0.8125, "rewards/chosen": -2.694876194000244, "rewards/margins": 0.8417505025863647, "rewards/rejected": -3.5366265773773193, "step": 374 }, { "epoch": 0.24491795248591722, "grad_norm": 8.61604895408707, "learning_rate": 1.875558231302091e-07, "logits/chosen": -0.9092810153961182, "logits/rejected": -0.781123697757721, "logps/chosen": -660.8868408203125, "logps/rejected": -716.446044921875, "loss": 0.581, "rewards/accuracies": 0.78125, "rewards/chosen": -2.2286787033081055, "rewards/margins": 0.6053875684738159, "rewards/rejected": -2.834066390991211, "step": 375 }, { "epoch": 0.24557106702587966, "grad_norm": 13.796271931510978, "learning_rate": 1.8744537436892512e-07, "logits/chosen": -0.8940467834472656, "logits/rejected": -0.8263184428215027, "logps/chosen": -826.9315185546875, "logps/rejected": -844.8912963867188, "loss": 0.4862, "rewards/accuracies": 0.75, "rewards/chosen": -3.1379830837249756, "rewards/margins": 0.6378382444381714, "rewards/rejected": -3.7758212089538574, "step": 376 }, { "epoch": 0.2462241815658421, "grad_norm": 11.831975946336552, "learning_rate": 1.8733447044299925e-07, "logits/chosen": -0.7302409410476685, "logits/rejected": -0.814877986907959, "logps/chosen": -765.5311889648438, "logps/rejected": -1002.50244140625, "loss": 0.4899, "rewards/accuracies": 0.875, "rewards/chosen": -2.6444497108459473, "rewards/margins": 1.4703834056854248, "rewards/rejected": -4.114832878112793, "step": 377 }, { "epoch": 0.24687729610580456, "grad_norm": 12.2428231194753, "learning_rate": 1.8722311192970092e-07, "logits/chosen": -0.8322169780731201, "logits/rejected": -0.7421005368232727, "logps/chosen": -788.8143310546875, "logps/rejected": -853.354248046875, "loss": 0.5892, "rewards/accuracies": 0.625, "rewards/chosen": -2.7768325805664062, "rewards/margins": 0.5746185183525085, "rewards/rejected": -3.3514509201049805, "step": 378 }, { "epoch": 0.247530410645767, "grad_norm": 9.846423022437284, "learning_rate": 1.8711129940866575e-07, "logits/chosen": -0.91043621301651, "logits/rejected": -0.9167795777320862, "logps/chosen": -828.8065185546875, "logps/rejected": -872.3666381835938, "loss": 0.4836, "rewards/accuracies": 0.75, "rewards/chosen": -2.97462797164917, "rewards/margins": 0.8676510453224182, "rewards/rejected": -3.8422789573669434, "step": 379 }, { "epoch": 0.24818352518572945, "grad_norm": 11.536895925758275, "learning_rate": 1.8699903346189263e-07, "logits/chosen": -0.8867179751396179, "logits/rejected": -0.8912706971168518, "logps/chosen": -804.9017944335938, "logps/rejected": -852.7586059570312, "loss": 0.5841, "rewards/accuracies": 0.75, "rewards/chosen": -2.9465479850769043, "rewards/margins": 0.7642606496810913, "rewards/rejected": -3.710808753967285, "step": 380 }, { "epoch": 0.2488366397256919, "grad_norm": 9.03975607711823, "learning_rate": 1.8688631467374054e-07, "logits/chosen": -0.946536123752594, "logits/rejected": -0.9552872180938721, "logps/chosen": -782.4651489257812, "logps/rejected": -899.2623291015625, "loss": 0.5573, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9219205379486084, "rewards/margins": 0.8479382991790771, "rewards/rejected": -3.7698588371276855, "step": 381 }, { "epoch": 0.24948975426565434, "grad_norm": 10.588989944794507, "learning_rate": 1.8677314363092553e-07, "logits/chosen": -0.9365772604942322, "logits/rejected": -0.8860799670219421, "logps/chosen": -748.2943725585938, "logps/rejected": -792.3837890625, "loss": 0.5409, "rewards/accuracies": 0.65625, "rewards/chosen": -2.813943862915039, "rewards/margins": 0.5208563208580017, "rewards/rejected": -3.3347997665405273, "step": 382 }, { "epoch": 0.25014286880561676, "grad_norm": 9.063214190734078, "learning_rate": 1.866595209225177e-07, "logits/chosen": -0.7872967720031738, "logits/rejected": -0.8394625186920166, "logps/chosen": -805.676513671875, "logps/rejected": -910.6593627929688, "loss": 0.5242, "rewards/accuracies": 0.71875, "rewards/chosen": -2.991018533706665, "rewards/margins": 0.9019701480865479, "rewards/rejected": -3.892988920211792, "step": 383 }, { "epoch": 0.25079598334557923, "grad_norm": 12.184205086121409, "learning_rate": 1.8654544713993822e-07, "logits/chosen": -0.8133022785186768, "logits/rejected": -0.811676025390625, "logps/chosen": -755.30859375, "logps/rejected": -825.8359985351562, "loss": 0.5428, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7623047828674316, "rewards/margins": 0.6998733282089233, "rewards/rejected": -3.4621777534484863, "step": 384 }, { "epoch": 0.25144909788554165, "grad_norm": 8.545125637268963, "learning_rate": 1.8643092287695602e-07, "logits/chosen": -0.8532843589782715, "logits/rejected": -0.7933013439178467, "logps/chosen": -736.1025390625, "logps/rejected": -808.1593017578125, "loss": 0.4891, "rewards/accuracies": 0.84375, "rewards/chosen": -2.8755104541778564, "rewards/margins": 0.8513441681861877, "rewards/rejected": -3.7268548011779785, "step": 385 }, { "epoch": 0.2521022124255041, "grad_norm": 12.071178659572865, "learning_rate": 1.8631594872968496e-07, "logits/chosen": -0.770072340965271, "logits/rejected": -0.7640020847320557, "logps/chosen": -683.5043334960938, "logps/rejected": -767.7935180664062, "loss": 0.4933, "rewards/accuracies": 0.6875, "rewards/chosen": -2.2001888751983643, "rewards/margins": 0.619857668876648, "rewards/rejected": -2.8200464248657227, "step": 386 }, { "epoch": 0.25275532696546654, "grad_norm": 10.703726113073369, "learning_rate": 1.862005252965805e-07, "logits/chosen": -0.7848135828971863, "logits/rejected": -0.8621163964271545, "logps/chosen": -687.1354370117188, "logps/rejected": -819.2810668945312, "loss": 0.5356, "rewards/accuracies": 0.8125, "rewards/chosen": -2.449026584625244, "rewards/margins": 0.7430239915847778, "rewards/rejected": -3.1920504570007324, "step": 387 }, { "epoch": 0.253408441505429, "grad_norm": 11.674209847253039, "learning_rate": 1.8608465317843676e-07, "logits/chosen": -0.8625525832176208, "logits/rejected": -0.8220775127410889, "logps/chosen": -649.7574462890625, "logps/rejected": -685.041259765625, "loss": 0.5312, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3073811531066895, "rewards/margins": 0.5945886373519897, "rewards/rejected": -2.9019696712493896, "step": 388 }, { "epoch": 0.25406155604539143, "grad_norm": 8.583769038852582, "learning_rate": 1.8596833297838335e-07, "logits/chosen": -0.8622403740882874, "logits/rejected": -0.8727516531944275, "logps/chosen": -745.3436889648438, "logps/rejected": -847.031982421875, "loss": 0.5052, "rewards/accuracies": 0.875, "rewards/chosen": -2.6236987113952637, "rewards/margins": 0.9856184720993042, "rewards/rejected": -3.6093173027038574, "step": 389 }, { "epoch": 0.2547146705853539, "grad_norm": 12.037107211910246, "learning_rate": 1.8585156530188214e-07, "logits/chosen": -0.8463901281356812, "logits/rejected": -0.8325417041778564, "logps/chosen": -732.111083984375, "logps/rejected": -828.5350952148438, "loss": 0.6109, "rewards/accuracies": 0.75, "rewards/chosen": -2.5609564781188965, "rewards/margins": 0.821265459060669, "rewards/rejected": -3.3822219371795654, "step": 390 }, { "epoch": 0.2553677851253163, "grad_norm": 8.857824861638335, "learning_rate": 1.8573435075672421e-07, "logits/chosen": -0.8568727374076843, "logits/rejected": -0.8396463394165039, "logps/chosen": -761.6552734375, "logps/rejected": -789.5634155273438, "loss": 0.4699, "rewards/accuracies": 0.78125, "rewards/chosen": -2.5762624740600586, "rewards/margins": 0.6971707344055176, "rewards/rejected": -3.273433208465576, "step": 391 }, { "epoch": 0.2560208996652788, "grad_norm": 10.947327596621403, "learning_rate": 1.8561668995302665e-07, "logits/chosen": -0.9098179936408997, "logits/rejected": -0.9328880906105042, "logps/chosen": -706.7294311523438, "logps/rejected": -816.31787109375, "loss": 0.4787, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7702081203460693, "rewards/margins": 0.8480209112167358, "rewards/rejected": -3.6182289123535156, "step": 392 }, { "epoch": 0.2566740142052412, "grad_norm": 9.965810910376886, "learning_rate": 1.8549858350322932e-07, "logits/chosen": -0.9181755781173706, "logits/rejected": -0.8338078856468201, "logps/chosen": -825.3251342773438, "logps/rejected": -857.3953857421875, "loss": 0.5338, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0407447814941406, "rewards/margins": 0.5176315307617188, "rewards/rejected": -3.5583760738372803, "step": 393 }, { "epoch": 0.2573271287452037, "grad_norm": 14.437300919389857, "learning_rate": 1.8538003202209186e-07, "logits/chosen": -0.7896380424499512, "logits/rejected": -0.6721420884132385, "logps/chosen": -718.7591552734375, "logps/rejected": -794.776123046875, "loss": 0.5163, "rewards/accuracies": 0.6875, "rewards/chosen": -2.381721258163452, "rewards/margins": 1.0894372463226318, "rewards/rejected": -3.471158266067505, "step": 394 }, { "epoch": 0.2579802432851661, "grad_norm": 11.73360711692685, "learning_rate": 1.852610361266902e-07, "logits/chosen": -1.0355563163757324, "logits/rejected": -0.9479397535324097, "logps/chosen": -734.6851806640625, "logps/rejected": -792.4885864257812, "loss": 0.5116, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9483344554901123, "rewards/margins": 0.9447464942932129, "rewards/rejected": -3.893080711364746, "step": 395 }, { "epoch": 0.2586333578251286, "grad_norm": 8.69098030663428, "learning_rate": 1.8514159643641366e-07, "logits/chosen": -0.7818084359169006, "logits/rejected": -0.7556504011154175, "logps/chosen": -737.1483764648438, "logps/rejected": -807.0700073242188, "loss": 0.5289, "rewards/accuracies": 0.84375, "rewards/chosen": -2.768253803253174, "rewards/margins": 0.621762752532959, "rewards/rejected": -3.3900163173675537, "step": 396 }, { "epoch": 0.259286472365091, "grad_norm": 8.173290849151792, "learning_rate": 1.850217135729614e-07, "logits/chosen": -0.8113851547241211, "logits/rejected": -0.7199157476425171, "logps/chosen": -760.3164672851562, "logps/rejected": -797.7694091796875, "loss": 0.4421, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6380648612976074, "rewards/margins": 0.7701491117477417, "rewards/rejected": -3.4082140922546387, "step": 397 }, { "epoch": 0.2599395869050535, "grad_norm": 9.015185202130558, "learning_rate": 1.8490138816033953e-07, "logits/chosen": -0.9135926961898804, "logits/rejected": -0.9324383735656738, "logps/chosen": -755.5042724609375, "logps/rejected": -805.4848022460938, "loss": 0.5094, "rewards/accuracies": 0.8125, "rewards/chosen": -2.365133047103882, "rewards/margins": 0.7335975766181946, "rewards/rejected": -3.0987303256988525, "step": 398 }, { "epoch": 0.2605927014450159, "grad_norm": 10.620101711003125, "learning_rate": 1.8478062082485754e-07, "logits/chosen": -0.8709986805915833, "logits/rejected": -0.8939104080200195, "logps/chosen": -766.480712890625, "logps/rejected": -870.7758178710938, "loss": 0.518, "rewards/accuracies": 0.875, "rewards/chosen": -2.472724676132202, "rewards/margins": 1.0182551145553589, "rewards/rejected": -3.4909799098968506, "step": 399 }, { "epoch": 0.26124581598497837, "grad_norm": 8.904432929316213, "learning_rate": 1.8465941219512531e-07, "logits/chosen": -0.8301805853843689, "logits/rejected": -0.809867262840271, "logps/chosen": -749.3978881835938, "logps/rejected": -825.1639404296875, "loss": 0.4606, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6916451454162598, "rewards/margins": 0.8717960119247437, "rewards/rejected": -3.563441038131714, "step": 400 }, { "epoch": 0.26124581598497837, "eval_logits/chosen": -0.724721372127533, "eval_logits/rejected": -0.6812699437141418, "eval_logps/chosen": -741.7116088867188, "eval_logps/rejected": -802.9892578125, "eval_loss": 0.5081213712692261, "eval_rewards/accuracies": 0.7559999823570251, "eval_rewards/chosen": -2.6780450344085693, "eval_rewards/margins": 0.7001611590385437, "eval_rewards/rejected": -3.378206253051758, "eval_runtime": 616.1892, "eval_samples_per_second": 6.492, "eval_steps_per_second": 0.406, "step": 400 }, { "epoch": 0.2618989305249408, "grad_norm": 9.783139118919728, "learning_rate": 1.8453776290204963e-07, "logits/chosen": -0.9069001078605652, "logits/rejected": -0.810458779335022, "logps/chosen": -743.5487060546875, "logps/rejected": -779.9725952148438, "loss": 0.4883, "rewards/accuracies": 0.75, "rewards/chosen": -2.6744909286499023, "rewards/margins": 0.6230872273445129, "rewards/rejected": -3.2975778579711914, "step": 401 }, { "epoch": 0.26255204506490326, "grad_norm": 12.767255063577581, "learning_rate": 1.84415673578831e-07, "logits/chosen": -0.851786732673645, "logits/rejected": -0.8314003348350525, "logps/chosen": -705.4389038085938, "logps/rejected": -778.4620361328125, "loss": 0.5321, "rewards/accuracies": 0.8125, "rewards/chosen": -2.4630203247070312, "rewards/margins": 0.6230435967445374, "rewards/rejected": -3.086063861846924, "step": 402 }, { "epoch": 0.2632051596048657, "grad_norm": 10.075368467499747, "learning_rate": 1.8429314486096038e-07, "logits/chosen": -0.8277249336242676, "logits/rejected": -0.788428008556366, "logps/chosen": -705.606689453125, "logps/rejected": -752.8212280273438, "loss": 0.5266, "rewards/accuracies": 0.625, "rewards/chosen": -2.5161733627319336, "rewards/margins": 0.48541557788848877, "rewards/rejected": -3.001589298248291, "step": 403 }, { "epoch": 0.26385827414482815, "grad_norm": 17.296320200401933, "learning_rate": 1.8417017738621584e-07, "logits/chosen": -0.9016965627670288, "logits/rejected": -0.9234187006950378, "logps/chosen": -815.3984985351562, "logps/rejected": -888.422119140625, "loss": 0.5682, "rewards/accuracies": 0.75, "rewards/chosen": -3.178356885910034, "rewards/margins": 0.6038446426391602, "rewards/rejected": -3.7822020053863525, "step": 404 }, { "epoch": 0.26451138868479057, "grad_norm": 11.424520481589685, "learning_rate": 1.8404677179465918e-07, "logits/chosen": -0.9801906943321228, "logits/rejected": -1.028713583946228, "logps/chosen": -837.5343017578125, "logps/rejected": -964.4091796875, "loss": 0.5529, "rewards/accuracies": 0.625, "rewards/chosen": -3.271747589111328, "rewards/margins": 0.5638325214385986, "rewards/rejected": -3.8355798721313477, "step": 405 }, { "epoch": 0.26516450322475305, "grad_norm": 9.475445804066252, "learning_rate": 1.8392292872863267e-07, "logits/chosen": -0.8906446099281311, "logits/rejected": -0.9547228813171387, "logps/chosen": -705.4547729492188, "logps/rejected": -792.2675170898438, "loss": 0.4856, "rewards/accuracies": 0.75, "rewards/chosen": -2.604658842086792, "rewards/margins": 0.9246249198913574, "rewards/rejected": -3.5292837619781494, "step": 406 }, { "epoch": 0.26581761776471546, "grad_norm": 10.536985619426973, "learning_rate": 1.8379864883275574e-07, "logits/chosen": -0.8332792520523071, "logits/rejected": -0.8485990762710571, "logps/chosen": -736.0517578125, "logps/rejected": -838.329833984375, "loss": 0.5022, "rewards/accuracies": 0.875, "rewards/chosen": -2.6248157024383545, "rewards/margins": 0.8121111989021301, "rewards/rejected": -3.4369266033172607, "step": 407 }, { "epoch": 0.26647073230467794, "grad_norm": 10.424698516252935, "learning_rate": 1.8367393275392153e-07, "logits/chosen": -0.7887847423553467, "logits/rejected": -0.785426139831543, "logps/chosen": -787.7622680664062, "logps/rejected": -902.406005859375, "loss": 0.4796, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8995308876037598, "rewards/margins": 0.9673147201538086, "rewards/rejected": -3.8668456077575684, "step": 408 }, { "epoch": 0.26712384684464036, "grad_norm": 11.643817212358961, "learning_rate": 1.8354878114129364e-07, "logits/chosen": -0.8421759009361267, "logits/rejected": -0.6650703549385071, "logps/chosen": -787.9586791992188, "logps/rejected": -809.9829711914062, "loss": 0.5951, "rewards/accuracies": 0.65625, "rewards/chosen": -3.101632595062256, "rewards/margins": 0.4137914776802063, "rewards/rejected": -3.5154237747192383, "step": 409 }, { "epoch": 0.26777696138460283, "grad_norm": 9.561014123560037, "learning_rate": 1.8342319464630255e-07, "logits/chosen": -0.8494737148284912, "logits/rejected": -0.79804927110672, "logps/chosen": -699.1937255859375, "logps/rejected": -756.2304077148438, "loss": 0.4578, "rewards/accuracies": 0.71875, "rewards/chosen": -2.6116859912872314, "rewards/margins": 0.5693447589874268, "rewards/rejected": -3.1810309886932373, "step": 410 }, { "epoch": 0.26843007592456525, "grad_norm": 9.478844905600644, "learning_rate": 1.832971739226425e-07, "logits/chosen": -0.7649250030517578, "logits/rejected": -0.71089106798172, "logps/chosen": -726.0282592773438, "logps/rejected": -825.573486328125, "loss": 0.4979, "rewards/accuracies": 0.78125, "rewards/chosen": -2.829139232635498, "rewards/margins": 1.0223064422607422, "rewards/rejected": -3.8514456748962402, "step": 411 }, { "epoch": 0.2690831904645277, "grad_norm": 9.805450680592982, "learning_rate": 1.8317071962626787e-07, "logits/chosen": -0.7251761555671692, "logits/rejected": -0.6697893738746643, "logps/chosen": -771.7799072265625, "logps/rejected": -803.2220458984375, "loss": 0.5073, "rewards/accuracies": 0.78125, "rewards/chosen": -3.042149305343628, "rewards/margins": 0.6425183415412903, "rewards/rejected": -3.6846675872802734, "step": 412 }, { "epoch": 0.26973630500449014, "grad_norm": 10.50244351453918, "learning_rate": 1.830438324153898e-07, "logits/chosen": -0.8642072677612305, "logits/rejected": -0.8176953792572021, "logps/chosen": -699.1476440429688, "logps/rejected": -819.7857666015625, "loss": 0.4711, "rewards/accuracies": 0.90625, "rewards/chosen": -2.5096065998077393, "rewards/margins": 1.047548532485962, "rewards/rejected": -3.557155132293701, "step": 413 }, { "epoch": 0.2703894195444526, "grad_norm": 11.849045341650994, "learning_rate": 1.8291651295047295e-07, "logits/chosen": -0.9802454113960266, "logits/rejected": -0.8233616352081299, "logps/chosen": -824.2813110351562, "logps/rejected": -878.6183471679688, "loss": 0.4647, "rewards/accuracies": 0.71875, "rewards/chosen": -3.1092634201049805, "rewards/margins": 1.1087751388549805, "rewards/rejected": -4.218039035797119, "step": 414 }, { "epoch": 0.27104253408441503, "grad_norm": 13.177473741686534, "learning_rate": 1.8278876189423178e-07, "logits/chosen": -0.8334654569625854, "logits/rejected": -0.8360169529914856, "logps/chosen": -736.35302734375, "logps/rejected": -806.1726684570312, "loss": 0.4901, "rewards/accuracies": 0.75, "rewards/chosen": -2.874499559402466, "rewards/margins": 0.6783380508422852, "rewards/rejected": -3.552837371826172, "step": 415 }, { "epoch": 0.2716956486243775, "grad_norm": 11.16394727704828, "learning_rate": 1.826605799116273e-07, "logits/chosen": -0.8869356513023376, "logits/rejected": -0.8545706272125244, "logps/chosen": -725.5807495117188, "logps/rejected": -868.1986083984375, "loss": 0.4506, "rewards/accuracies": 0.875, "rewards/chosen": -2.486783504486084, "rewards/margins": 1.0288233757019043, "rewards/rejected": -3.515606641769409, "step": 416 }, { "epoch": 0.2723487631643399, "grad_norm": 11.663692808166209, "learning_rate": 1.8253196766986353e-07, "logits/chosen": -0.9029356241226196, "logits/rejected": -0.9125173091888428, "logps/chosen": -704.21533203125, "logps/rejected": -785.9641723632812, "loss": 0.4987, "rewards/accuracies": 0.75, "rewards/chosen": -2.766026735305786, "rewards/margins": 0.57279372215271, "rewards/rejected": -3.338820219039917, "step": 417 }, { "epoch": 0.2730018777043024, "grad_norm": 8.729822521753496, "learning_rate": 1.824029258383841e-07, "logits/chosen": -0.7340871095657349, "logits/rejected": -0.7211796045303345, "logps/chosen": -796.06298828125, "logps/rejected": -913.9825439453125, "loss": 0.4293, "rewards/accuracies": 0.8125, "rewards/chosen": -3.40975284576416, "rewards/margins": 1.0494564771652222, "rewards/rejected": -4.459209442138672, "step": 418 }, { "epoch": 0.2736549922442648, "grad_norm": 10.156017954308194, "learning_rate": 1.8227345508886862e-07, "logits/chosen": -0.8956701159477234, "logits/rejected": -0.8679780960083008, "logps/chosen": -777.0938110351562, "logps/rejected": -852.8966064453125, "loss": 0.5182, "rewards/accuracies": 0.84375, "rewards/chosen": -2.8934662342071533, "rewards/margins": 0.8560830950737, "rewards/rejected": -3.749549627304077, "step": 419 }, { "epoch": 0.2743081067842273, "grad_norm": 8.640630705596932, "learning_rate": 1.8214355609522934e-07, "logits/chosen": -0.821373701095581, "logits/rejected": -0.7505570650100708, "logps/chosen": -764.0152587890625, "logps/rejected": -791.8469848632812, "loss": 0.4431, "rewards/accuracies": 0.875, "rewards/chosen": -2.9267923831939697, "rewards/margins": 0.4508010745048523, "rewards/rejected": -3.377593517303467, "step": 420 }, { "epoch": 0.2749612213241897, "grad_norm": 13.171977563436153, "learning_rate": 1.8201322953360756e-07, "logits/chosen": -0.7890115976333618, "logits/rejected": -0.7287498712539673, "logps/chosen": -774.0021362304688, "logps/rejected": -767.4495239257812, "loss": 0.5292, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0860540866851807, "rewards/margins": 0.4356900155544281, "rewards/rejected": -3.5217440128326416, "step": 421 }, { "epoch": 0.2756143358641522, "grad_norm": 11.91644818150085, "learning_rate": 1.818824760823701e-07, "logits/chosen": -0.97590571641922, "logits/rejected": -0.7994486689567566, "logps/chosen": -663.6700439453125, "logps/rejected": -690.9847412109375, "loss": 0.4976, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7485709190368652, "rewards/margins": 0.5975130796432495, "rewards/rejected": -3.3460841178894043, "step": 422 }, { "epoch": 0.2762674504041146, "grad_norm": 8.62314149072794, "learning_rate": 1.8175129642210586e-07, "logits/chosen": -0.8519734740257263, "logits/rejected": -0.9017986059188843, "logps/chosen": -743.6911010742188, "logps/rejected": -866.9241333007812, "loss": 0.4576, "rewards/accuracies": 0.8125, "rewards/chosen": -2.9078967571258545, "rewards/margins": 0.8721412420272827, "rewards/rejected": -3.780038356781006, "step": 423 }, { "epoch": 0.2769205649440771, "grad_norm": 16.78129564723599, "learning_rate": 1.8161969123562217e-07, "logits/chosen": -0.8358048796653748, "logits/rejected": -0.8505182266235352, "logps/chosen": -754.3642578125, "logps/rejected": -818.9282836914062, "loss": 0.5837, "rewards/accuracies": 0.75, "rewards/chosen": -3.47281813621521, "rewards/margins": 0.578090250492096, "rewards/rejected": -4.05090856552124, "step": 424 }, { "epoch": 0.2775736794840395, "grad_norm": 17.991126052561494, "learning_rate": 1.8148766120794125e-07, "logits/chosen": -0.9127295017242432, "logits/rejected": -0.8362393379211426, "logps/chosen": -791.9930419921875, "logps/rejected": -843.0182495117188, "loss": 0.5182, "rewards/accuracies": 0.59375, "rewards/chosen": -3.202291488647461, "rewards/margins": 0.5794678330421448, "rewards/rejected": -3.78175950050354, "step": 425 }, { "epoch": 0.27822679402400197, "grad_norm": 8.6575913642869, "learning_rate": 1.8135520702629673e-07, "logits/chosen": -0.8415990471839905, "logits/rejected": -0.8752740621566772, "logps/chosen": -771.759765625, "logps/rejected": -900.0613403320312, "loss": 0.4585, "rewards/accuracies": 0.84375, "rewards/chosen": -2.997281789779663, "rewards/margins": 1.1854169368743896, "rewards/rejected": -4.182698726654053, "step": 426 }, { "epoch": 0.2788799085639644, "grad_norm": 12.81180624184605, "learning_rate": 1.8122232938013005e-07, "logits/chosen": -0.8370864391326904, "logits/rejected": -0.7737575769424438, "logps/chosen": -855.8829345703125, "logps/rejected": -937.1058349609375, "loss": 0.5611, "rewards/accuracies": 0.78125, "rewards/chosen": -3.7345356941223145, "rewards/margins": 0.9496172070503235, "rewards/rejected": -4.684152603149414, "step": 427 }, { "epoch": 0.27953302310392686, "grad_norm": 9.737215085975576, "learning_rate": 1.8108902896108668e-07, "logits/chosen": -1.0026570558547974, "logits/rejected": -0.9746906757354736, "logps/chosen": -857.1223754882812, "logps/rejected": -890.031982421875, "loss": 0.4786, "rewards/accuracies": 0.8125, "rewards/chosen": -3.579835891723633, "rewards/margins": 0.7876998782157898, "rewards/rejected": -4.367535591125488, "step": 428 }, { "epoch": 0.2801861376438893, "grad_norm": 14.133058490229988, "learning_rate": 1.8095530646301287e-07, "logits/chosen": -0.6362062096595764, "logits/rejected": -0.6157896518707275, "logps/chosen": -618.6807861328125, "logps/rejected": -654.237548828125, "loss": 0.565, "rewards/accuracies": 0.75, "rewards/chosen": -2.4280242919921875, "rewards/margins": 0.5356500148773193, "rewards/rejected": -2.9636740684509277, "step": 429 }, { "epoch": 0.28083925218385175, "grad_norm": 10.687127431453712, "learning_rate": 1.808211625819517e-07, "logits/chosen": -0.8555097579956055, "logits/rejected": -0.8589435815811157, "logps/chosen": -838.960205078125, "logps/rejected": -951.1204833984375, "loss": 0.538, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5574240684509277, "rewards/margins": 0.9826168417930603, "rewards/rejected": -4.540040969848633, "step": 430 }, { "epoch": 0.28149236672381417, "grad_norm": 10.123521301197844, "learning_rate": 1.8068659801613972e-07, "logits/chosen": -0.8224723935127258, "logits/rejected": -0.7616609334945679, "logps/chosen": -730.337646484375, "logps/rejected": -856.6796875, "loss": 0.4953, "rewards/accuracies": 0.6875, "rewards/chosen": -3.277148485183716, "rewards/margins": 1.0432851314544678, "rewards/rejected": -4.320433616638184, "step": 431 }, { "epoch": 0.28214548126377664, "grad_norm": 10.291835377836938, "learning_rate": 1.805516134660031e-07, "logits/chosen": -0.7565049529075623, "logits/rejected": -0.760954737663269, "logps/chosen": -889.3624877929688, "logps/rejected": -1007.882568359375, "loss": 0.5203, "rewards/accuracies": 0.6875, "rewards/chosen": -3.83842134475708, "rewards/margins": 1.0104892253875732, "rewards/rejected": -4.848910808563232, "step": 432 }, { "epoch": 0.28279859580373906, "grad_norm": 12.003520975391313, "learning_rate": 1.8041620963415415e-07, "logits/chosen": -0.8923713564872742, "logits/rejected": -0.9106241464614868, "logps/chosen": -777.1085205078125, "logps/rejected": -870.7319946289062, "loss": 0.4846, "rewards/accuracies": 0.71875, "rewards/chosen": -3.4676287174224854, "rewards/margins": 0.7948607206344604, "rewards/rejected": -4.262489318847656, "step": 433 }, { "epoch": 0.28345171034370154, "grad_norm": 19.10825250816021, "learning_rate": 1.8028038722538758e-07, "logits/chosen": -0.799106776714325, "logits/rejected": -0.7920759916305542, "logps/chosen": -881.0488891601562, "logps/rejected": -954.8224487304688, "loss": 0.4382, "rewards/accuracies": 0.84375, "rewards/chosen": -3.942882776260376, "rewards/margins": 0.945711076259613, "rewards/rejected": -4.888593673706055, "step": 434 }, { "epoch": 0.28410482488366395, "grad_norm": 8.867140403896737, "learning_rate": 1.8014414694667682e-07, "logits/chosen": -0.9495159983634949, "logits/rejected": -0.8596460223197937, "logps/chosen": -788.0202026367188, "logps/rejected": -844.4072265625, "loss": 0.4697, "rewards/accuracies": 0.90625, "rewards/chosen": -3.2976248264312744, "rewards/margins": 0.9562715291976929, "rewards/rejected": -4.253896713256836, "step": 435 }, { "epoch": 0.2847579394236264, "grad_norm": 11.495282189006529, "learning_rate": 1.8000748950717038e-07, "logits/chosen": -0.9166377782821655, "logits/rejected": -0.8431532979011536, "logps/chosen": -684.1121215820312, "logps/rejected": -756.2694091796875, "loss": 0.4631, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7140069007873535, "rewards/margins": 0.9040411710739136, "rewards/rejected": -3.6180481910705566, "step": 436 }, { "epoch": 0.28541105396358885, "grad_norm": 10.927848876345342, "learning_rate": 1.7987041561818816e-07, "logits/chosen": -0.962665319442749, "logits/rejected": -0.8971413969993591, "logps/chosen": -834.0377807617188, "logps/rejected": -889.9393310546875, "loss": 0.497, "rewards/accuracies": 0.625, "rewards/chosen": -3.379364013671875, "rewards/margins": 0.7841745615005493, "rewards/rejected": -4.163538932800293, "step": 437 }, { "epoch": 0.2860641685035513, "grad_norm": 11.650368628524, "learning_rate": 1.7973292599321776e-07, "logits/chosen": -0.769073486328125, "logits/rejected": -0.8234167695045471, "logps/chosen": -847.42724609375, "logps/rejected": -985.3370361328125, "loss": 0.4925, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9037604331970215, "rewards/margins": 1.1833317279815674, "rewards/rejected": -5.087091445922852, "step": 438 }, { "epoch": 0.28671728304351374, "grad_norm": 10.739880288424589, "learning_rate": 1.7959502134791067e-07, "logits/chosen": -1.0507196187973022, "logits/rejected": -0.9440267086029053, "logps/chosen": -842.0238647460938, "logps/rejected": -932.1236572265625, "loss": 0.4573, "rewards/accuracies": 0.78125, "rewards/chosen": -3.467390537261963, "rewards/margins": 1.0047094821929932, "rewards/rejected": -4.472099781036377, "step": 439 }, { "epoch": 0.2873703975834762, "grad_norm": 15.061086520968807, "learning_rate": 1.794567024000787e-07, "logits/chosen": -0.7447534799575806, "logits/rejected": -0.7312377095222473, "logps/chosen": -877.2549438476562, "logps/rejected": -976.9591064453125, "loss": 0.5878, "rewards/accuracies": 0.8125, "rewards/chosen": -3.914492607116699, "rewards/margins": 0.9334661960601807, "rewards/rejected": -4.847959041595459, "step": 440 }, { "epoch": 0.28802351212343863, "grad_norm": 11.378904599041363, "learning_rate": 1.7931796986969006e-07, "logits/chosen": -0.8733054995536804, "logits/rejected": -0.8079696297645569, "logps/chosen": -897.7353515625, "logps/rejected": -923.5629272460938, "loss": 0.5043, "rewards/accuracies": 0.8125, "rewards/chosen": -3.33026385307312, "rewards/margins": 1.067804217338562, "rewards/rejected": -4.398067951202393, "step": 441 }, { "epoch": 0.2886766266634011, "grad_norm": 8.524372415698174, "learning_rate": 1.791788244788658e-07, "logits/chosen": -0.7420032620429993, "logits/rejected": -0.7593204379081726, "logps/chosen": -712.6714477539062, "logps/rejected": -809.899169921875, "loss": 0.4781, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9074902534484863, "rewards/margins": 0.6758562326431274, "rewards/rejected": -3.5833466053009033, "step": 442 }, { "epoch": 0.2893297412033635, "grad_norm": 11.012683005962536, "learning_rate": 1.790392669518759e-07, "logits/chosen": -0.8415663242340088, "logits/rejected": -0.8038774728775024, "logps/chosen": -842.3577270507812, "logps/rejected": -907.7183837890625, "loss": 0.4267, "rewards/accuracies": 0.71875, "rewards/chosen": -3.889575481414795, "rewards/margins": 0.7025194764137268, "rewards/rejected": -4.592095375061035, "step": 443 }, { "epoch": 0.289982855743326, "grad_norm": 12.889931357077444, "learning_rate": 1.7889929801513565e-07, "logits/chosen": -0.6964072585105896, "logits/rejected": -0.6732711791992188, "logps/chosen": -691.6309814453125, "logps/rejected": -762.2383422851562, "loss": 0.5427, "rewards/accuracies": 0.8125, "rewards/chosen": -3.066455602645874, "rewards/margins": 0.6046935319900513, "rewards/rejected": -3.6711490154266357, "step": 444 }, { "epoch": 0.2906359702832884, "grad_norm": 9.731628947128165, "learning_rate": 1.787589183972017e-07, "logits/chosen": -0.9657904505729675, "logits/rejected": -0.9944557547569275, "logps/chosen": -841.9929809570312, "logps/rejected": -989.1988525390625, "loss": 0.4848, "rewards/accuracies": 0.9375, "rewards/chosen": -3.4814305305480957, "rewards/margins": 1.1318271160125732, "rewards/rejected": -4.61325740814209, "step": 445 }, { "epoch": 0.2912890848232509, "grad_norm": 8.084542897402226, "learning_rate": 1.786181288287683e-07, "logits/chosen": -0.8106024861335754, "logits/rejected": -0.7048735022544861, "logps/chosen": -859.118896484375, "logps/rejected": -954.908447265625, "loss": 0.4515, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0667357444763184, "rewards/margins": 1.035430669784546, "rewards/rejected": -4.102166652679443, "step": 446 }, { "epoch": 0.2919421993632133, "grad_norm": 14.616888201159512, "learning_rate": 1.7847693004266367e-07, "logits/chosen": -0.8901810050010681, "logits/rejected": -0.7865488529205322, "logps/chosen": -800.8839111328125, "logps/rejected": -816.6807250976562, "loss": 0.5162, "rewards/accuracies": 0.8125, "rewards/chosen": -2.978163719177246, "rewards/margins": 0.6770980954170227, "rewards/rejected": -3.655261516571045, "step": 447 }, { "epoch": 0.2925953139031758, "grad_norm": 14.867760488305729, "learning_rate": 1.7833532277384604e-07, "logits/chosen": -0.8480335474014282, "logits/rejected": -0.8119291663169861, "logps/chosen": -682.2476196289062, "logps/rejected": -718.3956909179688, "loss": 0.5572, "rewards/accuracies": 0.71875, "rewards/chosen": -2.687140941619873, "rewards/margins": 0.585093080997467, "rewards/rejected": -3.2722342014312744, "step": 448 }, { "epoch": 0.2932484284431382, "grad_norm": 12.346656292636492, "learning_rate": 1.7819330775939978e-07, "logits/chosen": -0.8786594271659851, "logits/rejected": -0.8197569251060486, "logps/chosen": -890.4720458984375, "logps/rejected": -925.9500122070312, "loss": 0.5493, "rewards/accuracies": 0.6875, "rewards/chosen": -3.352853775024414, "rewards/margins": 0.9036799669265747, "rewards/rejected": -4.256534099578857, "step": 449 }, { "epoch": 0.29390154298310067, "grad_norm": 14.374790747191138, "learning_rate": 1.780508857385317e-07, "logits/chosen": -0.776336669921875, "logits/rejected": -0.6683730483055115, "logps/chosen": -728.320068359375, "logps/rejected": -827.4588623046875, "loss": 0.5504, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0526700019836426, "rewards/margins": 1.0733567476272583, "rewards/rejected": -4.126026630401611, "step": 450 }, { "epoch": 0.2945546575230631, "grad_norm": 12.060785088732436, "learning_rate": 1.7790805745256703e-07, "logits/chosen": -0.6378570795059204, "logits/rejected": -0.5599284768104553, "logps/chosen": -788.5953979492188, "logps/rejected": -829.5595092773438, "loss": 0.4914, "rewards/accuracies": 0.6875, "rewards/chosen": -3.06785249710083, "rewards/margins": 0.6147899627685547, "rewards/rejected": -3.6826422214508057, "step": 451 }, { "epoch": 0.29520777206302556, "grad_norm": 9.973656930649923, "learning_rate": 1.7776482364494579e-07, "logits/chosen": -0.7137709856033325, "logits/rejected": -0.7110152244567871, "logps/chosen": -865.0806884765625, "logps/rejected": -980.2341918945312, "loss": 0.4741, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4852802753448486, "rewards/margins": 0.8627204895019531, "rewards/rejected": -4.348001003265381, "step": 452 }, { "epoch": 0.295860886602988, "grad_norm": 13.638033723344053, "learning_rate": 1.7762118506121873e-07, "logits/chosen": -0.8766645789146423, "logits/rejected": -0.8333470225334167, "logps/chosen": -756.6890869140625, "logps/rejected": -816.2777709960938, "loss": 0.5171, "rewards/accuracies": 0.65625, "rewards/chosen": -2.8761587142944336, "rewards/margins": 0.5043472051620483, "rewards/rejected": -3.3805060386657715, "step": 453 }, { "epoch": 0.29651400114295046, "grad_norm": 10.692905485205573, "learning_rate": 1.7747714244904346e-07, "logits/chosen": -0.7567205429077148, "logits/rejected": -0.7571355104446411, "logps/chosen": -794.180908203125, "logps/rejected": -876.698974609375, "loss": 0.4673, "rewards/accuracies": 0.875, "rewards/chosen": -3.093803644180298, "rewards/margins": 0.8284895420074463, "rewards/rejected": -3.922293186187744, "step": 454 }, { "epoch": 0.2971671156829129, "grad_norm": 9.5645932096092, "learning_rate": 1.7733269655818076e-07, "logits/chosen": -0.8513656258583069, "logits/rejected": -0.7877851724624634, "logps/chosen": -722.7457885742188, "logps/rejected": -867.5543212890625, "loss": 0.5202, "rewards/accuracies": 0.71875, "rewards/chosen": -3.243468999862671, "rewards/margins": 0.9194523692131042, "rewards/rejected": -4.16292142868042, "step": 455 }, { "epoch": 0.29782023022287535, "grad_norm": 17.81961025980672, "learning_rate": 1.7718784814049036e-07, "logits/chosen": -0.8771448135375977, "logits/rejected": -0.8124470710754395, "logps/chosen": -787.6842651367188, "logps/rejected": -852.7453002929688, "loss": 0.477, "rewards/accuracies": 0.78125, "rewards/chosen": -2.917278528213501, "rewards/margins": 0.9118640422821045, "rewards/rejected": -3.8291425704956055, "step": 456 }, { "epoch": 0.29847334476283777, "grad_norm": 10.521938568244815, "learning_rate": 1.770425979499273e-07, "logits/chosen": -0.8637176752090454, "logits/rejected": -0.8220848441123962, "logps/chosen": -747.9457397460938, "logps/rejected": -764.8699340820312, "loss": 0.5601, "rewards/accuracies": 0.65625, "rewards/chosen": -3.3360836505889893, "rewards/margins": 0.39987143874168396, "rewards/rejected": -3.7359557151794434, "step": 457 }, { "epoch": 0.29912645930280024, "grad_norm": 13.778767197295677, "learning_rate": 1.7689694674253784e-07, "logits/chosen": -0.8967228531837463, "logits/rejected": -0.8589329719543457, "logps/chosen": -817.915771484375, "logps/rejected": -875.2073974609375, "loss": 0.4576, "rewards/accuracies": 0.90625, "rewards/chosen": -3.1167476177215576, "rewards/margins": 1.0523903369903564, "rewards/rejected": -4.169137954711914, "step": 458 }, { "epoch": 0.29977957384276266, "grad_norm": 10.659382563282882, "learning_rate": 1.7675089527645568e-07, "logits/chosen": -0.940302312374115, "logits/rejected": -0.8791934251785278, "logps/chosen": -788.9204711914062, "logps/rejected": -813.0576782226562, "loss": 0.4806, "rewards/accuracies": 0.65625, "rewards/chosen": -2.944727659225464, "rewards/margins": 0.4880099892616272, "rewards/rejected": -3.4327378273010254, "step": 459 }, { "epoch": 0.30043268838272513, "grad_norm": 20.58820840229225, "learning_rate": 1.766044443118978e-07, "logits/chosen": -0.9948773384094238, "logits/rejected": -0.871425211429596, "logps/chosen": -749.33154296875, "logps/rejected": -809.860595703125, "loss": 0.4706, "rewards/accuracies": 0.84375, "rewards/chosen": -2.934537410736084, "rewards/margins": 1.2990574836730957, "rewards/rejected": -4.2335944175720215, "step": 460 }, { "epoch": 0.30108580292268755, "grad_norm": 8.561878783064682, "learning_rate": 1.764575946111607e-07, "logits/chosen": -0.8827503323554993, "logits/rejected": -0.8580729365348816, "logps/chosen": -705.9119873046875, "logps/rejected": -750.5169677734375, "loss": 0.5087, "rewards/accuracies": 0.65625, "rewards/chosen": -2.4296109676361084, "rewards/margins": 0.5230472087860107, "rewards/rejected": -2.95265793800354, "step": 461 }, { "epoch": 0.30173891746265, "grad_norm": 10.058893086562453, "learning_rate": 1.7631034693861633e-07, "logits/chosen": -0.8686078190803528, "logits/rejected": -0.8488516807556152, "logps/chosen": -806.1150512695312, "logps/rejected": -868.5146484375, "loss": 0.5019, "rewards/accuracies": 0.875, "rewards/chosen": -3.1653387546539307, "rewards/margins": 0.8287073373794556, "rewards/rejected": -3.994046211242676, "step": 462 }, { "epoch": 0.30239203200261244, "grad_norm": 10.19039827134051, "learning_rate": 1.7616270206070811e-07, "logits/chosen": -0.8264525532722473, "logits/rejected": -0.7664126753807068, "logps/chosen": -784.3181762695312, "logps/rejected": -865.5259399414062, "loss": 0.4936, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1958301067352295, "rewards/margins": 0.8845147490501404, "rewards/rejected": -4.080345153808594, "step": 463 }, { "epoch": 0.3030451465425749, "grad_norm": 13.006511852492745, "learning_rate": 1.7601466074594705e-07, "logits/chosen": -0.7952624559402466, "logits/rejected": -0.7655043601989746, "logps/chosen": -725.4571533203125, "logps/rejected": -828.1234741210938, "loss": 0.4174, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6289877891540527, "rewards/margins": 0.9357907772064209, "rewards/rejected": -3.5647785663604736, "step": 464 }, { "epoch": 0.30369826108253734, "grad_norm": 10.478801051027006, "learning_rate": 1.7586622376490755e-07, "logits/chosen": -0.8945034742355347, "logits/rejected": -0.8918284773826599, "logps/chosen": -796.9361572265625, "logps/rejected": -890.4472045898438, "loss": 0.45, "rewards/accuracies": 0.75, "rewards/chosen": -3.1761415004730225, "rewards/margins": 0.8999980688095093, "rewards/rejected": -4.0761399269104, "step": 465 }, { "epoch": 0.3043513756224998, "grad_norm": 9.345422496827316, "learning_rate": 1.7571739189022363e-07, "logits/chosen": -0.7118120193481445, "logits/rejected": -0.6660512089729309, "logps/chosen": -831.6751708984375, "logps/rejected": -888.5519409179688, "loss": 0.4966, "rewards/accuracies": 0.84375, "rewards/chosen": -3.2425475120544434, "rewards/margins": 0.8526361584663391, "rewards/rejected": -4.095183849334717, "step": 466 }, { "epoch": 0.3050044901624622, "grad_norm": 10.501170102059676, "learning_rate": 1.7556816589658463e-07, "logits/chosen": -0.7806082963943481, "logits/rejected": -0.7936585545539856, "logps/chosen": -868.0274047851562, "logps/rejected": -948.4763793945312, "loss": 0.5316, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5230109691619873, "rewards/margins": 0.8672765493392944, "rewards/rejected": -4.390287399291992, "step": 467 }, { "epoch": 0.3056576047024247, "grad_norm": 16.5631467557302, "learning_rate": 1.754185465607315e-07, "logits/chosen": -0.8917798399925232, "logits/rejected": -0.8434329628944397, "logps/chosen": -786.859375, "logps/rejected": -862.1466064453125, "loss": 0.4638, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0098447799682617, "rewards/margins": 0.9299499988555908, "rewards/rejected": -3.9397945404052734, "step": 468 }, { "epoch": 0.3063107192423871, "grad_norm": 13.399040798349395, "learning_rate": 1.7526853466145243e-07, "logits/chosen": -0.7421934604644775, "logits/rejected": -0.705778956413269, "logps/chosen": -756.4699096679688, "logps/rejected": -843.1365356445312, "loss": 0.48, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0707902908325195, "rewards/margins": 1.0485197305679321, "rewards/rejected": -4.119309902191162, "step": 469 }, { "epoch": 0.3069638337823496, "grad_norm": 10.847863165499646, "learning_rate": 1.751181309795791e-07, "logits/chosen": -0.7847117185592651, "logits/rejected": -0.8583936095237732, "logps/chosen": -699.90087890625, "logps/rejected": -841.529541015625, "loss": 0.4531, "rewards/accuracies": 0.78125, "rewards/chosen": -2.96787691116333, "rewards/margins": 0.8856784105300903, "rewards/rejected": -3.853555202484131, "step": 470 }, { "epoch": 0.307616948322312, "grad_norm": 10.623297131564883, "learning_rate": 1.7496733629798236e-07, "logits/chosen": -0.7139683961868286, "logits/rejected": -0.7501896023750305, "logps/chosen": -771.0042114257812, "logps/rejected": -972.8804931640625, "loss": 0.5007, "rewards/accuracies": 0.8125, "rewards/chosen": -3.053046464920044, "rewards/margins": 1.4686146974563599, "rewards/rejected": -4.521660804748535, "step": 471 }, { "epoch": 0.3082700628622745, "grad_norm": 9.209388737884883, "learning_rate": 1.7481615140156833e-07, "logits/chosen": -0.8231265544891357, "logits/rejected": -0.8001554608345032, "logps/chosen": -827.5318603515625, "logps/rejected": -872.6943359375, "loss": 0.4664, "rewards/accuracies": 0.6875, "rewards/chosen": -3.0368971824645996, "rewards/margins": 0.6198225021362305, "rewards/rejected": -3.656719923019409, "step": 472 }, { "epoch": 0.3089231774022369, "grad_norm": 9.143141888933894, "learning_rate": 1.746645770772742e-07, "logits/chosen": -0.7907102108001709, "logits/rejected": -0.7144389748573303, "logps/chosen": -691.955322265625, "logps/rejected": -750.7633056640625, "loss": 0.461, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7297940254211426, "rewards/margins": 0.870061457157135, "rewards/rejected": -3.599855422973633, "step": 473 }, { "epoch": 0.3095762919421994, "grad_norm": 12.054264508592999, "learning_rate": 1.7451261411406418e-07, "logits/chosen": -0.7955212593078613, "logits/rejected": -0.7325343489646912, "logps/chosen": -816.0054321289062, "logps/rejected": -856.5308837890625, "loss": 0.4835, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0509090423583984, "rewards/margins": 0.724595844745636, "rewards/rejected": -3.7755050659179688, "step": 474 }, { "epoch": 0.3102294064821618, "grad_norm": 10.79946259326026, "learning_rate": 1.743602633029255e-07, "logits/chosen": -0.8787297606468201, "logits/rejected": -0.8972852826118469, "logps/chosen": -676.2521362304688, "logps/rejected": -836.7784423828125, "loss": 0.438, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7497074604034424, "rewards/margins": 1.409981369972229, "rewards/rejected": -4.159688949584961, "step": 475 }, { "epoch": 0.31088252102212427, "grad_norm": 9.53436829750779, "learning_rate": 1.7420752543686404e-07, "logits/chosen": -0.879591166973114, "logits/rejected": -0.8345413208007812, "logps/chosen": -809.7759399414062, "logps/rejected": -808.8768310546875, "loss": 0.4768, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2994282245635986, "rewards/margins": 0.5916791558265686, "rewards/rejected": -3.8911075592041016, "step": 476 }, { "epoch": 0.3115356355620867, "grad_norm": 9.592920851797404, "learning_rate": 1.7405440131090046e-07, "logits/chosen": -0.9241241216659546, "logits/rejected": -0.8796841502189636, "logps/chosen": -859.171142578125, "logps/rejected": -917.967041015625, "loss": 0.4416, "rewards/accuracies": 0.875, "rewards/chosen": -3.2931156158447266, "rewards/margins": 1.0294749736785889, "rewards/rejected": -4.3225908279418945, "step": 477 }, { "epoch": 0.31218875010204916, "grad_norm": 19.895354822148523, "learning_rate": 1.739008917220659e-07, "logits/chosen": -0.9288607835769653, "logits/rejected": -0.8922178745269775, "logps/chosen": -867.878173828125, "logps/rejected": -910.6178588867188, "loss": 0.5138, "rewards/accuracies": 0.6875, "rewards/chosen": -3.8489792346954346, "rewards/margins": 0.71727454662323, "rewards/rejected": -4.566253662109375, "step": 478 }, { "epoch": 0.3128418646420116, "grad_norm": 10.578879623870158, "learning_rate": 1.737469974693979e-07, "logits/chosen": -0.8969875574111938, "logits/rejected": -0.8435537219047546, "logps/chosen": -891.6130981445312, "logps/rejected": -1026.607177734375, "loss": 0.4548, "rewards/accuracies": 0.875, "rewards/chosen": -3.6824443340301514, "rewards/margins": 1.2574917078018188, "rewards/rejected": -4.939936637878418, "step": 479 }, { "epoch": 0.31349497918197405, "grad_norm": 13.352402212334288, "learning_rate": 1.735927193539363e-07, "logits/chosen": -0.8216950297355652, "logits/rejected": -0.7246571183204651, "logps/chosen": -759.7047119140625, "logps/rejected": -856.9074096679688, "loss": 0.4726, "rewards/accuracies": 0.96875, "rewards/chosen": -2.9738543033599854, "rewards/margins": 1.1688733100891113, "rewards/rejected": -4.142727851867676, "step": 480 }, { "epoch": 0.31414809372193647, "grad_norm": 11.869022792008412, "learning_rate": 1.7343805817871884e-07, "logits/chosen": -0.8111241459846497, "logits/rejected": -0.766447901725769, "logps/chosen": -888.6497802734375, "logps/rejected": -962.759765625, "loss": 0.4559, "rewards/accuracies": 0.875, "rewards/chosen": -3.6871464252471924, "rewards/margins": 1.016096830368042, "rewards/rejected": -4.703243255615234, "step": 481 }, { "epoch": 0.31480120826189895, "grad_norm": 14.231372028726907, "learning_rate": 1.7328301474877723e-07, "logits/chosen": -0.9776662588119507, "logits/rejected": -0.9157834649085999, "logps/chosen": -805.4437866210938, "logps/rejected": -909.315673828125, "loss": 0.4274, "rewards/accuracies": 0.75, "rewards/chosen": -3.1832499504089355, "rewards/margins": 0.964979887008667, "rewards/rejected": -4.148229598999023, "step": 482 }, { "epoch": 0.31545432280186136, "grad_norm": 13.141716217042305, "learning_rate": 1.731275898711329e-07, "logits/chosen": -0.6212536692619324, "logits/rejected": -0.658240795135498, "logps/chosen": -847.568115234375, "logps/rejected": -932.3016967773438, "loss": 0.4936, "rewards/accuracies": 0.875, "rewards/chosen": -3.397820472717285, "rewards/margins": 0.9783123135566711, "rewards/rejected": -4.376132488250732, "step": 483 }, { "epoch": 0.31610743734182384, "grad_norm": 11.377521596330835, "learning_rate": 1.7297178435479267e-07, "logits/chosen": -0.6372016668319702, "logits/rejected": -0.6643779873847961, "logps/chosen": -765.1443481445312, "logps/rejected": -921.8984985351562, "loss": 0.4603, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6738381385803223, "rewards/margins": 1.214465856552124, "rewards/rejected": -4.888303756713867, "step": 484 }, { "epoch": 0.31676055188178626, "grad_norm": 10.093475452329823, "learning_rate": 1.7281559901074472e-07, "logits/chosen": -0.6943804621696472, "logits/rejected": -0.7027254104614258, "logps/chosen": -842.590576171875, "logps/rejected": -997.2478637695312, "loss": 0.4715, "rewards/accuracies": 0.875, "rewards/chosen": -3.864556312561035, "rewards/margins": 1.3417086601257324, "rewards/rejected": -5.206264972686768, "step": 485 }, { "epoch": 0.31741366642174873, "grad_norm": 11.537418032619593, "learning_rate": 1.7265903465195427e-07, "logits/chosen": -0.7280626893043518, "logits/rejected": -0.6906183958053589, "logps/chosen": -821.1693115234375, "logps/rejected": -924.5863037109375, "loss": 0.4554, "rewards/accuracies": 0.75, "rewards/chosen": -3.9434003829956055, "rewards/margins": 1.0203464031219482, "rewards/rejected": -4.963746547698975, "step": 486 }, { "epoch": 0.31806678096171115, "grad_norm": 12.21030922364517, "learning_rate": 1.7250209209335926e-07, "logits/chosen": -0.8060334920883179, "logits/rejected": -0.84015953540802, "logps/chosen": -825.5, "logps/rejected": -943.2487182617188, "loss": 0.5405, "rewards/accuracies": 0.625, "rewards/chosen": -3.8686962127685547, "rewards/margins": 0.5992237329483032, "rewards/rejected": -4.467920303344727, "step": 487 }, { "epoch": 0.3187198955016736, "grad_norm": 11.897072585977261, "learning_rate": 1.7234477215186636e-07, "logits/chosen": -0.7122973799705505, "logits/rejected": -0.7898483276367188, "logps/chosen": -792.6326904296875, "logps/rejected": -949.6519165039062, "loss": 0.4476, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5841124057769775, "rewards/margins": 1.0407400131225586, "rewards/rejected": -4.624852657318115, "step": 488 }, { "epoch": 0.31937301004163604, "grad_norm": 12.129067136065453, "learning_rate": 1.721870756463465e-07, "logits/chosen": -0.880097508430481, "logits/rejected": -0.7950392365455627, "logps/chosen": -844.4869995117188, "logps/rejected": -939.5963745117188, "loss": 0.4325, "rewards/accuracies": 0.90625, "rewards/chosen": -4.066311359405518, "rewards/margins": 1.0431554317474365, "rewards/rejected": -5.109466552734375, "step": 489 }, { "epoch": 0.3200261245815985, "grad_norm": 11.35584429849149, "learning_rate": 1.7202900339763064e-07, "logits/chosen": -0.841032087802887, "logits/rejected": -0.799349308013916, "logps/chosen": -821.346923828125, "logps/rejected": -912.546630859375, "loss": 0.4661, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8169021606445312, "rewards/margins": 1.0876867771148682, "rewards/rejected": -4.90458869934082, "step": 490 }, { "epoch": 0.32067923912156093, "grad_norm": 15.041059756588488, "learning_rate": 1.7187055622850558e-07, "logits/chosen": -0.930271327495575, "logits/rejected": -0.6977147459983826, "logps/chosen": -942.312744140625, "logps/rejected": -976.447509765625, "loss": 0.5137, "rewards/accuracies": 0.75, "rewards/chosen": -4.337576866149902, "rewards/margins": 0.8670297861099243, "rewards/rejected": -5.204606533050537, "step": 491 }, { "epoch": 0.3213323536615234, "grad_norm": 16.154803531987202, "learning_rate": 1.717117349637096e-07, "logits/chosen": -0.728052020072937, "logits/rejected": -0.7592598795890808, "logps/chosen": -822.1433715820312, "logps/rejected": -954.33642578125, "loss": 0.5059, "rewards/accuracies": 0.78125, "rewards/chosen": -4.139540672302246, "rewards/margins": 1.180368423461914, "rewards/rejected": -5.319908142089844, "step": 492 }, { "epoch": 0.3219854682014858, "grad_norm": 19.037823685112883, "learning_rate": 1.7155254042992825e-07, "logits/chosen": -0.8321860432624817, "logits/rejected": -0.7546296119689941, "logps/chosen": -870.90234375, "logps/rejected": -969.2771606445312, "loss": 0.5401, "rewards/accuracies": 0.625, "rewards/chosen": -3.6551272869110107, "rewards/margins": 0.9787754416465759, "rewards/rejected": -4.6339030265808105, "step": 493 }, { "epoch": 0.3226385827414483, "grad_norm": 12.255497260985175, "learning_rate": 1.7139297345578992e-07, "logits/chosen": -0.8148177862167358, "logits/rejected": -0.8120337724685669, "logps/chosen": -818.4249267578125, "logps/rejected": -935.2034912109375, "loss": 0.4886, "rewards/accuracies": 0.84375, "rewards/chosen": -3.5292749404907227, "rewards/margins": 1.132380723953247, "rewards/rejected": -4.661655902862549, "step": 494 }, { "epoch": 0.3232916972814107, "grad_norm": 12.380876583809687, "learning_rate": 1.7123303487186163e-07, "logits/chosen": -0.8092105984687805, "logits/rejected": -0.802417516708374, "logps/chosen": -872.7374267578125, "logps/rejected": -993.07275390625, "loss": 0.4734, "rewards/accuracies": 0.875, "rewards/chosen": -3.427180290222168, "rewards/margins": 1.040173053741455, "rewards/rejected": -4.467353343963623, "step": 495 }, { "epoch": 0.3239448118213732, "grad_norm": 13.011638430489379, "learning_rate": 1.710727255106447e-07, "logits/chosen": -0.9116150736808777, "logits/rejected": -0.8956907987594604, "logps/chosen": -693.85986328125, "logps/rejected": -819.0209350585938, "loss": 0.4843, "rewards/accuracies": 0.75, "rewards/chosen": -3.104858636856079, "rewards/margins": 1.0460671186447144, "rewards/rejected": -4.150925159454346, "step": 496 }, { "epoch": 0.3245979263613356, "grad_norm": 13.157305554266493, "learning_rate": 1.7091204620657042e-07, "logits/chosen": -0.9443058967590332, "logits/rejected": -0.8911035060882568, "logps/chosen": -843.8384399414062, "logps/rejected": -942.85693359375, "loss": 0.4831, "rewards/accuracies": 0.75, "rewards/chosen": -4.061310768127441, "rewards/margins": 1.07249116897583, "rewards/rejected": -5.13380241394043, "step": 497 }, { "epoch": 0.3252510409012981, "grad_norm": 13.572167660903137, "learning_rate": 1.707509977959956e-07, "logits/chosen": -0.8992602229118347, "logits/rejected": -0.900657057762146, "logps/chosen": -860.68115234375, "logps/rejected": -970.5743408203125, "loss": 0.4518, "rewards/accuracies": 0.84375, "rewards/chosen": -3.872074604034424, "rewards/margins": 1.0880197286605835, "rewards/rejected": -4.960093975067139, "step": 498 }, { "epoch": 0.3259041554412605, "grad_norm": 13.585680096052258, "learning_rate": 1.7058958111719835e-07, "logits/chosen": -0.7684957385063171, "logits/rejected": -0.7548912167549133, "logps/chosen": -855.7137451171875, "logps/rejected": -943.9931030273438, "loss": 0.4719, "rewards/accuracies": 0.84375, "rewards/chosen": -3.692744731903076, "rewards/margins": 1.0405030250549316, "rewards/rejected": -4.733248233795166, "step": 499 }, { "epoch": 0.326557269981223, "grad_norm": 9.503533989046367, "learning_rate": 1.704277970103736e-07, "logits/chosen": -0.8868258595466614, "logits/rejected": -0.8633951544761658, "logps/chosen": -849.567626953125, "logps/rejected": -934.948486328125, "loss": 0.4314, "rewards/accuracies": 0.75, "rewards/chosen": -3.8034567832946777, "rewards/margins": 0.9566541314125061, "rewards/rejected": -4.760110855102539, "step": 500 }, { "epoch": 0.326557269981223, "eval_logits/chosen": -0.669052243232727, "eval_logits/rejected": -0.6189415454864502, "eval_logps/chosen": -840.8739624023438, "eval_logps/rejected": -925.4282836914062, "eval_loss": 0.4787108898162842, "eval_rewards/accuracies": 0.7630000114440918, "eval_rewards/chosen": -3.6696672439575195, "eval_rewards/margins": 0.9329305291175842, "eval_rewards/rejected": -4.602597236633301, "eval_runtime": 615.294, "eval_samples_per_second": 6.501, "eval_steps_per_second": 0.406, "step": 500 }, { "epoch": 0.3272103845211854, "grad_norm": 15.147382044291335, "learning_rate": 1.7026564631762895e-07, "logits/chosen": -0.7561497092247009, "logits/rejected": -0.7421559691429138, "logps/chosen": -769.6187133789062, "logps/rejected": -881.5009765625, "loss": 0.5245, "rewards/accuracies": 0.84375, "rewards/chosen": -3.4119911193847656, "rewards/margins": 1.0296821594238281, "rewards/rejected": -4.441673755645752, "step": 501 }, { "epoch": 0.32786349906114787, "grad_norm": 10.968419509869916, "learning_rate": 1.7010312988297993e-07, "logits/chosen": -0.8724136352539062, "logits/rejected": -0.9291731119155884, "logps/chosen": -879.1983032226562, "logps/rejected": -1025.105712890625, "loss": 0.4349, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4728522300720215, "rewards/margins": 1.3036978244781494, "rewards/rejected": -4.776549816131592, "step": 502 }, { "epoch": 0.3285166136011103, "grad_norm": 10.141199307712194, "learning_rate": 1.6994024855234592e-07, "logits/chosen": -0.8684702515602112, "logits/rejected": -0.8307234048843384, "logps/chosen": -816.9854125976562, "logps/rejected": -867.1416015625, "loss": 0.4723, "rewards/accuracies": 0.75, "rewards/chosen": -3.5080981254577637, "rewards/margins": 0.7219822406768799, "rewards/rejected": -4.2300801277160645, "step": 503 }, { "epoch": 0.32916972814107276, "grad_norm": 14.673164446644734, "learning_rate": 1.6977700317354565e-07, "logits/chosen": -0.7906508445739746, "logits/rejected": -0.7534193992614746, "logps/chosen": -878.6422119140625, "logps/rejected": -905.3970947265625, "loss": 0.4902, "rewards/accuracies": 0.71875, "rewards/chosen": -3.976510524749756, "rewards/margins": 0.7006040215492249, "rewards/rejected": -4.677114963531494, "step": 504 }, { "epoch": 0.3298228426810352, "grad_norm": 9.7205031042585, "learning_rate": 1.6961339459629266e-07, "logits/chosen": -0.8221117258071899, "logits/rejected": -0.7521359920501709, "logps/chosen": -899.8609619140625, "logps/rejected": -955.1153564453125, "loss": 0.4702, "rewards/accuracies": 0.75, "rewards/chosen": -4.543219566345215, "rewards/margins": 0.982782781124115, "rewards/rejected": -5.526001930236816, "step": 505 }, { "epoch": 0.33047595722099765, "grad_norm": 12.414035243869511, "learning_rate": 1.694494236721911e-07, "logits/chosen": -1.0194346904754639, "logits/rejected": -0.9434860944747925, "logps/chosen": -840.7852172851562, "logps/rejected": -910.69140625, "loss": 0.452, "rewards/accuracies": 0.75, "rewards/chosen": -3.716850996017456, "rewards/margins": 0.9896608591079712, "rewards/rejected": -4.706511497497559, "step": 506 }, { "epoch": 0.33112907176096007, "grad_norm": 16.29842962578957, "learning_rate": 1.6928509125473109e-07, "logits/chosen": -0.9049715995788574, "logits/rejected": -0.9679229855537415, "logps/chosen": -842.1791381835938, "logps/rejected": -965.1322631835938, "loss": 0.5553, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8672585487365723, "rewards/margins": 1.0312442779541016, "rewards/rejected": -4.898502826690674, "step": 507 }, { "epoch": 0.33178218630092254, "grad_norm": 11.824506292597153, "learning_rate": 1.6912039819928444e-07, "logits/chosen": -0.9393632411956787, "logits/rejected": -0.8065717816352844, "logps/chosen": -909.3609008789062, "logps/rejected": -944.26806640625, "loss": 0.4606, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5138916969299316, "rewards/margins": 1.1851271390914917, "rewards/rejected": -4.699018955230713, "step": 508 }, { "epoch": 0.33243530084088496, "grad_norm": 13.178303937450522, "learning_rate": 1.6895534536310016e-07, "logits/chosen": -0.8816686868667603, "logits/rejected": -0.9407558441162109, "logps/chosen": -825.6542358398438, "logps/rejected": -1041.97900390625, "loss": 0.5393, "rewards/accuracies": 0.75, "rewards/chosen": -3.608908176422119, "rewards/margins": 1.2655761241912842, "rewards/rejected": -4.874484062194824, "step": 509 }, { "epoch": 0.33308841538084744, "grad_norm": 14.045988235199069, "learning_rate": 1.6878993360529982e-07, "logits/chosen": -1.021334171295166, "logits/rejected": -0.8892602920532227, "logps/chosen": -886.7183837890625, "logps/rejected": -941.6819458007812, "loss": 0.4527, "rewards/accuracies": 0.90625, "rewards/chosen": -4.115851402282715, "rewards/margins": 0.8323267698287964, "rewards/rejected": -4.948177814483643, "step": 510 }, { "epoch": 0.33374152992080985, "grad_norm": 13.698262023902034, "learning_rate": 1.6862416378687336e-07, "logits/chosen": -0.9176092743873596, "logits/rejected": -0.834775447845459, "logps/chosen": -844.972412109375, "logps/rejected": -964.8734741210938, "loss": 0.5024, "rewards/accuracies": 0.71875, "rewards/chosen": -3.549609899520874, "rewards/margins": 1.238707423210144, "rewards/rejected": -4.7883172035217285, "step": 511 }, { "epoch": 0.33439464446077233, "grad_norm": 13.51360210625101, "learning_rate": 1.684580367706744e-07, "logits/chosen": -0.9627776145935059, "logits/rejected": -0.9435927271842957, "logps/chosen": -916.3948364257812, "logps/rejected": -1032.2822265625, "loss": 0.4764, "rewards/accuracies": 0.84375, "rewards/chosen": -4.037001609802246, "rewards/margins": 1.3188434839248657, "rewards/rejected": -5.355844497680664, "step": 512 }, { "epoch": 0.33504775900073475, "grad_norm": 12.919673658707383, "learning_rate": 1.6829155342141586e-07, "logits/chosen": -0.7160369157791138, "logits/rejected": -0.7462060451507568, "logps/chosen": -783.5517578125, "logps/rejected": -832.207275390625, "loss": 0.4687, "rewards/accuracies": 0.75, "rewards/chosen": -3.499671220779419, "rewards/margins": 0.49122992157936096, "rewards/rejected": -3.990901231765747, "step": 513 }, { "epoch": 0.3357008735406972, "grad_norm": 16.359086592503015, "learning_rate": 1.681247146056654e-07, "logits/chosen": -0.9952366948127747, "logits/rejected": -0.8325803875923157, "logps/chosen": -982.23828125, "logps/rejected": -1018.0582275390625, "loss": 0.5182, "rewards/accuracies": 0.59375, "rewards/chosen": -4.51038122177124, "rewards/margins": 0.5027068853378296, "rewards/rejected": -5.013088226318359, "step": 514 }, { "epoch": 0.33635398808065964, "grad_norm": 12.599243151717252, "learning_rate": 1.6795752119184096e-07, "logits/chosen": -0.7795126438140869, "logits/rejected": -0.7431491017341614, "logps/chosen": -834.5680541992188, "logps/rejected": -903.7908935546875, "loss": 0.4307, "rewards/accuracies": 0.65625, "rewards/chosen": -3.54948091506958, "rewards/margins": 0.811899721622467, "rewards/rejected": -4.3613810539245605, "step": 515 }, { "epoch": 0.3370071026206221, "grad_norm": 23.221262136804224, "learning_rate": 1.6778997405020616e-07, "logits/chosen": -0.8013187646865845, "logits/rejected": -0.7317193746566772, "logps/chosen": -978.86865234375, "logps/rejected": -1063.277587890625, "loss": 0.5415, "rewards/accuracies": 0.75, "rewards/chosen": -3.981942653656006, "rewards/margins": 1.0086599588394165, "rewards/rejected": -4.990602016448975, "step": 516 }, { "epoch": 0.33766021716058453, "grad_norm": 10.605337134032375, "learning_rate": 1.6762207405286586e-07, "logits/chosen": -0.62137770652771, "logits/rejected": -0.6129266619682312, "logps/chosen": -876.7236328125, "logps/rejected": -995.8892211914062, "loss": 0.5172, "rewards/accuracies": 0.78125, "rewards/chosen": -3.8768973350524902, "rewards/margins": 1.032037377357483, "rewards/rejected": -4.908934593200684, "step": 517 }, { "epoch": 0.338313331700547, "grad_norm": 12.079127195544842, "learning_rate": 1.6745382207376156e-07, "logits/chosen": -0.8155550956726074, "logits/rejected": -0.7862538695335388, "logps/chosen": -860.969482421875, "logps/rejected": -986.3485717773438, "loss": 0.5162, "rewards/accuracies": 0.78125, "rewards/chosen": -3.910642385482788, "rewards/margins": 1.0282100439071655, "rewards/rejected": -4.938852787017822, "step": 518 }, { "epoch": 0.3389664462405094, "grad_norm": 12.099481524777941, "learning_rate": 1.6728521898866686e-07, "logits/chosen": -0.9201481342315674, "logits/rejected": -0.9237188696861267, "logps/chosen": -874.0413208007812, "logps/rejected": -971.56494140625, "loss": 0.4838, "rewards/accuracies": 0.71875, "rewards/chosen": -3.7183971405029297, "rewards/margins": 0.771868109703064, "rewards/rejected": -4.490265846252441, "step": 519 }, { "epoch": 0.3396195607804719, "grad_norm": 12.606539455519421, "learning_rate": 1.6711626567518297e-07, "logits/chosen": -1.0150597095489502, "logits/rejected": -1.0211997032165527, "logps/chosen": -820.2203369140625, "logps/rejected": -894.1910400390625, "loss": 0.4724, "rewards/accuracies": 0.75, "rewards/chosen": -3.321272134780884, "rewards/margins": 0.7952224612236023, "rewards/rejected": -4.116494655609131, "step": 520 }, { "epoch": 0.3402726753204343, "grad_norm": 9.918000148345703, "learning_rate": 1.6694696301273402e-07, "logits/chosen": -0.7031683325767517, "logits/rejected": -0.5711356401443481, "logps/chosen": -777.73193359375, "logps/rejected": -904.4061889648438, "loss": 0.4397, "rewards/accuracies": 0.875, "rewards/chosen": -3.679725170135498, "rewards/margins": 1.2326750755310059, "rewards/rejected": -4.912400245666504, "step": 521 }, { "epoch": 0.3409257898603968, "grad_norm": 9.780704201122273, "learning_rate": 1.6677731188256257e-07, "logits/chosen": -0.9383131861686707, "logits/rejected": -0.9517966508865356, "logps/chosen": -805.9428100585938, "logps/rejected": -924.341552734375, "loss": 0.4209, "rewards/accuracies": 0.84375, "rewards/chosen": -3.364539384841919, "rewards/margins": 1.0484657287597656, "rewards/rejected": -4.4130048751831055, "step": 522 }, { "epoch": 0.3415789044003592, "grad_norm": 11.866203216090135, "learning_rate": 1.6660731316772502e-07, "logits/chosen": -0.796733021736145, "logits/rejected": -0.7914215326309204, "logps/chosen": -848.482421875, "logps/rejected": -931.0703125, "loss": 0.5249, "rewards/accuracies": 0.75, "rewards/chosen": -3.7563669681549072, "rewards/margins": 0.6706068515777588, "rewards/rejected": -4.426973819732666, "step": 523 }, { "epoch": 0.3422320189403217, "grad_norm": 13.399215080432462, "learning_rate": 1.6643696775308694e-07, "logits/chosen": -0.8221777081489563, "logits/rejected": -0.8325170874595642, "logps/chosen": -776.5897216796875, "logps/rejected": -887.011962890625, "loss": 0.4454, "rewards/accuracies": 0.875, "rewards/chosen": -3.084960699081421, "rewards/margins": 1.0111218690872192, "rewards/rejected": -4.09608268737793, "step": 524 }, { "epoch": 0.3428851334802841, "grad_norm": 13.323830997149015, "learning_rate": 1.662662765253186e-07, "logits/chosen": -0.9103497266769409, "logits/rejected": -0.9971468448638916, "logps/chosen": -743.458740234375, "logps/rejected": -852.3600463867188, "loss": 0.4597, "rewards/accuracies": 0.875, "rewards/chosen": -3.1231021881103516, "rewards/margins": 0.6584322452545166, "rewards/rejected": -3.7815346717834473, "step": 525 }, { "epoch": 0.3435382480202466, "grad_norm": 11.479976007473763, "learning_rate": 1.6609524037289016e-07, "logits/chosen": -0.8146867752075195, "logits/rejected": -0.8755562901496887, "logps/chosen": -821.8370971679688, "logps/rejected": -998.6825561523438, "loss": 0.3882, "rewards/accuracies": 0.84375, "rewards/chosen": -3.435415744781494, "rewards/margins": 1.529471755027771, "rewards/rejected": -4.964887619018555, "step": 526 }, { "epoch": 0.344191362560209, "grad_norm": 16.99793649417262, "learning_rate": 1.6592386018606735e-07, "logits/chosen": -0.7995474934577942, "logits/rejected": -0.8219078779220581, "logps/chosen": -768.4069213867188, "logps/rejected": -880.92724609375, "loss": 0.505, "rewards/accuracies": 0.6875, "rewards/chosen": -3.4240992069244385, "rewards/margins": 0.8240652680397034, "rewards/rejected": -4.248164653778076, "step": 527 }, { "epoch": 0.34484447710017146, "grad_norm": 22.950737519034714, "learning_rate": 1.6575213685690636e-07, "logits/chosen": -0.7773298025131226, "logits/rejected": -0.6868129968643188, "logps/chosen": -749.68798828125, "logps/rejected": -865.9345092773438, "loss": 0.494, "rewards/accuracies": 0.8125, "rewards/chosen": -3.298440456390381, "rewards/margins": 1.106008768081665, "rewards/rejected": -4.404449462890625, "step": 528 }, { "epoch": 0.3454975916401339, "grad_norm": 13.084048334963885, "learning_rate": 1.6558007127924977e-07, "logits/chosen": -0.8282909989356995, "logits/rejected": -0.8078904151916504, "logps/chosen": -876.759765625, "logps/rejected": -973.1893310546875, "loss": 0.471, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8432815074920654, "rewards/margins": 1.1956998109817505, "rewards/rejected": -5.0389814376831055, "step": 529 }, { "epoch": 0.34615070618009636, "grad_norm": 13.458405988710624, "learning_rate": 1.6540766434872137e-07, "logits/chosen": -0.7838751077651978, "logits/rejected": -0.7649997472763062, "logps/chosen": -879.0845336914062, "logps/rejected": -1020.4249877929688, "loss": 0.4086, "rewards/accuracies": 0.78125, "rewards/chosen": -3.94573712348938, "rewards/margins": 1.4766623973846436, "rewards/rejected": -5.422399520874023, "step": 530 }, { "epoch": 0.3468038207200588, "grad_norm": 14.043989120826666, "learning_rate": 1.6523491696272192e-07, "logits/chosen": -0.9524602293968201, "logits/rejected": -0.9211701154708862, "logps/chosen": -890.7154541015625, "logps/rejected": -965.8680419921875, "loss": 0.463, "rewards/accuracies": 0.78125, "rewards/chosen": -4.362317085266113, "rewards/margins": 0.8987084031105042, "rewards/rejected": -5.261025428771973, "step": 531 }, { "epoch": 0.34745693526002125, "grad_norm": 11.383924987842038, "learning_rate": 1.650618300204242e-07, "logits/chosen": -0.9213005900382996, "logits/rejected": -0.9080460071563721, "logps/chosen": -811.111083984375, "logps/rejected": -971.2776489257812, "loss": 0.4346, "rewards/accuracies": 0.78125, "rewards/chosen": -3.729093551635742, "rewards/margins": 1.4446587562561035, "rewards/rejected": -5.173752307891846, "step": 532 }, { "epoch": 0.34811004979998367, "grad_norm": 11.180079833888518, "learning_rate": 1.6488840442276846e-07, "logits/chosen": -0.8584150671958923, "logits/rejected": -0.9680982828140259, "logps/chosen": -807.1697387695312, "logps/rejected": -950.741455078125, "loss": 0.4583, "rewards/accuracies": 0.75, "rewards/chosen": -3.821699619293213, "rewards/margins": 1.069732904434204, "rewards/rejected": -4.891432762145996, "step": 533 }, { "epoch": 0.34876316433994614, "grad_norm": 12.714767456829277, "learning_rate": 1.6471464107245766e-07, "logits/chosen": -1.0752252340316772, "logits/rejected": -1.076012134552002, "logps/chosen": -833.3370971679688, "logps/rejected": -881.901611328125, "loss": 0.558, "rewards/accuracies": 0.71875, "rewards/chosen": -3.728384256362915, "rewards/margins": 0.7215151786804199, "rewards/rejected": -4.449899673461914, "step": 534 }, { "epoch": 0.34941627887990856, "grad_norm": 13.100511603907313, "learning_rate": 1.645405408739528e-07, "logits/chosen": -0.8571863174438477, "logits/rejected": -0.7581591606140137, "logps/chosen": -853.043212890625, "logps/rejected": -943.9117431640625, "loss": 0.4645, "rewards/accuracies": 0.78125, "rewards/chosen": -3.86018443107605, "rewards/margins": 0.9599357843399048, "rewards/rejected": -4.820120334625244, "step": 535 }, { "epoch": 0.35006939341987103, "grad_norm": 11.192981080205437, "learning_rate": 1.643661047334683e-07, "logits/chosen": -0.8171364665031433, "logits/rejected": -0.8735244274139404, "logps/chosen": -1039.5997314453125, "logps/rejected": -1199.798583984375, "loss": 0.4044, "rewards/accuracies": 0.71875, "rewards/chosen": -4.921164512634277, "rewards/margins": 1.3224613666534424, "rewards/rejected": -6.243625640869141, "step": 536 }, { "epoch": 0.35072250795983345, "grad_norm": 12.109010785072018, "learning_rate": 1.6419133355896713e-07, "logits/chosen": -0.9537902474403381, "logits/rejected": -0.8713423013687134, "logps/chosen": -931.1124877929688, "logps/rejected": -981.86572265625, "loss": 0.4852, "rewards/accuracies": 0.8125, "rewards/chosen": -4.270295143127441, "rewards/margins": 1.0137115716934204, "rewards/rejected": -5.284006118774414, "step": 537 }, { "epoch": 0.3513756224997959, "grad_norm": 19.462264324301042, "learning_rate": 1.6401622826015612e-07, "logits/chosen": -0.9751338958740234, "logits/rejected": -0.8156272172927856, "logps/chosen": -881.4043579101562, "logps/rejected": -954.9631958007812, "loss": 0.4636, "rewards/accuracies": 0.78125, "rewards/chosen": -4.002389907836914, "rewards/margins": 0.9239901304244995, "rewards/rejected": -4.926379203796387, "step": 538 }, { "epoch": 0.35202873703975834, "grad_norm": 17.303434296903287, "learning_rate": 1.6384078974848142e-07, "logits/chosen": -0.858856737613678, "logits/rejected": -0.8381833434104919, "logps/chosen": -914.42041015625, "logps/rejected": -981.6102905273438, "loss": 0.4921, "rewards/accuracies": 0.84375, "rewards/chosen": -4.198989391326904, "rewards/margins": 0.9757477641105652, "rewards/rejected": -5.174736976623535, "step": 539 }, { "epoch": 0.3526818515797208, "grad_norm": 19.533240518764423, "learning_rate": 1.6366501893712344e-07, "logits/chosen": -0.9275886416435242, "logits/rejected": -0.8129714727401733, "logps/chosen": -907.5596313476562, "logps/rejected": -1000.3069458007812, "loss": 0.4501, "rewards/accuracies": 0.84375, "rewards/chosen": -4.100757598876953, "rewards/margins": 1.2760422229766846, "rewards/rejected": -5.376799583435059, "step": 540 }, { "epoch": 0.35333496611968324, "grad_norm": 11.810356313020227, "learning_rate": 1.6348891674099229e-07, "logits/chosen": -0.8514102101325989, "logits/rejected": -0.8506395816802979, "logps/chosen": -914.7454223632812, "logps/rejected": -980.7003173828125, "loss": 0.429, "rewards/accuracies": 0.78125, "rewards/chosen": -4.303524971008301, "rewards/margins": 0.7691353559494019, "rewards/rejected": -5.072659969329834, "step": 541 }, { "epoch": 0.3539880806596457, "grad_norm": 15.2440846867984, "learning_rate": 1.6331248407672298e-07, "logits/chosen": -0.8096214532852173, "logits/rejected": -0.718659520149231, "logps/chosen": -847.4376220703125, "logps/rejected": -919.3460693359375, "loss": 0.4332, "rewards/accuracies": 0.84375, "rewards/chosen": -3.8864338397979736, "rewards/margins": 0.990598201751709, "rewards/rejected": -4.8770318031311035, "step": 542 }, { "epoch": 0.35464119519960813, "grad_norm": 13.212471074381327, "learning_rate": 1.6313572186267072e-07, "logits/chosen": -0.698846697807312, "logits/rejected": -0.6679378151893616, "logps/chosen": -854.2268676757812, "logps/rejected": -1125.93310546875, "loss": 0.4416, "rewards/accuracies": 0.78125, "rewards/chosen": -4.361400604248047, "rewards/margins": 2.335489511489868, "rewards/rejected": -6.696890354156494, "step": 543 }, { "epoch": 0.3552943097395706, "grad_norm": 15.42268532063693, "learning_rate": 1.62958631018906e-07, "logits/chosen": -0.8758993148803711, "logits/rejected": -0.8772597908973694, "logps/chosen": -919.5452880859375, "logps/rejected": -1081.818115234375, "loss": 0.4518, "rewards/accuracies": 0.75, "rewards/chosen": -4.379350185394287, "rewards/margins": 1.225996971130371, "rewards/rejected": -5.605347633361816, "step": 544 }, { "epoch": 0.355947424279533, "grad_norm": 12.72993505285054, "learning_rate": 1.6278121246720986e-07, "logits/chosen": -0.733549952507019, "logits/rejected": -0.7166369557380676, "logps/chosen": -890.005859375, "logps/rejected": -1006.06689453125, "loss": 0.4022, "rewards/accuracies": 0.8125, "rewards/chosen": -4.903335094451904, "rewards/margins": 1.2002079486846924, "rewards/rejected": -6.103543281555176, "step": 545 }, { "epoch": 0.3566005388194955, "grad_norm": 17.091713363040604, "learning_rate": 1.6260346713106915e-07, "logits/chosen": -0.9864925146102905, "logits/rejected": -0.87857985496521, "logps/chosen": -908.6575927734375, "logps/rejected": -1028.0980224609375, "loss": 0.4571, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8206915855407715, "rewards/margins": 1.3013036251068115, "rewards/rejected": -5.121994495391846, "step": 546 }, { "epoch": 0.3572536533594579, "grad_norm": 13.27253227088685, "learning_rate": 1.6242539593567167e-07, "logits/chosen": -0.8658774495124817, "logits/rejected": -0.7811622619628906, "logps/chosen": -884.6961669921875, "logps/rejected": -967.5526733398438, "loss": 0.4739, "rewards/accuracies": 0.65625, "rewards/chosen": -4.198709487915039, "rewards/margins": 0.9260251522064209, "rewards/rejected": -5.124734878540039, "step": 547 }, { "epoch": 0.3579067678994204, "grad_norm": 16.130528314621436, "learning_rate": 1.6224699980790128e-07, "logits/chosen": -0.8342747688293457, "logits/rejected": -0.8980854153633118, "logps/chosen": -939.5543212890625, "logps/rejected": -1134.8359375, "loss": 0.4595, "rewards/accuracies": 0.90625, "rewards/chosen": -4.8620781898498535, "rewards/margins": 1.3039054870605469, "rewards/rejected": -6.165983200073242, "step": 548 }, { "epoch": 0.3585598824393828, "grad_norm": 15.939077041425103, "learning_rate": 1.620682796763333e-07, "logits/chosen": -0.8608918190002441, "logits/rejected": -0.9147299528121948, "logps/chosen": -849.9219970703125, "logps/rejected": -998.6673583984375, "loss": 0.52, "rewards/accuracies": 0.875, "rewards/chosen": -4.1205244064331055, "rewards/margins": 1.2329450845718384, "rewards/rejected": -5.353469371795654, "step": 549 }, { "epoch": 0.3592129969793453, "grad_norm": 10.980661099727842, "learning_rate": 1.6188923647122945e-07, "logits/chosen": -0.882325291633606, "logits/rejected": -0.8491942286491394, "logps/chosen": -932.6527709960938, "logps/rejected": -1025.0045166015625, "loss": 0.424, "rewards/accuracies": 0.8125, "rewards/chosen": -4.657686710357666, "rewards/margins": 1.2477612495422363, "rewards/rejected": -5.9054484367370605, "step": 550 }, { "epoch": 0.3598661115193077, "grad_norm": 21.708736418823243, "learning_rate": 1.6170987112453305e-07, "logits/chosen": -0.7525062561035156, "logits/rejected": -0.7757470607757568, "logps/chosen": -857.36572265625, "logps/rejected": -877.4985961914062, "loss": 0.464, "rewards/accuracies": 0.75, "rewards/chosen": -3.827180862426758, "rewards/margins": 0.5651108622550964, "rewards/rejected": -4.39229154586792, "step": 551 }, { "epoch": 0.36051922605927017, "grad_norm": 16.678917268433715, "learning_rate": 1.6153018456986428e-07, "logits/chosen": -0.929282546043396, "logits/rejected": -0.9308112859725952, "logps/chosen": -797.1315307617188, "logps/rejected": -901.5902709960938, "loss": 0.4569, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8772454261779785, "rewards/margins": 0.9484346508979797, "rewards/rejected": -4.825679779052734, "step": 552 }, { "epoch": 0.3611723405992326, "grad_norm": 11.43691597383882, "learning_rate": 1.6135017774251518e-07, "logits/chosen": -0.7681158781051636, "logits/rejected": -0.7736382484436035, "logps/chosen": -892.3456420898438, "logps/rejected": -1131.2122802734375, "loss": 0.3717, "rewards/accuracies": 0.84375, "rewards/chosen": -4.3778395652771, "rewards/margins": 1.9359546899795532, "rewards/rejected": -6.3137946128845215, "step": 553 }, { "epoch": 0.36182545513919506, "grad_norm": 13.839372124571064, "learning_rate": 1.6116985157944494e-07, "logits/chosen": -0.7676699757575989, "logits/rejected": -0.7218711972236633, "logps/chosen": -903.3311157226562, "logps/rejected": -1085.003173828125, "loss": 0.4355, "rewards/accuracies": 0.90625, "rewards/chosen": -4.102015495300293, "rewards/margins": 1.7181568145751953, "rewards/rejected": -5.820172309875488, "step": 554 }, { "epoch": 0.3624785696791575, "grad_norm": 26.180670172356372, "learning_rate": 1.609892070192749e-07, "logits/chosen": -0.9137950539588928, "logits/rejected": -0.9119899272918701, "logps/chosen": -804.7276611328125, "logps/rejected": -930.9756469726562, "loss": 0.4846, "rewards/accuracies": 0.8125, "rewards/chosen": -3.975982666015625, "rewards/margins": 1.0355627536773682, "rewards/rejected": -5.011545658111572, "step": 555 }, { "epoch": 0.36313168421911995, "grad_norm": 14.581822298383303, "learning_rate": 1.6080824500228366e-07, "logits/chosen": -0.9392572045326233, "logits/rejected": -0.8843672275543213, "logps/chosen": -817.7110595703125, "logps/rejected": -892.86279296875, "loss": 0.4697, "rewards/accuracies": 0.8125, "rewards/chosen": -3.838777542114258, "rewards/margins": 0.9610627293586731, "rewards/rejected": -4.799839973449707, "step": 556 }, { "epoch": 0.3637847987590824, "grad_norm": 27.455679946964146, "learning_rate": 1.6062696647040224e-07, "logits/chosen": -0.9277936816215515, "logits/rejected": -0.8799358606338501, "logps/chosen": -852.052978515625, "logps/rejected": -1005.009521484375, "loss": 0.4074, "rewards/accuracies": 0.84375, "rewards/chosen": -4.25396203994751, "rewards/margins": 1.4232234954833984, "rewards/rejected": -5.67718505859375, "step": 557 }, { "epoch": 0.36443791329904485, "grad_norm": 19.915542369028874, "learning_rate": 1.604453723672092e-07, "logits/chosen": -0.7865580320358276, "logits/rejected": -0.8904578685760498, "logps/chosen": -826.33984375, "logps/rejected": -1009.6666259765625, "loss": 0.4737, "rewards/accuracies": 0.71875, "rewards/chosen": -3.9453487396240234, "rewards/margins": 1.032300353050232, "rewards/rejected": -4.977649211883545, "step": 558 }, { "epoch": 0.36509102783900726, "grad_norm": 9.98716425060218, "learning_rate": 1.6026346363792565e-07, "logits/chosen": -0.9274966716766357, "logits/rejected": -0.8684262037277222, "logps/chosen": -877.6671142578125, "logps/rejected": -977.3114624023438, "loss": 0.4311, "rewards/accuracies": 0.8125, "rewards/chosen": -4.041975975036621, "rewards/margins": 1.2339956760406494, "rewards/rejected": -5.275971412658691, "step": 559 }, { "epoch": 0.36574414237896974, "grad_norm": 19.342674323255824, "learning_rate": 1.6008124122941037e-07, "logits/chosen": -0.9390403628349304, "logits/rejected": -0.9198365211486816, "logps/chosen": -984.45556640625, "logps/rejected": -1059.18359375, "loss": 0.4974, "rewards/accuracies": 0.71875, "rewards/chosen": -4.400831699371338, "rewards/margins": 0.9896097183227539, "rewards/rejected": -5.390440940856934, "step": 560 }, { "epoch": 0.36639725691893216, "grad_norm": 10.44761304304428, "learning_rate": 1.5989870609015492e-07, "logits/chosen": -0.7790169715881348, "logits/rejected": -0.7415274381637573, "logps/chosen": -809.3233642578125, "logps/rejected": -942.257568359375, "loss": 0.4109, "rewards/accuracies": 0.875, "rewards/chosen": -3.4561712741851807, "rewards/margins": 1.2330386638641357, "rewards/rejected": -4.689209938049316, "step": 561 }, { "epoch": 0.36705037145889463, "grad_norm": 11.425228413665803, "learning_rate": 1.597158591702786e-07, "logits/chosen": -0.7430764436721802, "logits/rejected": -0.7750738859176636, "logps/chosen": -899.6544799804688, "logps/rejected": -1123.5078125, "loss": 0.3709, "rewards/accuracies": 0.84375, "rewards/chosen": -4.073906421661377, "rewards/margins": 1.5739132165908813, "rewards/rejected": -5.647819519042969, "step": 562 }, { "epoch": 0.36770348599885705, "grad_norm": 13.93278816074745, "learning_rate": 1.5953270142152367e-07, "logits/chosen": -0.6818071603775024, "logits/rejected": -0.6695794463157654, "logps/chosen": -877.8299560546875, "logps/rejected": -1009.041259765625, "loss": 0.5168, "rewards/accuracies": 0.71875, "rewards/chosen": -4.205145835876465, "rewards/margins": 0.9694371819496155, "rewards/rejected": -5.174582481384277, "step": 563 }, { "epoch": 0.3683566005388195, "grad_norm": 21.432721183984388, "learning_rate": 1.5934923379725018e-07, "logits/chosen": -0.7447420358657837, "logits/rejected": -0.6917985677719116, "logps/chosen": -836.9746704101562, "logps/rejected": -931.629638671875, "loss": 0.5602, "rewards/accuracies": 0.6875, "rewards/chosen": -3.975611925125122, "rewards/margins": 1.0695677995681763, "rewards/rejected": -5.045180320739746, "step": 564 }, { "epoch": 0.36900971507878194, "grad_norm": 19.384954342788763, "learning_rate": 1.591654572524312e-07, "logits/chosen": -0.8880128860473633, "logits/rejected": -0.8889778256416321, "logps/chosen": -841.0914306640625, "logps/rejected": -1005.8532104492188, "loss": 0.5231, "rewards/accuracies": 0.84375, "rewards/chosen": -3.652195453643799, "rewards/margins": 1.1500074863433838, "rewards/rejected": -4.8022027015686035, "step": 565 }, { "epoch": 0.36966282961874436, "grad_norm": 12.62412588033666, "learning_rate": 1.5898137274364774e-07, "logits/chosen": -0.8240070939064026, "logits/rejected": -0.8247821927070618, "logps/chosen": -926.2063598632812, "logps/rejected": -1109.5235595703125, "loss": 0.3979, "rewards/accuracies": 0.90625, "rewards/chosen": -4.213892459869385, "rewards/margins": 1.5136077404022217, "rewards/rejected": -5.7275004386901855, "step": 566 }, { "epoch": 0.37031594415870683, "grad_norm": 14.566856813447973, "learning_rate": 1.5879698122908382e-07, "logits/chosen": -0.8523387908935547, "logits/rejected": -0.8664197325706482, "logps/chosen": -848.1416015625, "logps/rejected": -976.9271240234375, "loss": 0.4588, "rewards/accuracies": 0.84375, "rewards/chosen": -4.19114351272583, "rewards/margins": 1.157962441444397, "rewards/rejected": -5.3491058349609375, "step": 567 }, { "epoch": 0.37096905869866925, "grad_norm": 12.639812881204627, "learning_rate": 1.5861228366852145e-07, "logits/chosen": -0.8494150042533875, "logits/rejected": -0.8036295771598816, "logps/chosen": -861.272705078125, "logps/rejected": -1024.7142333984375, "loss": 0.3876, "rewards/accuracies": 0.875, "rewards/chosen": -4.1044416427612305, "rewards/margins": 1.5496019124984741, "rewards/rejected": -5.654043197631836, "step": 568 }, { "epoch": 0.3716221732386317, "grad_norm": 13.43873158244553, "learning_rate": 1.5842728102333566e-07, "logits/chosen": -0.8592506051063538, "logits/rejected": -0.8374245166778564, "logps/chosen": -888.4620971679688, "logps/rejected": -990.207763671875, "loss": 0.467, "rewards/accuracies": 0.65625, "rewards/chosen": -3.955009937286377, "rewards/margins": 0.9639407992362976, "rewards/rejected": -4.91895055770874, "step": 569 }, { "epoch": 0.37227528777859414, "grad_norm": 16.138561564920185, "learning_rate": 1.5824197425648947e-07, "logits/chosen": -0.9307665824890137, "logits/rejected": -1.0155502557754517, "logps/chosen": -817.697998046875, "logps/rejected": -1008.6148681640625, "loss": 0.482, "rewards/accuracies": 0.8125, "rewards/chosen": -3.6846015453338623, "rewards/margins": 1.3351447582244873, "rewards/rejected": -5.019746780395508, "step": 570 }, { "epoch": 0.3729284023185566, "grad_norm": 15.515416540842121, "learning_rate": 1.580563643325289e-07, "logits/chosen": -0.8352770209312439, "logits/rejected": -0.8319212794303894, "logps/chosen": -889.775146484375, "logps/rejected": -1052.1566162109375, "loss": 0.4446, "rewards/accuracies": 0.75, "rewards/chosen": -4.425132751464844, "rewards/margins": 1.2785844802856445, "rewards/rejected": -5.7037177085876465, "step": 571 }, { "epoch": 0.37358151685851904, "grad_norm": 10.82779016449659, "learning_rate": 1.5787045221757796e-07, "logits/chosen": -0.8251916170120239, "logits/rejected": -0.7899847626686096, "logps/chosen": -837.8982543945312, "logps/rejected": -941.0586547851562, "loss": 0.4451, "rewards/accuracies": 0.8125, "rewards/chosen": -3.964031219482422, "rewards/margins": 0.9509406089782715, "rewards/rejected": -4.914971351623535, "step": 572 }, { "epoch": 0.3742346313984815, "grad_norm": 13.742934988865395, "learning_rate": 1.576842388793336e-07, "logits/chosen": -0.9121626615524292, "logits/rejected": -0.8413507342338562, "logps/chosen": -892.6531982421875, "logps/rejected": -1024.99609375, "loss": 0.4286, "rewards/accuracies": 0.78125, "rewards/chosen": -4.081020832061768, "rewards/margins": 1.357172966003418, "rewards/rejected": -5.4381937980651855, "step": 573 }, { "epoch": 0.37488774593844393, "grad_norm": 13.772268083761096, "learning_rate": 1.5749772528706066e-07, "logits/chosen": -0.8252898454666138, "logits/rejected": -0.8235541582107544, "logps/chosen": -879.6044311523438, "logps/rejected": -1059.9500732421875, "loss": 0.4267, "rewards/accuracies": 0.875, "rewards/chosen": -3.773669958114624, "rewards/margins": 1.4883966445922852, "rewards/rejected": -5.262066841125488, "step": 574 }, { "epoch": 0.3755408604784064, "grad_norm": 15.892411969332281, "learning_rate": 1.5731091241158683e-07, "logits/chosen": -0.9504250288009644, "logits/rejected": -0.8796836733818054, "logps/chosen": -975.87548828125, "logps/rejected": -1043.329345703125, "loss": 0.4615, "rewards/accuracies": 0.75, "rewards/chosen": -4.551794052124023, "rewards/margins": 0.715241014957428, "rewards/rejected": -5.267035007476807, "step": 575 }, { "epoch": 0.3761939750183688, "grad_norm": 15.377040417284569, "learning_rate": 1.5712380122529763e-07, "logits/chosen": -0.8614749312400818, "logits/rejected": -0.8348824977874756, "logps/chosen": -857.7217407226562, "logps/rejected": -966.0911865234375, "loss": 0.4592, "rewards/accuracies": 0.75, "rewards/chosen": -3.91888427734375, "rewards/margins": 1.0167591571807861, "rewards/rejected": -4.935643196105957, "step": 576 }, { "epoch": 0.3768470895583313, "grad_norm": 12.483624917174899, "learning_rate": 1.5693639270213135e-07, "logits/chosen": -0.8079325556755066, "logits/rejected": -0.8149147629737854, "logps/chosen": -945.904541015625, "logps/rejected": -1057.374267578125, "loss": 0.4637, "rewards/accuracies": 0.8125, "rewards/chosen": -4.423439025878906, "rewards/margins": 0.7207327485084534, "rewards/rejected": -5.144172191619873, "step": 577 }, { "epoch": 0.3775002040982937, "grad_norm": 21.60003125968158, "learning_rate": 1.5674868781757393e-07, "logits/chosen": -0.9885172247886658, "logits/rejected": -0.9942512512207031, "logps/chosen": -956.927001953125, "logps/rejected": -1092.0496826171875, "loss": 0.432, "rewards/accuracies": 0.8125, "rewards/chosen": -4.42249870300293, "rewards/margins": 1.3156774044036865, "rewards/rejected": -5.738176345825195, "step": 578 }, { "epoch": 0.3781533186382562, "grad_norm": 17.308152308920473, "learning_rate": 1.5656068754865386e-07, "logits/chosen": -0.9299103617668152, "logits/rejected": -0.8897454142570496, "logps/chosen": -879.7733764648438, "logps/rejected": -983.5267333984375, "loss": 0.4288, "rewards/accuracies": 0.8125, "rewards/chosen": -4.471696853637695, "rewards/margins": 1.16510009765625, "rewards/rejected": -5.636796951293945, "step": 579 }, { "epoch": 0.3788064331782186, "grad_norm": 15.53883891948394, "learning_rate": 1.5637239287393724e-07, "logits/chosen": -0.8626022934913635, "logits/rejected": -0.9244422912597656, "logps/chosen": -860.1614990234375, "logps/rejected": -1011.134033203125, "loss": 0.4128, "rewards/accuracies": 0.75, "rewards/chosen": -3.882162094116211, "rewards/margins": 1.0723750591278076, "rewards/rejected": -4.954537391662598, "step": 580 }, { "epoch": 0.3794595477181811, "grad_norm": 14.768077938371574, "learning_rate": 1.5618380477352258e-07, "logits/chosen": -0.7988356351852417, "logits/rejected": -0.7182334661483765, "logps/chosen": -869.6566162109375, "logps/rejected": -993.8335571289062, "loss": 0.4059, "rewards/accuracies": 0.90625, "rewards/chosen": -4.16757345199585, "rewards/margins": 1.371941328048706, "rewards/rejected": -5.539514541625977, "step": 581 }, { "epoch": 0.3801126622581435, "grad_norm": 17.211114890337342, "learning_rate": 1.5599492422903557e-07, "logits/chosen": -0.8917167782783508, "logits/rejected": -0.8692957758903503, "logps/chosen": -906.6569213867188, "logps/rejected": -1106.05126953125, "loss": 0.5268, "rewards/accuracies": 0.75, "rewards/chosen": -4.61083984375, "rewards/margins": 1.3148231506347656, "rewards/rejected": -5.925662517547607, "step": 582 }, { "epoch": 0.38076577679810597, "grad_norm": 15.588565948663359, "learning_rate": 1.5580575222362433e-07, "logits/chosen": -0.705355703830719, "logits/rejected": -0.722810685634613, "logps/chosen": -926.6598510742188, "logps/rejected": -1159.2498779296875, "loss": 0.4284, "rewards/accuracies": 0.875, "rewards/chosen": -4.331436634063721, "rewards/margins": 1.9407066106796265, "rewards/rejected": -6.272141933441162, "step": 583 }, { "epoch": 0.3814188913380684, "grad_norm": 12.961352491987626, "learning_rate": 1.556162897419539e-07, "logits/chosen": -0.7959637641906738, "logits/rejected": -0.7493060827255249, "logps/chosen": -850.8128662109375, "logps/rejected": -948.0916748046875, "loss": 0.4445, "rewards/accuracies": 0.8125, "rewards/chosen": -4.118077754974365, "rewards/margins": 1.0578035116195679, "rewards/rejected": -5.175881862640381, "step": 584 }, { "epoch": 0.38207200587803086, "grad_norm": 16.31279891925311, "learning_rate": 1.5542653777020136e-07, "logits/chosen": -0.7824969291687012, "logits/rejected": -0.7763348817825317, "logps/chosen": -896.5308837890625, "logps/rejected": -1009.3015747070312, "loss": 0.437, "rewards/accuracies": 0.78125, "rewards/chosen": -4.351218223571777, "rewards/margins": 1.1964832544326782, "rewards/rejected": -5.547701358795166, "step": 585 }, { "epoch": 0.3827251204179933, "grad_norm": 13.918482499547776, "learning_rate": 1.5523649729605057e-07, "logits/chosen": -0.76658034324646, "logits/rejected": -0.8511514663696289, "logps/chosen": -870.3497924804688, "logps/rejected": -1045.263916015625, "loss": 0.4695, "rewards/accuracies": 0.78125, "rewards/chosen": -4.4781718254089355, "rewards/margins": 1.291435718536377, "rewards/rejected": -5.769607067108154, "step": 586 }, { "epoch": 0.38337823495795575, "grad_norm": 16.569196717169596, "learning_rate": 1.5504616930868716e-07, "logits/chosen": -0.8880733251571655, "logits/rejected": -0.8038618564605713, "logps/chosen": -889.2333984375, "logps/rejected": -1001.71240234375, "loss": 0.4786, "rewards/accuracies": 0.78125, "rewards/chosen": -3.7782959938049316, "rewards/margins": 1.6770806312561035, "rewards/rejected": -5.455376625061035, "step": 587 }, { "epoch": 0.3840313494979182, "grad_norm": 17.10491428925674, "learning_rate": 1.548555547987933e-07, "logits/chosen": -0.9725980758666992, "logits/rejected": -0.989176869392395, "logps/chosen": -989.8197631835938, "logps/rejected": -1179.978759765625, "loss": 0.4301, "rewards/accuracies": 0.9375, "rewards/chosen": -4.106828689575195, "rewards/margins": 1.6448149681091309, "rewards/rejected": -5.751643657684326, "step": 588 }, { "epoch": 0.38468446403788065, "grad_norm": 13.922269822079555, "learning_rate": 1.5466465475854244e-07, "logits/chosen": -0.9255604147911072, "logits/rejected": -0.8810290694236755, "logps/chosen": -990.11474609375, "logps/rejected": -1163.693115234375, "loss": 0.4654, "rewards/accuracies": 0.84375, "rewards/chosen": -4.530111312866211, "rewards/margins": 1.9725072383880615, "rewards/rejected": -6.502618789672852, "step": 589 }, { "epoch": 0.38533757857784307, "grad_norm": 16.843076991151882, "learning_rate": 1.5447347018159436e-07, "logits/chosen": -0.8135560154914856, "logits/rejected": -0.8068975210189819, "logps/chosen": -900.1400756835938, "logps/rejected": -950.0714721679688, "loss": 0.5216, "rewards/accuracies": 0.71875, "rewards/chosen": -4.988783836364746, "rewards/margins": 0.508838415145874, "rewards/rejected": -5.497622013092041, "step": 590 }, { "epoch": 0.38599069311780554, "grad_norm": 16.55186264161944, "learning_rate": 1.5428200206308986e-07, "logits/chosen": -0.713176429271698, "logits/rejected": -0.7015882134437561, "logps/chosen": -850.7107543945312, "logps/rejected": -964.1630249023438, "loss": 0.4294, "rewards/accuracies": 0.8125, "rewards/chosen": -4.00892448425293, "rewards/margins": 1.1998018026351929, "rewards/rejected": -5.20872688293457, "step": 591 }, { "epoch": 0.38664380765776796, "grad_norm": 12.843591605410206, "learning_rate": 1.5409025139964559e-07, "logits/chosen": -0.8123986721038818, "logits/rejected": -0.6921762824058533, "logps/chosen": -867.6967163085938, "logps/rejected": -967.98193359375, "loss": 0.4517, "rewards/accuracies": 0.84375, "rewards/chosen": -4.409775257110596, "rewards/margins": 1.2882235050201416, "rewards/rejected": -5.697999000549316, "step": 592 }, { "epoch": 0.38729692219773043, "grad_norm": 22.22126012857482, "learning_rate": 1.5389821918934894e-07, "logits/chosen": -0.8245276212692261, "logits/rejected": -0.7239007949829102, "logps/chosen": -1058.098388671875, "logps/rejected": -1149.383544921875, "loss": 0.5199, "rewards/accuracies": 0.71875, "rewards/chosen": -5.164212703704834, "rewards/margins": 1.3136693239212036, "rewards/rejected": -6.477882385253906, "step": 593 }, { "epoch": 0.38795003673769285, "grad_norm": 18.370737993966067, "learning_rate": 1.537059064317527e-07, "logits/chosen": -1.0091485977172852, "logits/rejected": -1.0160248279571533, "logps/chosen": -943.4915161132812, "logps/rejected": -1050.677734375, "loss": 0.4994, "rewards/accuracies": 0.78125, "rewards/chosen": -3.813990592956543, "rewards/margins": 1.1202912330627441, "rewards/rejected": -4.934281826019287, "step": 594 }, { "epoch": 0.3886031512776553, "grad_norm": 31.88057796229377, "learning_rate": 1.5351331412787003e-07, "logits/chosen": -0.920621395111084, "logits/rejected": -0.8015599846839905, "logps/chosen": -944.1340942382812, "logps/rejected": -1074.1396484375, "loss": 0.4716, "rewards/accuracies": 0.71875, "rewards/chosen": -4.2517828941345215, "rewards/margins": 0.9702135324478149, "rewards/rejected": -5.221996307373047, "step": 595 }, { "epoch": 0.38925626581761774, "grad_norm": 28.48791105586222, "learning_rate": 1.5332044328016914e-07, "logits/chosen": -0.9262527823448181, "logits/rejected": -0.9940930008888245, "logps/chosen": -852.5736083984375, "logps/rejected": -992.1788940429688, "loss": 0.5288, "rewards/accuracies": 0.78125, "rewards/chosen": -3.7524843215942383, "rewards/margins": 1.3429107666015625, "rewards/rejected": -5.095395088195801, "step": 596 }, { "epoch": 0.3899093803575802, "grad_norm": 20.034683502025093, "learning_rate": 1.53127294892568e-07, "logits/chosen": -0.8578783869743347, "logits/rejected": -0.8358186483383179, "logps/chosen": -841.9410400390625, "logps/rejected": -914.41259765625, "loss": 0.4581, "rewards/accuracies": 0.625, "rewards/chosen": -3.9182372093200684, "rewards/margins": 0.5345401763916016, "rewards/rejected": -4.452776908874512, "step": 597 }, { "epoch": 0.39056249489754263, "grad_norm": 14.995040309280029, "learning_rate": 1.529338699704294e-07, "logits/chosen": -0.9003750681877136, "logits/rejected": -0.7470192313194275, "logps/chosen": -835.0977172851562, "logps/rejected": -924.6924438476562, "loss": 0.4515, "rewards/accuracies": 0.71875, "rewards/chosen": -3.865151882171631, "rewards/margins": 1.1951385736465454, "rewards/rejected": -5.060290813446045, "step": 598 }, { "epoch": 0.3912156094375051, "grad_norm": 17.660145347132392, "learning_rate": 1.527401695205554e-07, "logits/chosen": -0.8957807421684265, "logits/rejected": -0.8723628520965576, "logps/chosen": -869.9386596679688, "logps/rejected": -925.26318359375, "loss": 0.4511, "rewards/accuracies": 0.875, "rewards/chosen": -4.060848236083984, "rewards/margins": 0.71476811170578, "rewards/rejected": -4.775616645812988, "step": 599 }, { "epoch": 0.3918687239774675, "grad_norm": 13.004372705457447, "learning_rate": 1.5254619455118224e-07, "logits/chosen": -0.7671356201171875, "logits/rejected": -0.8021383285522461, "logps/chosen": -845.3877563476562, "logps/rejected": -985.086181640625, "loss": 0.449, "rewards/accuracies": 0.78125, "rewards/chosen": -3.7086658477783203, "rewards/margins": 1.1206787824630737, "rewards/rejected": -4.829344749450684, "step": 600 }, { "epoch": 0.3918687239774675, "eval_logits/chosen": -0.6681308150291443, "eval_logits/rejected": -0.6156617999076843, "eval_logps/chosen": -848.0513916015625, "eval_logps/rejected": -945.3562622070312, "eval_loss": 0.4532933235168457, "eval_rewards/accuracies": 0.7820000052452087, "eval_rewards/chosen": -3.7414422035217285, "eval_rewards/margins": 1.0604348182678223, "eval_rewards/rejected": -4.801877498626709, "eval_runtime": 614.2682, "eval_samples_per_second": 6.512, "eval_steps_per_second": 0.407, "step": 600 }, { "epoch": 0.39252183851743, "grad_norm": 16.493495105014954, "learning_rate": 1.5235194607197507e-07, "logits/chosen": -0.7642414569854736, "logits/rejected": -0.7581309676170349, "logps/chosen": -877.4822387695312, "logps/rejected": -1018.4113159179688, "loss": 0.4087, "rewards/accuracies": 0.78125, "rewards/chosen": -3.8799805641174316, "rewards/margins": 1.3261351585388184, "rewards/rejected": -5.206116199493408, "step": 601 }, { "epoch": 0.3931749530573924, "grad_norm": 12.37107065963141, "learning_rate": 1.521574250940227e-07, "logits/chosen": -0.8965670466423035, "logits/rejected": -0.8340004086494446, "logps/chosen": -920.9857788085938, "logps/rejected": -1078.318359375, "loss": 0.3523, "rewards/accuracies": 0.90625, "rewards/chosen": -3.6839547157287598, "rewards/margins": 1.4769634008407593, "rewards/rejected": -5.160918235778809, "step": 602 }, { "epoch": 0.3938280675973549, "grad_norm": 11.116216687638275, "learning_rate": 1.5196263262983232e-07, "logits/chosen": -0.7490851879119873, "logits/rejected": -0.771804928779602, "logps/chosen": -828.4022216796875, "logps/rejected": -1004.2670288085938, "loss": 0.4399, "rewards/accuracies": 0.84375, "rewards/chosen": -3.896117687225342, "rewards/margins": 1.3466899394989014, "rewards/rejected": -5.242807388305664, "step": 603 }, { "epoch": 0.3944811821373173, "grad_norm": 9.967638009299678, "learning_rate": 1.5176756969332425e-07, "logits/chosen": -0.8834438323974609, "logits/rejected": -0.8068808913230896, "logps/chosen": -795.0307006835938, "logps/rejected": -866.0194091796875, "loss": 0.4766, "rewards/accuracies": 0.84375, "rewards/chosen": -3.7699403762817383, "rewards/margins": 0.9773763418197632, "rewards/rejected": -4.747317314147949, "step": 604 }, { "epoch": 0.3951342966772798, "grad_norm": 26.596246532184516, "learning_rate": 1.5157223729982668e-07, "logits/chosen": -0.8761139512062073, "logits/rejected": -0.8892871141433716, "logps/chosen": -871.340576171875, "logps/rejected": -1007.2827758789062, "loss": 0.439, "rewards/accuracies": 0.8125, "rewards/chosen": -4.020564079284668, "rewards/margins": 1.1975598335266113, "rewards/rejected": -5.218123912811279, "step": 605 }, { "epoch": 0.3957874112172422, "grad_norm": 14.077534407772498, "learning_rate": 1.5137663646607032e-07, "logits/chosen": -0.8975901007652283, "logits/rejected": -0.8460850119590759, "logps/chosen": -890.2839965820312, "logps/rejected": -974.4508056640625, "loss": 0.4732, "rewards/accuracies": 0.875, "rewards/chosen": -3.964993715286255, "rewards/margins": 0.9587504863739014, "rewards/rejected": -4.923743724822998, "step": 606 }, { "epoch": 0.3964405257572047, "grad_norm": 17.96334354523122, "learning_rate": 1.511807682101832e-07, "logits/chosen": -0.9620600938796997, "logits/rejected": -0.8492072820663452, "logps/chosen": -863.880859375, "logps/rejected": -929.9119262695312, "loss": 0.4948, "rewards/accuracies": 0.71875, "rewards/chosen": -3.7883801460266113, "rewards/margins": 0.9228003621101379, "rewards/rejected": -4.711181163787842, "step": 607 }, { "epoch": 0.3970936402971671, "grad_norm": 11.485367827080513, "learning_rate": 1.5098463355168523e-07, "logits/chosen": -0.8942442536354065, "logits/rejected": -0.8808648586273193, "logps/chosen": -818.7298583984375, "logps/rejected": -917.2005004882812, "loss": 0.4495, "rewards/accuracies": 0.71875, "rewards/chosen": -3.760091543197632, "rewards/margins": 0.9165372252464294, "rewards/rejected": -4.676629066467285, "step": 608 }, { "epoch": 0.39774675483712957, "grad_norm": 14.959869339038608, "learning_rate": 1.5078823351148305e-07, "logits/chosen": -0.838761568069458, "logits/rejected": -0.7156088948249817, "logps/chosen": -863.39453125, "logps/rejected": -953.5706176757812, "loss": 0.5298, "rewards/accuracies": 0.84375, "rewards/chosen": -3.9802536964416504, "rewards/margins": 1.1790037155151367, "rewards/rejected": -5.159257888793945, "step": 609 }, { "epoch": 0.398399869377092, "grad_norm": 12.26194863361878, "learning_rate": 1.5059156911186462e-07, "logits/chosen": -1.0336946249008179, "logits/rejected": -0.951801598072052, "logps/chosen": -967.37744140625, "logps/rejected": -1024.5635986328125, "loss": 0.3817, "rewards/accuracies": 0.78125, "rewards/chosen": -4.440566539764404, "rewards/margins": 1.1911842823028564, "rewards/rejected": -5.631750583648682, "step": 610 }, { "epoch": 0.39905298391705446, "grad_norm": 19.1262138772951, "learning_rate": 1.5039464137649395e-07, "logits/chosen": -0.9519979953765869, "logits/rejected": -0.8136232495307922, "logps/chosen": -923.1121826171875, "logps/rejected": -981.0728759765625, "loss": 0.4884, "rewards/accuracies": 0.78125, "rewards/chosen": -4.264883041381836, "rewards/margins": 0.9207399487495422, "rewards/rejected": -5.1856231689453125, "step": 611 }, { "epoch": 0.3997060984570169, "grad_norm": 13.608525719342314, "learning_rate": 1.5019745133040571e-07, "logits/chosen": -0.876707911491394, "logits/rejected": -0.8673529624938965, "logps/chosen": -858.470947265625, "logps/rejected": -973.949951171875, "loss": 0.4167, "rewards/accuracies": 0.84375, "rewards/chosen": -4.141305446624756, "rewards/margins": 0.995867133140564, "rewards/rejected": -5.137172698974609, "step": 612 }, { "epoch": 0.40035921299697935, "grad_norm": 13.474406391717167, "learning_rate": 1.5e-07, "logits/chosen": -0.7090507745742798, "logits/rejected": -0.614541232585907, "logps/chosen": -914.9686279296875, "logps/rejected": -1066.94189453125, "loss": 0.4323, "rewards/accuracies": 0.78125, "rewards/chosen": -4.5194830894470215, "rewards/margins": 1.386902093887329, "rewards/rejected": -5.9063849449157715, "step": 613 }, { "epoch": 0.40101232753694177, "grad_norm": 14.172259835539906, "learning_rate": 1.4980228841303682e-07, "logits/chosen": -0.9724923372268677, "logits/rejected": -0.903152346611023, "logps/chosen": -885.5799560546875, "logps/rejected": -995.6898803710938, "loss": 0.4037, "rewards/accuracies": 0.78125, "rewards/chosen": -4.383858680725098, "rewards/margins": 1.104832649230957, "rewards/rejected": -5.4886908531188965, "step": 614 }, { "epoch": 0.40166544207690424, "grad_norm": 13.5707311262356, "learning_rate": 1.4960431759863093e-07, "logits/chosen": -0.8835554718971252, "logits/rejected": -0.7123602628707886, "logps/chosen": -854.8338623046875, "logps/rejected": -918.1431884765625, "loss": 0.4256, "rewards/accuracies": 0.84375, "rewards/chosen": -4.005169868469238, "rewards/margins": 1.493152379989624, "rewards/rejected": -5.498322486877441, "step": 615 }, { "epoch": 0.40231855661686666, "grad_norm": 18.247415237240777, "learning_rate": 1.494060885872464e-07, "logits/chosen": -1.0498430728912354, "logits/rejected": -0.9992644190788269, "logps/chosen": -970.97998046875, "logps/rejected": -1049.422119140625, "loss": 0.4306, "rewards/accuracies": 0.75, "rewards/chosen": -4.565814018249512, "rewards/margins": 1.0201760530471802, "rewards/rejected": -5.585990905761719, "step": 616 }, { "epoch": 0.40297167115682914, "grad_norm": 14.026453964756975, "learning_rate": 1.4920760241069124e-07, "logits/chosen": -0.9324491024017334, "logits/rejected": -0.917612612247467, "logps/chosen": -934.6063232421875, "logps/rejected": -1033.9951171875, "loss": 0.4023, "rewards/accuracies": 0.84375, "rewards/chosen": -4.982738494873047, "rewards/margins": 0.9660321474075317, "rewards/rejected": -5.948770999908447, "step": 617 }, { "epoch": 0.40362478569679155, "grad_norm": 15.386386169496118, "learning_rate": 1.49008860102112e-07, "logits/chosen": -0.7892769575119019, "logits/rejected": -0.7855625152587891, "logps/chosen": -942.093017578125, "logps/rejected": -1020.9521484375, "loss": 0.4783, "rewards/accuracies": 0.84375, "rewards/chosen": -4.989546775817871, "rewards/margins": 0.7590184211730957, "rewards/rejected": -5.748565196990967, "step": 618 }, { "epoch": 0.40427790023675403, "grad_norm": 20.36931108231902, "learning_rate": 1.4880986269598847e-07, "logits/chosen": -0.7886320352554321, "logits/rejected": -0.770696759223938, "logps/chosen": -1011.263671875, "logps/rejected": -1083.3687744140625, "loss": 0.4714, "rewards/accuracies": 0.78125, "rewards/chosen": -5.143796920776367, "rewards/margins": 0.9884477853775024, "rewards/rejected": -6.132245063781738, "step": 619 }, { "epoch": 0.40493101477671645, "grad_norm": 17.64904479691889, "learning_rate": 1.4861061122812828e-07, "logits/chosen": -0.8605690002441406, "logits/rejected": -0.910815417766571, "logps/chosen": -962.86767578125, "logps/rejected": -1095.0264892578125, "loss": 0.422, "rewards/accuracies": 0.84375, "rewards/chosen": -4.740562438964844, "rewards/margins": 1.1599054336547852, "rewards/rejected": -5.900468349456787, "step": 620 }, { "epoch": 0.4055841293166789, "grad_norm": 18.989849625404762, "learning_rate": 1.484111067356614e-07, "logits/chosen": -0.8971580266952515, "logits/rejected": -0.8848168849945068, "logps/chosen": -877.4755859375, "logps/rejected": -953.19287109375, "loss": 0.5492, "rewards/accuracies": 0.75, "rewards/chosen": -4.335120677947998, "rewards/margins": 0.7430107593536377, "rewards/rejected": -5.078131675720215, "step": 621 }, { "epoch": 0.40623724385664134, "grad_norm": 25.58374255467917, "learning_rate": 1.4821135025703488e-07, "logits/chosen": -0.8735688924789429, "logits/rejected": -0.8896888494491577, "logps/chosen": -969.5611572265625, "logps/rejected": -1103.0755615234375, "loss": 0.3992, "rewards/accuracies": 0.8125, "rewards/chosen": -4.727744102478027, "rewards/margins": 1.2061125040054321, "rewards/rejected": -5.933856010437012, "step": 622 }, { "epoch": 0.4068903583966038, "grad_norm": 13.450431430906283, "learning_rate": 1.4801134283200744e-07, "logits/chosen": -0.82172030210495, "logits/rejected": -0.806075930595398, "logps/chosen": -1026.8660888671875, "logps/rejected": -1136.55712890625, "loss": 0.4049, "rewards/accuracies": 0.8125, "rewards/chosen": -4.981423854827881, "rewards/margins": 1.1098735332489014, "rewards/rejected": -6.091297149658203, "step": 623 }, { "epoch": 0.40754347293656623, "grad_norm": 18.032806569030758, "learning_rate": 1.4781108550164395e-07, "logits/chosen": -0.8168790936470032, "logits/rejected": -0.7250760793685913, "logps/chosen": -916.8949584960938, "logps/rejected": -964.107177734375, "loss": 0.4845, "rewards/accuracies": 0.6875, "rewards/chosen": -4.447428226470947, "rewards/margins": 0.5497761368751526, "rewards/rejected": -4.997204780578613, "step": 624 }, { "epoch": 0.4081965874765287, "grad_norm": 18.94760554081831, "learning_rate": 1.4761057930831e-07, "logits/chosen": -0.9571875929832458, "logits/rejected": -0.8576517105102539, "logps/chosen": -993.7386474609375, "logps/rejected": -1106.987060546875, "loss": 0.3963, "rewards/accuracies": 0.8125, "rewards/chosen": -4.498230457305908, "rewards/margins": 1.3976860046386719, "rewards/rejected": -5.895916938781738, "step": 625 }, { "epoch": 0.4088497020164911, "grad_norm": 22.686857483265566, "learning_rate": 1.4740982529566672e-07, "logits/chosen": -1.0864641666412354, "logits/rejected": -0.9092856645584106, "logps/chosen": -960.5426635742188, "logps/rejected": -1040.1844482421875, "loss": 0.4542, "rewards/accuracies": 0.8125, "rewards/chosen": -4.545177459716797, "rewards/margins": 1.2752573490142822, "rewards/rejected": -5.8204345703125, "step": 626 }, { "epoch": 0.4095028165564536, "grad_norm": 13.831886695808532, "learning_rate": 1.4720882450866502e-07, "logits/chosen": -0.8931451439857483, "logits/rejected": -0.82686847448349, "logps/chosen": -875.4305419921875, "logps/rejected": -1009.8246459960938, "loss": 0.3634, "rewards/accuracies": 0.875, "rewards/chosen": -4.404606819152832, "rewards/margins": 1.4391083717346191, "rewards/rejected": -5.843715190887451, "step": 627 }, { "epoch": 0.410155931096416, "grad_norm": 12.748214814923802, "learning_rate": 1.470075779935404e-07, "logits/chosen": -0.926064133644104, "logits/rejected": -0.9559231400489807, "logps/chosen": -878.1412963867188, "logps/rejected": -1152.503662109375, "loss": 0.4009, "rewards/accuracies": 0.8125, "rewards/chosen": -4.183788299560547, "rewards/margins": 2.1225814819335938, "rewards/rejected": -6.306369781494141, "step": 628 }, { "epoch": 0.4108090456363785, "grad_norm": 41.87910147179948, "learning_rate": 1.468060867978073e-07, "logits/chosen": -0.9104610085487366, "logits/rejected": -0.9001246690750122, "logps/chosen": -1014.3042602539062, "logps/rejected": -1192.781005859375, "loss": 0.4064, "rewards/accuracies": 0.84375, "rewards/chosen": -4.662555694580078, "rewards/margins": 1.773787021636963, "rewards/rejected": -6.436343193054199, "step": 629 }, { "epoch": 0.4114621601763409, "grad_norm": 14.627953520544663, "learning_rate": 1.4660435197025388e-07, "logits/chosen": -0.8062002062797546, "logits/rejected": -0.854103147983551, "logps/chosen": -936.1939086914062, "logps/rejected": -1189.2581787109375, "loss": 0.4233, "rewards/accuracies": 0.8125, "rewards/chosen": -4.81528377532959, "rewards/margins": 1.5555047988891602, "rewards/rejected": -6.370789051055908, "step": 630 }, { "epoch": 0.4121152747163034, "grad_norm": 12.111758507125595, "learning_rate": 1.4640237456093634e-07, "logits/chosen": -0.6821016073226929, "logits/rejected": -0.5494420528411865, "logps/chosen": -972.8225708007812, "logps/rejected": -1031.09814453125, "loss": 0.4028, "rewards/accuracies": 0.8125, "rewards/chosen": -4.693094730377197, "rewards/margins": 1.2289015054702759, "rewards/rejected": -5.921996116638184, "step": 631 }, { "epoch": 0.4127683892562658, "grad_norm": 13.52571835630328, "learning_rate": 1.462001556211736e-07, "logits/chosen": -1.001874327659607, "logits/rejected": -0.975570797920227, "logps/chosen": -979.7161865234375, "logps/rejected": -1141.2294921875, "loss": 0.4469, "rewards/accuracies": 0.9375, "rewards/chosen": -4.602497577667236, "rewards/margins": 1.546646237373352, "rewards/rejected": -6.149144172668457, "step": 632 }, { "epoch": 0.4134215037962283, "grad_norm": 19.019281186072746, "learning_rate": 1.4599769620354174e-07, "logits/chosen": -0.7710694074630737, "logits/rejected": -0.7261401414871216, "logps/chosen": -937.7637939453125, "logps/rejected": -1017.58935546875, "loss": 0.5744, "rewards/accuracies": 0.6875, "rewards/chosen": -4.335657119750977, "rewards/margins": 0.7023254036903381, "rewards/rejected": -5.03798246383667, "step": 633 }, { "epoch": 0.4140746183361907, "grad_norm": 22.204343763079972, "learning_rate": 1.4579499736186863e-07, "logits/chosen": -0.6519548296928406, "logits/rejected": -0.6551141142845154, "logps/chosen": -1033.122802734375, "logps/rejected": -1234.211669921875, "loss": 0.4501, "rewards/accuracies": 0.6875, "rewards/chosen": -5.304633140563965, "rewards/margins": 1.2295292615890503, "rewards/rejected": -6.5341620445251465, "step": 634 }, { "epoch": 0.41472773287615317, "grad_norm": 13.511485762115198, "learning_rate": 1.4559206015122829e-07, "logits/chosen": -0.6418792605400085, "logits/rejected": -0.7200583219528198, "logps/chosen": -851.169189453125, "logps/rejected": -1106.4285888671875, "loss": 0.4361, "rewards/accuracies": 0.9375, "rewards/chosen": -4.105515956878662, "rewards/margins": 2.3324177265167236, "rewards/rejected": -6.437933444976807, "step": 635 }, { "epoch": 0.4153808474161156, "grad_norm": 34.33189845987089, "learning_rate": 1.453888856279355e-07, "logits/chosen": -0.8396845459938049, "logits/rejected": -0.7787867188453674, "logps/chosen": -1001.843017578125, "logps/rejected": -1041.985107421875, "loss": 0.5351, "rewards/accuracies": 0.71875, "rewards/chosen": -4.880048751831055, "rewards/margins": 0.6726697683334351, "rewards/rejected": -5.552718639373779, "step": 636 }, { "epoch": 0.41603396195607806, "grad_norm": 17.526487722839356, "learning_rate": 1.451854748495403e-07, "logits/chosen": -0.6934026479721069, "logits/rejected": -0.7079018950462341, "logps/chosen": -806.213134765625, "logps/rejected": -963.346923828125, "loss": 0.4063, "rewards/accuracies": 0.9375, "rewards/chosen": -3.8228399753570557, "rewards/margins": 1.726550579071045, "rewards/rejected": -5.54939079284668, "step": 637 }, { "epoch": 0.4166870764960405, "grad_norm": 25.357018744901364, "learning_rate": 1.4498182887482252e-07, "logits/chosen": -0.8057832717895508, "logits/rejected": -0.7694687843322754, "logps/chosen": -928.901123046875, "logps/rejected": -1060.5814208984375, "loss": 0.3872, "rewards/accuracies": 0.90625, "rewards/chosen": -4.737601280212402, "rewards/margins": 1.5201215744018555, "rewards/rejected": -6.257723331451416, "step": 638 }, { "epoch": 0.41734019103600295, "grad_norm": 19.39989514006622, "learning_rate": 1.4477794876378612e-07, "logits/chosen": -0.7168833017349243, "logits/rejected": -0.7470553517341614, "logps/chosen": -852.4853515625, "logps/rejected": -945.4410400390625, "loss": 0.5718, "rewards/accuracies": 0.75, "rewards/chosen": -4.117666721343994, "rewards/margins": 1.0648146867752075, "rewards/rejected": -5.182481288909912, "step": 639 }, { "epoch": 0.41799330557596537, "grad_norm": 13.583508801617073, "learning_rate": 1.4457383557765383e-07, "logits/chosen": -0.8929958343505859, "logits/rejected": -0.885593831539154, "logps/chosen": -831.2813720703125, "logps/rejected": -959.333251953125, "loss": 0.4017, "rewards/accuracies": 0.90625, "rewards/chosen": -3.770887613296509, "rewards/margins": 1.2031605243682861, "rewards/rejected": -4.974048614501953, "step": 640 }, { "epoch": 0.41864642011592784, "grad_norm": 12.885105314816913, "learning_rate": 1.4436949037886155e-07, "logits/chosen": -0.8584612011909485, "logits/rejected": -0.9563580751419067, "logps/chosen": -973.1109619140625, "logps/rejected": -1170.2774658203125, "loss": 0.3817, "rewards/accuracies": 0.875, "rewards/chosen": -3.968479871749878, "rewards/margins": 1.4874968528747559, "rewards/rejected": -5.455976963043213, "step": 641 }, { "epoch": 0.41929953465589026, "grad_norm": 15.499673312261706, "learning_rate": 1.4416491423105285e-07, "logits/chosen": -0.5970382690429688, "logits/rejected": -0.4685077667236328, "logps/chosen": -756.71337890625, "logps/rejected": -854.1908569335938, "loss": 0.457, "rewards/accuracies": 0.84375, "rewards/chosen": -3.7428746223449707, "rewards/margins": 1.194618582725525, "rewards/rejected": -4.937493324279785, "step": 642 }, { "epoch": 0.41995264919585273, "grad_norm": 15.458842820703998, "learning_rate": 1.4396010819907338e-07, "logits/chosen": -0.7846943140029907, "logits/rejected": -0.790925145149231, "logps/chosen": -823.936279296875, "logps/rejected": -949.0782470703125, "loss": 0.4334, "rewards/accuracies": 0.78125, "rewards/chosen": -3.4161481857299805, "rewards/margins": 1.5297538042068481, "rewards/rejected": -4.945902347564697, "step": 643 }, { "epoch": 0.42060576373581515, "grad_norm": 15.070391792337258, "learning_rate": 1.4375507334896546e-07, "logits/chosen": -0.8074719309806824, "logits/rejected": -0.6977930068969727, "logps/chosen": -939.0458984375, "logps/rejected": -1025.165283203125, "loss": 0.4187, "rewards/accuracies": 0.78125, "rewards/chosen": -4.523794174194336, "rewards/margins": 1.1222283840179443, "rewards/rejected": -5.646022319793701, "step": 644 }, { "epoch": 0.4212588782757776, "grad_norm": 13.986160456147138, "learning_rate": 1.4354981074796232e-07, "logits/chosen": -0.8354430198669434, "logits/rejected": -0.6734627485275269, "logps/chosen": -740.5074462890625, "logps/rejected": -873.5528564453125, "loss": 0.4067, "rewards/accuracies": 0.84375, "rewards/chosen": -3.245814561843872, "rewards/margins": 1.577298641204834, "rewards/rejected": -4.823113441467285, "step": 645 }, { "epoch": 0.42191199281574004, "grad_norm": 12.125652049098292, "learning_rate": 1.433443214644827e-07, "logits/chosen": -0.7603698372840881, "logits/rejected": -0.6487007141113281, "logps/chosen": -890.7510375976562, "logps/rejected": -904.6582641601562, "loss": 0.426, "rewards/accuracies": 0.71875, "rewards/chosen": -3.7208075523376465, "rewards/margins": 0.801545262336731, "rewards/rejected": -4.522353172302246, "step": 646 }, { "epoch": 0.4225651073557025, "grad_norm": 20.464126839268978, "learning_rate": 1.4313860656812535e-07, "logits/chosen": -0.8351298570632935, "logits/rejected": -0.8417803049087524, "logps/chosen": -760.145751953125, "logps/rejected": -926.35546875, "loss": 0.4643, "rewards/accuracies": 0.84375, "rewards/chosen": -3.66671085357666, "rewards/margins": 1.3970162868499756, "rewards/rejected": -5.063727378845215, "step": 647 }, { "epoch": 0.42321822189566494, "grad_norm": 14.126796385995897, "learning_rate": 1.429326671296632e-07, "logits/chosen": -0.7931310534477234, "logits/rejected": -0.8309136629104614, "logps/chosen": -806.4236450195312, "logps/rejected": -945.98828125, "loss": 0.4526, "rewards/accuracies": 0.65625, "rewards/chosen": -4.0367279052734375, "rewards/margins": 0.9322269558906555, "rewards/rejected": -4.968955039978027, "step": 648 }, { "epoch": 0.4238713364356274, "grad_norm": 17.31979167681963, "learning_rate": 1.427265042210381e-07, "logits/chosen": -0.983731210231781, "logits/rejected": -0.9602134227752686, "logps/chosen": -863.263916015625, "logps/rejected": -1010.90234375, "loss": 0.4982, "rewards/accuracies": 0.875, "rewards/chosen": -3.9947147369384766, "rewards/margins": 1.413080096244812, "rewards/rejected": -5.407794952392578, "step": 649 }, { "epoch": 0.42452445097558983, "grad_norm": 26.22140907720917, "learning_rate": 1.4252011891535498e-07, "logits/chosen": -0.8368729948997498, "logits/rejected": -0.832813560962677, "logps/chosen": -890.2816162109375, "logps/rejected": -1042.296630859375, "loss": 0.414, "rewards/accuracies": 0.84375, "rewards/chosen": -4.053102016448975, "rewards/margins": 1.294475793838501, "rewards/rejected": -5.347578048706055, "step": 650 }, { "epoch": 0.4251775655155523, "grad_norm": 14.020114892351618, "learning_rate": 1.4231351228687644e-07, "logits/chosen": -0.8325139880180359, "logits/rejected": -0.6997128129005432, "logps/chosen": -826.017333984375, "logps/rejected": -1002.11181640625, "loss": 0.4405, "rewards/accuracies": 0.8125, "rewards/chosen": -3.8723702430725098, "rewards/margins": 1.774685263633728, "rewards/rejected": -5.647055149078369, "step": 651 }, { "epoch": 0.4258306800555147, "grad_norm": 12.265082335211462, "learning_rate": 1.421066854110171e-07, "logits/chosen": -0.9369992017745972, "logits/rejected": -0.8721651434898376, "logps/chosen": -889.888671875, "logps/rejected": -1001.230224609375, "loss": 0.4296, "rewards/accuracies": 0.75, "rewards/chosen": -4.376310348510742, "rewards/margins": 1.0269381999969482, "rewards/rejected": -5.403248310089111, "step": 652 }, { "epoch": 0.4264837945954772, "grad_norm": 11.090981454180124, "learning_rate": 1.4189963936433794e-07, "logits/chosen": -0.8955050706863403, "logits/rejected": -0.9158356189727783, "logps/chosen": -870.947021484375, "logps/rejected": -961.156494140625, "loss": 0.4353, "rewards/accuracies": 0.8125, "rewards/chosen": -4.03396463394165, "rewards/margins": 0.7431548833847046, "rewards/rejected": -4.777119159698486, "step": 653 }, { "epoch": 0.4271369091354396, "grad_norm": 30.52888285459347, "learning_rate": 1.4169237522454082e-07, "logits/chosen": -0.9265910983085632, "logits/rejected": -0.9025193452835083, "logps/chosen": -938.2091674804688, "logps/rejected": -1082.6446533203125, "loss": 0.462, "rewards/accuracies": 0.75, "rewards/chosen": -4.009795188903809, "rewards/margins": 1.1789731979370117, "rewards/rejected": -5.18876838684082, "step": 654 }, { "epoch": 0.4277900236754021, "grad_norm": 16.525087397047585, "learning_rate": 1.4148489407046272e-07, "logits/chosen": -0.8427670001983643, "logits/rejected": -0.8210782408714294, "logps/chosen": -1001.025146484375, "logps/rejected": -1058.7926025390625, "loss": 0.462, "rewards/accuracies": 0.8125, "rewards/chosen": -4.803376197814941, "rewards/margins": 0.7696138620376587, "rewards/rejected": -5.5729899406433105, "step": 655 }, { "epoch": 0.4284431382153645, "grad_norm": 14.107761532142954, "learning_rate": 1.412771969820703e-07, "logits/chosen": -0.7187002897262573, "logits/rejected": -0.6458846926689148, "logps/chosen": -889.3016967773438, "logps/rejected": -1031.6785888671875, "loss": 0.4785, "rewards/accuracies": 0.71875, "rewards/chosen": -4.112646579742432, "rewards/margins": 1.4312293529510498, "rewards/rejected": -5.543876647949219, "step": 656 }, { "epoch": 0.429096252755327, "grad_norm": 21.330855551906346, "learning_rate": 1.4106928504045414e-07, "logits/chosen": -0.7129623889923096, "logits/rejected": -0.7127180099487305, "logps/chosen": -966.5421752929688, "logps/rejected": -1104.1207275390625, "loss": 0.4478, "rewards/accuracies": 0.71875, "rewards/chosen": -4.271879196166992, "rewards/margins": 1.1747528314590454, "rewards/rejected": -5.44663143157959, "step": 657 }, { "epoch": 0.4297493672952894, "grad_norm": 14.415672966540916, "learning_rate": 1.4086115932782314e-07, "logits/chosen": -0.743895947933197, "logits/rejected": -0.8685862421989441, "logps/chosen": -790.4058227539062, "logps/rejected": -1067.7724609375, "loss": 0.4504, "rewards/accuracies": 0.8125, "rewards/chosen": -4.013057231903076, "rewards/margins": 1.7089455127716064, "rewards/rejected": -5.722002983093262, "step": 658 }, { "epoch": 0.43040248183525187, "grad_norm": 22.65133168747345, "learning_rate": 1.4065282092749898e-07, "logits/chosen": -0.8140444159507751, "logits/rejected": -0.7156209349632263, "logps/chosen": -901.9187622070312, "logps/rejected": -1011.9653930664062, "loss": 0.4053, "rewards/accuracies": 0.8125, "rewards/chosen": -4.2266526222229, "rewards/margins": 1.2894474267959595, "rewards/rejected": -5.5161004066467285, "step": 659 }, { "epoch": 0.4310555963752143, "grad_norm": 17.697468761787118, "learning_rate": 1.404442709239103e-07, "logits/chosen": -0.788597822189331, "logits/rejected": -0.812627375125885, "logps/chosen": -907.12109375, "logps/rejected": -1006.6986694335938, "loss": 0.3981, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9818243980407715, "rewards/margins": 1.407388687133789, "rewards/rejected": -5.3892130851745605, "step": 660 }, { "epoch": 0.43170871091517676, "grad_norm": 20.96372589073248, "learning_rate": 1.4023551040258722e-07, "logits/chosen": -0.8250223994255066, "logits/rejected": -0.8321212530136108, "logps/chosen": -907.60205078125, "logps/rejected": -1042.2557373046875, "loss": 0.4675, "rewards/accuracies": 0.78125, "rewards/chosen": -4.117773532867432, "rewards/margins": 1.2371721267700195, "rewards/rejected": -5.354945659637451, "step": 661 }, { "epoch": 0.4323618254551392, "grad_norm": 20.57356587015438, "learning_rate": 1.4002654045015573e-07, "logits/chosen": -0.8451354503631592, "logits/rejected": -0.8059527277946472, "logps/chosen": -847.0043334960938, "logps/rejected": -965.06787109375, "loss": 0.4463, "rewards/accuracies": 0.8125, "rewards/chosen": -4.082802772521973, "rewards/margins": 1.3779957294464111, "rewards/rejected": -5.4607977867126465, "step": 662 }, { "epoch": 0.43301493999510166, "grad_norm": 20.39338992868369, "learning_rate": 1.3981736215433168e-07, "logits/chosen": -0.8249819278717041, "logits/rejected": -0.7393394708633423, "logps/chosen": -887.72265625, "logps/rejected": -1003.6444091796875, "loss": 0.532, "rewards/accuracies": 0.78125, "rewards/chosen": -4.219474792480469, "rewards/margins": 1.191009283065796, "rewards/rejected": -5.4104838371276855, "step": 663 }, { "epoch": 0.4336680545350641, "grad_norm": 29.528290842632007, "learning_rate": 1.3960797660391568e-07, "logits/chosen": -1.028712511062622, "logits/rejected": -0.87873375415802, "logps/chosen": -936.6950073242188, "logps/rejected": -1085.1673583984375, "loss": 0.4479, "rewards/accuracies": 0.875, "rewards/chosen": -4.28346061706543, "rewards/margins": 1.7631897926330566, "rewards/rejected": -6.0466508865356445, "step": 664 }, { "epoch": 0.43432116907502655, "grad_norm": 25.500391101093133, "learning_rate": 1.393983848887869e-07, "logits/chosen": -0.7285988330841064, "logits/rejected": -0.785618245601654, "logps/chosen": -908.6463623046875, "logps/rejected": -1151.10546875, "loss": 0.4249, "rewards/accuracies": 0.75, "rewards/chosen": -4.462244510650635, "rewards/margins": 1.578071117401123, "rewards/rejected": -6.040315628051758, "step": 665 }, { "epoch": 0.43497428361498897, "grad_norm": 25.96776880620349, "learning_rate": 1.3918858809989772e-07, "logits/chosen": -0.8820406794548035, "logits/rejected": -0.8553460836410522, "logps/chosen": -833.931884765625, "logps/rejected": -943.404541015625, "loss": 0.4652, "rewards/accuracies": 0.78125, "rewards/chosen": -3.73722505569458, "rewards/margins": 1.2582281827926636, "rewards/rejected": -4.995452880859375, "step": 666 }, { "epoch": 0.43562739815495144, "grad_norm": 13.136623740418324, "learning_rate": 1.3897858732926794e-07, "logits/chosen": -0.9333454370498657, "logits/rejected": -0.7037588953971863, "logps/chosen": -877.7976684570312, "logps/rejected": -931.8585815429688, "loss": 0.3979, "rewards/accuracies": 0.875, "rewards/chosen": -4.332530975341797, "rewards/margins": 1.0355404615402222, "rewards/rejected": -5.36807107925415, "step": 667 }, { "epoch": 0.43628051269491386, "grad_norm": 15.93708248937252, "learning_rate": 1.387683836699791e-07, "logits/chosen": -0.7652785778045654, "logits/rejected": -0.712914228439331, "logps/chosen": -778.5343017578125, "logps/rejected": -999.83056640625, "loss": 0.4135, "rewards/accuracies": 0.9375, "rewards/chosen": -3.547429084777832, "rewards/margins": 1.8087109327316284, "rewards/rejected": -5.356139659881592, "step": 668 }, { "epoch": 0.43693362723487633, "grad_norm": 24.435455038292687, "learning_rate": 1.3855797821616888e-07, "logits/chosen": -0.8811261653900146, "logits/rejected": -0.8171039819717407, "logps/chosen": -949.3507080078125, "logps/rejected": -1035.7723388671875, "loss": 0.4635, "rewards/accuracies": 0.71875, "rewards/chosen": -4.635836601257324, "rewards/margins": 1.0779340267181396, "rewards/rejected": -5.713770866394043, "step": 669 }, { "epoch": 0.43758674177483875, "grad_norm": 22.738127013175763, "learning_rate": 1.3834737206302517e-07, "logits/chosen": -0.734673023223877, "logits/rejected": -0.7677603363990784, "logps/chosen": -923.2880859375, "logps/rejected": -1033.8685302734375, "loss": 0.485, "rewards/accuracies": 0.71875, "rewards/chosen": -4.669242858886719, "rewards/margins": 1.1409337520599365, "rewards/rejected": -5.810176849365234, "step": 670 }, { "epoch": 0.4382398563148012, "grad_norm": 20.543237861097456, "learning_rate": 1.3813656630678067e-07, "logits/chosen": -0.842354953289032, "logits/rejected": -0.7564468383789062, "logps/chosen": -931.6202392578125, "logps/rejected": -1077.499755859375, "loss": 0.4288, "rewards/accuracies": 0.78125, "rewards/chosen": -4.472886085510254, "rewards/margins": 1.3071494102478027, "rewards/rejected": -5.780035495758057, "step": 671 }, { "epoch": 0.43889297085476364, "grad_norm": 13.57877629047864, "learning_rate": 1.3792556204470697e-07, "logits/chosen": -0.8747435808181763, "logits/rejected": -0.8501456379890442, "logps/chosen": -927.4142456054688, "logps/rejected": -1086.185791015625, "loss": 0.4252, "rewards/accuracies": 0.78125, "rewards/chosen": -4.513038635253906, "rewards/margins": 1.0269978046417236, "rewards/rejected": -5.540036201477051, "step": 672 }, { "epoch": 0.4395460853947261, "grad_norm": 17.356762341065927, "learning_rate": 1.3771436037510896e-07, "logits/chosen": -0.7796288132667542, "logits/rejected": -0.6895512342453003, "logps/chosen": -933.6649169921875, "logps/rejected": -1087.0875244140625, "loss": 0.4412, "rewards/accuracies": 0.75, "rewards/chosen": -4.490601062774658, "rewards/margins": 1.5325762033462524, "rewards/rejected": -6.023177146911621, "step": 673 }, { "epoch": 0.44019919993468853, "grad_norm": 13.574005853463984, "learning_rate": 1.3750296239731897e-07, "logits/chosen": -0.7811002731323242, "logits/rejected": -0.851697564125061, "logps/chosen": -782.3571166992188, "logps/rejected": -907.9415283203125, "loss": 0.4683, "rewards/accuracies": 0.875, "rewards/chosen": -3.627669334411621, "rewards/margins": 1.0533050298690796, "rewards/rejected": -4.68097448348999, "step": 674 }, { "epoch": 0.440852314474651, "grad_norm": 19.44076719795671, "learning_rate": 1.3729136921169127e-07, "logits/chosen": -0.8856003880500793, "logits/rejected": -0.8591895699501038, "logps/chosen": -834.085693359375, "logps/rejected": -988.325927734375, "loss": 0.465, "rewards/accuracies": 0.78125, "rewards/chosen": -3.8806254863739014, "rewards/margins": 1.1521306037902832, "rewards/rejected": -5.0327558517456055, "step": 675 }, { "epoch": 0.4415054290146134, "grad_norm": 16.00239891382966, "learning_rate": 1.3707958191959609e-07, "logits/chosen": -0.8808274865150452, "logits/rejected": -0.8277475833892822, "logps/chosen": -858.7861938476562, "logps/rejected": -997.075439453125, "loss": 0.4338, "rewards/accuracies": 0.84375, "rewards/chosen": -3.8033220767974854, "rewards/margins": 1.478643774986267, "rewards/rejected": -5.281965732574463, "step": 676 }, { "epoch": 0.4421585435545759, "grad_norm": 21.70547381761294, "learning_rate": 1.3686760162341407e-07, "logits/chosen": -0.8979389071464539, "logits/rejected": -0.9083980917930603, "logps/chosen": -903.99560546875, "logps/rejected": -1149.4349365234375, "loss": 0.3465, "rewards/accuracies": 0.9375, "rewards/chosen": -4.51162576675415, "rewards/margins": 1.9777264595031738, "rewards/rejected": -6.489351749420166, "step": 677 }, { "epoch": 0.4428116580945383, "grad_norm": 14.144294296308374, "learning_rate": 1.3665542942653045e-07, "logits/chosen": -1.0189638137817383, "logits/rejected": -0.9529871940612793, "logps/chosen": -912.849609375, "logps/rejected": -973.5966186523438, "loss": 0.4492, "rewards/accuracies": 0.75, "rewards/chosen": -4.560410022735596, "rewards/margins": 0.8235808610916138, "rewards/rejected": -5.38399076461792, "step": 678 }, { "epoch": 0.4434647726345008, "grad_norm": 32.796961805908666, "learning_rate": 1.3644306643332938e-07, "logits/chosen": -0.6085844039916992, "logits/rejected": -0.6985953450202942, "logps/chosen": -815.2529907226562, "logps/rejected": -1073.2359619140625, "loss": 0.4895, "rewards/accuracies": 0.78125, "rewards/chosen": -4.089121341705322, "rewards/margins": 1.6417101621627808, "rewards/rejected": -5.730830669403076, "step": 679 }, { "epoch": 0.4441178871744632, "grad_norm": 13.862963460245135, "learning_rate": 1.3623051374918802e-07, "logits/chosen": -0.899061918258667, "logits/rejected": -0.8426032066345215, "logps/chosen": -995.0509033203125, "logps/rejected": -1092.8623046875, "loss": 0.4237, "rewards/accuracies": 0.8125, "rewards/chosen": -4.662611484527588, "rewards/margins": 1.1843613386154175, "rewards/rejected": -5.846972465515137, "step": 680 }, { "epoch": 0.4447710017144257, "grad_norm": 14.735642845594755, "learning_rate": 1.3601777248047104e-07, "logits/chosen": -0.6714251637458801, "logits/rejected": -0.747349739074707, "logps/chosen": -862.39697265625, "logps/rejected": -1099.48876953125, "loss": 0.4277, "rewards/accuracies": 0.8125, "rewards/chosen": -3.98651385307312, "rewards/margins": 1.7703197002410889, "rewards/rejected": -5.756833553314209, "step": 681 }, { "epoch": 0.4454241162543881, "grad_norm": 17.382626342442304, "learning_rate": 1.358048437345246e-07, "logits/chosen": -0.8407683968544006, "logits/rejected": -0.8594624400138855, "logps/chosen": -924.0106201171875, "logps/rejected": -1134.36865234375, "loss": 0.4479, "rewards/accuracies": 0.8125, "rewards/chosen": -4.400675296783447, "rewards/margins": 1.7937949895858765, "rewards/rejected": -6.194470405578613, "step": 682 }, { "epoch": 0.4460772307943506, "grad_norm": 14.86569909599256, "learning_rate": 1.3559172861967076e-07, "logits/chosen": -0.8785750865936279, "logits/rejected": -0.8424649238586426, "logps/chosen": -843.7998046875, "logps/rejected": -980.86767578125, "loss": 0.4113, "rewards/accuracies": 0.8125, "rewards/chosen": -3.896084785461426, "rewards/margins": 1.296176552772522, "rewards/rejected": -5.192261219024658, "step": 683 }, { "epoch": 0.446730345334313, "grad_norm": 13.07522953792525, "learning_rate": 1.3537842824520164e-07, "logits/chosen": -0.8316384553909302, "logits/rejected": -0.7714143991470337, "logps/chosen": -952.236572265625, "logps/rejected": -1082.39794921875, "loss": 0.3925, "rewards/accuracies": 0.84375, "rewards/chosen": -4.617538928985596, "rewards/margins": 1.4145046472549438, "rewards/rejected": -6.03204345703125, "step": 684 }, { "epoch": 0.44738345987427547, "grad_norm": 23.423333193136706, "learning_rate": 1.3516494372137366e-07, "logits/chosen": -0.641627311706543, "logits/rejected": -0.5470828413963318, "logps/chosen": -860.3857421875, "logps/rejected": -966.0579223632812, "loss": 0.4359, "rewards/accuracies": 0.8125, "rewards/chosen": -4.126357078552246, "rewards/margins": 1.1212894916534424, "rewards/rejected": -5.247646808624268, "step": 685 }, { "epoch": 0.4480365744142379, "grad_norm": 13.282071369818777, "learning_rate": 1.3495127615940178e-07, "logits/chosen": -0.8730409741401672, "logits/rejected": -0.7727816700935364, "logps/chosen": -948.9930419921875, "logps/rejected": -1041.864501953125, "loss": 0.386, "rewards/accuracies": 0.78125, "rewards/chosen": -4.173754692077637, "rewards/margins": 1.2896356582641602, "rewards/rejected": -5.463390827178955, "step": 686 }, { "epoch": 0.44868968895420036, "grad_norm": 14.831168635800106, "learning_rate": 1.347374266714537e-07, "logits/chosen": -0.6686832904815674, "logits/rejected": -0.7571280002593994, "logps/chosen": -874.6576538085938, "logps/rejected": -1133.1376953125, "loss": 0.3904, "rewards/accuracies": 0.9375, "rewards/chosen": -4.000096321105957, "rewards/margins": 2.115196466445923, "rewards/rejected": -6.115293502807617, "step": 687 }, { "epoch": 0.4493428034941628, "grad_norm": 23.559349783811268, "learning_rate": 1.34523396370644e-07, "logits/chosen": -0.7094777822494507, "logits/rejected": -0.715304434299469, "logps/chosen": -905.0133056640625, "logps/rejected": -1173.3590087890625, "loss": 0.4411, "rewards/accuracies": 0.90625, "rewards/chosen": -4.392083644866943, "rewards/margins": 2.4449682235717773, "rewards/rejected": -6.837051868438721, "step": 688 }, { "epoch": 0.44999591803412525, "grad_norm": 13.224110895762323, "learning_rate": 1.3430918637102846e-07, "logits/chosen": -0.9106907248497009, "logits/rejected": -0.8780658841133118, "logps/chosen": -859.8287353515625, "logps/rejected": -963.4639282226562, "loss": 0.3873, "rewards/accuracies": 0.90625, "rewards/chosen": -4.072977066040039, "rewards/margins": 1.1936553716659546, "rewards/rejected": -5.266632556915283, "step": 689 }, { "epoch": 0.45064903257408767, "grad_norm": 16.436858644686243, "learning_rate": 1.3409479778759828e-07, "logits/chosen": -0.858303427696228, "logits/rejected": -0.7822844982147217, "logps/chosen": -976.535400390625, "logps/rejected": -1135.9136962890625, "loss": 0.4392, "rewards/accuracies": 0.78125, "rewards/chosen": -5.008065700531006, "rewards/margins": 1.3923001289367676, "rewards/rejected": -6.400365829467773, "step": 690 }, { "epoch": 0.45130214711405015, "grad_norm": 13.43588871300013, "learning_rate": 1.3388023173627412e-07, "logits/chosen": -0.8778905272483826, "logits/rejected": -0.853691577911377, "logps/chosen": -942.0894165039062, "logps/rejected": -1106.460205078125, "loss": 0.3977, "rewards/accuracies": 0.84375, "rewards/chosen": -4.535030841827393, "rewards/margins": 1.272776484489441, "rewards/rejected": -5.807807445526123, "step": 691 }, { "epoch": 0.45195526165401256, "grad_norm": 16.480774943029072, "learning_rate": 1.3366548933390041e-07, "logits/chosen": -0.9276471138000488, "logits/rejected": -0.89919114112854, "logps/chosen": -926.0335693359375, "logps/rejected": -1051.7315673828125, "loss": 0.4608, "rewards/accuracies": 0.8125, "rewards/chosen": -4.60793399810791, "rewards/margins": 1.46094810962677, "rewards/rejected": -6.068882465362549, "step": 692 }, { "epoch": 0.45260837619397504, "grad_norm": 20.387508274609566, "learning_rate": 1.3345057169823951e-07, "logits/chosen": -0.7017565369606018, "logits/rejected": -0.7618687152862549, "logps/chosen": -863.0051879882812, "logps/rejected": -1051.503662109375, "loss": 0.4291, "rewards/accuracies": 0.75, "rewards/chosen": -4.024776458740234, "rewards/margins": 1.6188499927520752, "rewards/rejected": -5.6436262130737305, "step": 693 }, { "epoch": 0.45326149073393746, "grad_norm": 12.858851580750146, "learning_rate": 1.3323547994796595e-07, "logits/chosen": -0.9381989240646362, "logits/rejected": -0.8059816360473633, "logps/chosen": -887.5203247070312, "logps/rejected": -969.3154907226562, "loss": 0.4574, "rewards/accuracies": 0.8125, "rewards/chosen": -3.702977180480957, "rewards/margins": 1.40816068649292, "rewards/rejected": -5.111137390136719, "step": 694 }, { "epoch": 0.45391460527389993, "grad_norm": 23.743275342395183, "learning_rate": 1.3302021520266046e-07, "logits/chosen": -0.9305901527404785, "logits/rejected": -0.8084038496017456, "logps/chosen": -942.6619873046875, "logps/rejected": -1260.660400390625, "loss": 0.4013, "rewards/accuracies": 0.71875, "rewards/chosen": -4.92469596862793, "rewards/margins": 3.1635892391204834, "rewards/rejected": -8.088285446166992, "step": 695 }, { "epoch": 0.45456771981386235, "grad_norm": 11.805074350198156, "learning_rate": 1.3280477858280427e-07, "logits/chosen": -0.8544498682022095, "logits/rejected": -0.7820972800254822, "logps/chosen": -955.1072387695312, "logps/rejected": -1216.007568359375, "loss": 0.3515, "rewards/accuracies": 0.84375, "rewards/chosen": -4.631394863128662, "rewards/margins": 2.7073287963867188, "rewards/rejected": -7.3387227058410645, "step": 696 }, { "epoch": 0.4552208343538248, "grad_norm": 17.61036279740596, "learning_rate": 1.3258917120977328e-07, "logits/chosen": -0.7109684944152832, "logits/rejected": -0.7692880630493164, "logps/chosen": -930.1781005859375, "logps/rejected": -1101.009765625, "loss": 0.4095, "rewards/accuracies": 0.90625, "rewards/chosen": -4.458493232727051, "rewards/margins": 1.6143523454666138, "rewards/rejected": -6.072846412658691, "step": 697 }, { "epoch": 0.45587394889378724, "grad_norm": 22.083953625475885, "learning_rate": 1.3237339420583212e-07, "logits/chosen": -0.7885478734970093, "logits/rejected": -0.7560752034187317, "logps/chosen": -823.4185180664062, "logps/rejected": -974.829345703125, "loss": 0.421, "rewards/accuracies": 0.90625, "rewards/chosen": -3.8105380535125732, "rewards/margins": 1.4178028106689453, "rewards/rejected": -5.228341102600098, "step": 698 }, { "epoch": 0.4565270634337497, "grad_norm": 24.864037856177223, "learning_rate": 1.3215744869412835e-07, "logits/chosen": -0.801964521408081, "logits/rejected": -0.8227947950363159, "logps/chosen": -896.732177734375, "logps/rejected": -1030.0692138671875, "loss": 0.4344, "rewards/accuracies": 0.78125, "rewards/chosen": -4.365248680114746, "rewards/margins": 1.3140637874603271, "rewards/rejected": -5.679312705993652, "step": 699 }, { "epoch": 0.45718017797371213, "grad_norm": 26.55049991945704, "learning_rate": 1.3194133579868672e-07, "logits/chosen": -0.9605610370635986, "logits/rejected": -0.9034255146980286, "logps/chosen": -874.904296875, "logps/rejected": -1062.235107421875, "loss": 0.4538, "rewards/accuracies": 0.9375, "rewards/chosen": -4.245553493499756, "rewards/margins": 1.9700900316238403, "rewards/rejected": -6.215643882751465, "step": 700 }, { "epoch": 0.45718017797371213, "eval_logits/chosen": -0.6330902576446533, "eval_logits/rejected": -0.5789201855659485, "eval_logps/chosen": -912.4920043945312, "eval_logps/rejected": -1030.6561279296875, "eval_loss": 0.43503955006599426, "eval_rewards/accuracies": 0.7889999747276306, "eval_rewards/chosen": -4.385847568511963, "eval_rewards/margins": 1.2690269947052002, "eval_rewards/rejected": -5.654874324798584, "eval_runtime": 614.4325, "eval_samples_per_second": 6.51, "eval_steps_per_second": 0.407, "step": 700 }, { "epoch": 0.4578332925136746, "grad_norm": 22.12940309127682, "learning_rate": 1.317250566444032e-07, "logits/chosen": -0.9155985713005066, "logits/rejected": -0.7935531735420227, "logps/chosen": -908.0445556640625, "logps/rejected": -985.8743896484375, "loss": 0.4656, "rewards/accuracies": 0.78125, "rewards/chosen": -4.054415225982666, "rewards/margins": 0.8908652663230896, "rewards/rejected": -4.945281028747559, "step": 701 }, { "epoch": 0.458486407053637, "grad_norm": 15.953258643762798, "learning_rate": 1.3150861235703912e-07, "logits/chosen": -0.7887985706329346, "logits/rejected": -0.6869586110115051, "logps/chosen": -855.2444458007812, "logps/rejected": -927.878173828125, "loss": 0.4191, "rewards/accuracies": 0.8125, "rewards/chosen": -4.412371635437012, "rewards/margins": 0.866736888885498, "rewards/rejected": -5.279108047485352, "step": 702 }, { "epoch": 0.4591395215935995, "grad_norm": 28.696366226340047, "learning_rate": 1.3129200406321544e-07, "logits/chosen": -0.9091500043869019, "logits/rejected": -0.9133652448654175, "logps/chosen": -915.0131225585938, "logps/rejected": -1015.6453857421875, "loss": 0.4275, "rewards/accuracies": 0.78125, "rewards/chosen": -4.232908248901367, "rewards/margins": 1.2067118883132935, "rewards/rejected": -5.439619064331055, "step": 703 }, { "epoch": 0.4597926361335619, "grad_norm": 15.37871134771081, "learning_rate": 1.310752328904067e-07, "logits/chosen": -0.9574613571166992, "logits/rejected": -0.9025395512580872, "logps/chosen": -894.5764770507812, "logps/rejected": -1112.0906982421875, "loss": 0.4092, "rewards/accuracies": 0.875, "rewards/chosen": -4.34193754196167, "rewards/margins": 1.7960808277130127, "rewards/rejected": -6.1380181312561035, "step": 704 }, { "epoch": 0.4604457506735244, "grad_norm": 19.648239313623666, "learning_rate": 1.3085829996693524e-07, "logits/chosen": -0.7728855609893799, "logits/rejected": -0.755530595779419, "logps/chosen": -808.7198486328125, "logps/rejected": -996.7811279296875, "loss": 0.356, "rewards/accuracies": 0.84375, "rewards/chosen": -3.7491912841796875, "rewards/margins": 1.527405023574829, "rewards/rejected": -5.2765960693359375, "step": 705 }, { "epoch": 0.4610988652134868, "grad_norm": 16.7988475441762, "learning_rate": 1.3064120642196547e-07, "logits/chosen": -0.843777596950531, "logits/rejected": -0.9133669137954712, "logps/chosen": -928.4371948242188, "logps/rejected": -1059.2352294921875, "loss": 0.4309, "rewards/accuracies": 0.75, "rewards/chosen": -4.472860813140869, "rewards/margins": 1.21001398563385, "rewards/rejected": -5.68287467956543, "step": 706 }, { "epoch": 0.4617519797534493, "grad_norm": 26.756288502338244, "learning_rate": 1.304239533854977e-07, "logits/chosen": -0.7616496682167053, "logits/rejected": -0.7714404463768005, "logps/chosen": -957.3233032226562, "logps/rejected": -1242.140380859375, "loss": 0.3533, "rewards/accuracies": 0.875, "rewards/chosen": -4.836521625518799, "rewards/margins": 2.0891270637512207, "rewards/rejected": -6.925648212432861, "step": 707 }, { "epoch": 0.4624050942934117, "grad_norm": 20.678633601313965, "learning_rate": 1.3020654198836248e-07, "logits/chosen": -0.8029762506484985, "logits/rejected": -0.6649346947669983, "logps/chosen": -894.7766723632812, "logps/rejected": -961.3494262695312, "loss": 0.3826, "rewards/accuracies": 0.6875, "rewards/chosen": -4.524013042449951, "rewards/margins": 1.130009412765503, "rewards/rejected": -5.654022216796875, "step": 708 }, { "epoch": 0.4630582088333742, "grad_norm": 16.42456254623477, "learning_rate": 1.2998897336221468e-07, "logits/chosen": -0.6833517551422119, "logits/rejected": -0.796322226524353, "logps/chosen": -862.7698974609375, "logps/rejected": -1060.818359375, "loss": 0.3926, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9077136516571045, "rewards/margins": 1.6327791213989258, "rewards/rejected": -5.540492534637451, "step": 709 }, { "epoch": 0.4637113233733366, "grad_norm": 23.17514164745614, "learning_rate": 1.297712486395275e-07, "logits/chosen": -0.9213692545890808, "logits/rejected": -0.8998441100120544, "logps/chosen": -959.6565551757812, "logps/rejected": -1069.1971435546875, "loss": 0.4986, "rewards/accuracies": 0.78125, "rewards/chosen": -4.9136199951171875, "rewards/margins": 1.4247158765792847, "rewards/rejected": -6.338335990905762, "step": 710 }, { "epoch": 0.46436443791329907, "grad_norm": 16.192431938968557, "learning_rate": 1.295533689535867e-07, "logits/chosen": -0.7986994981765747, "logits/rejected": -0.826543390750885, "logps/chosen": -1040.4547119140625, "logps/rejected": -1238.6219482421875, "loss": 0.3827, "rewards/accuracies": 0.84375, "rewards/chosen": -5.081081867218018, "rewards/margins": 1.767729640007019, "rewards/rejected": -6.848812103271484, "step": 711 }, { "epoch": 0.4650175524532615, "grad_norm": 15.779594539219438, "learning_rate": 1.2933533543848462e-07, "logits/chosen": -0.5566374659538269, "logits/rejected": -0.5704978704452515, "logps/chosen": -902.4140625, "logps/rejected": -1187.8582763671875, "loss": 0.4045, "rewards/accuracies": 0.90625, "rewards/chosen": -4.638293743133545, "rewards/margins": 2.1582648754119873, "rewards/rejected": -6.796557903289795, "step": 712 }, { "epoch": 0.46567066699322396, "grad_norm": 21.262426685835567, "learning_rate": 1.2911714922911425e-07, "logits/chosen": -0.7912888526916504, "logits/rejected": -0.7453445792198181, "logps/chosen": -935.51513671875, "logps/rejected": -1078.4232177734375, "loss": 0.4809, "rewards/accuracies": 0.8125, "rewards/chosen": -4.693671226501465, "rewards/margins": 1.4786438941955566, "rewards/rejected": -6.172314643859863, "step": 713 }, { "epoch": 0.4663237815331864, "grad_norm": 17.990290034456308, "learning_rate": 1.2889881146116349e-07, "logits/chosen": -0.7075154185295105, "logits/rejected": -0.6138902902603149, "logps/chosen": -848.4879760742188, "logps/rejected": -1001.8304443359375, "loss": 0.4373, "rewards/accuracies": 0.84375, "rewards/chosen": -4.287723064422607, "rewards/margins": 1.51327383518219, "rewards/rejected": -5.800996780395508, "step": 714 }, { "epoch": 0.46697689607314885, "grad_norm": 20.00394069949329, "learning_rate": 1.2868032327110903e-07, "logits/chosen": -0.9849535226821899, "logits/rejected": -0.8561375737190247, "logps/chosen": -951.6922607421875, "logps/rejected": -1065.832275390625, "loss": 0.4215, "rewards/accuracies": 0.84375, "rewards/chosen": -4.533693313598633, "rewards/margins": 1.485561728477478, "rewards/rejected": -6.019254684448242, "step": 715 }, { "epoch": 0.46763001061311127, "grad_norm": 18.908563866657076, "learning_rate": 1.2846168579621054e-07, "logits/chosen": -0.8201822638511658, "logits/rejected": -0.7975326180458069, "logps/chosen": -933.5443725585938, "logps/rejected": -1024.7989501953125, "loss": 0.4563, "rewards/accuracies": 0.8125, "rewards/chosen": -4.578218936920166, "rewards/margins": 1.273808479309082, "rewards/rejected": -5.85202693939209, "step": 716 }, { "epoch": 0.46828312515307374, "grad_norm": 13.605006365729439, "learning_rate": 1.2824290017450478e-07, "logits/chosen": -0.8025465607643127, "logits/rejected": -0.8043302893638611, "logps/chosen": -825.5875244140625, "logps/rejected": -965.36962890625, "loss": 0.4272, "rewards/accuracies": 0.8125, "rewards/chosen": -3.7525651454925537, "rewards/margins": 1.5843777656555176, "rewards/rejected": -5.336942672729492, "step": 717 }, { "epoch": 0.46893623969303616, "grad_norm": 23.393158261741338, "learning_rate": 1.2802396754479957e-07, "logits/chosen": -0.9675735831260681, "logits/rejected": -0.8064472079277039, "logps/chosen": -1006.602294921875, "logps/rejected": -1092.76904296875, "loss": 0.4558, "rewards/accuracies": 0.90625, "rewards/chosen": -4.947066307067871, "rewards/margins": 1.4163665771484375, "rewards/rejected": -6.363432884216309, "step": 718 }, { "epoch": 0.46958935423299863, "grad_norm": 19.761890309512406, "learning_rate": 1.27804889046668e-07, "logits/chosen": -0.9035289883613586, "logits/rejected": -0.8827542066574097, "logps/chosen": -860.1765747070312, "logps/rejected": -967.9915771484375, "loss": 0.3329, "rewards/accuracies": 0.8125, "rewards/chosen": -4.125575065612793, "rewards/margins": 1.158940315246582, "rewards/rejected": -5.284515380859375, "step": 719 }, { "epoch": 0.47024246877296105, "grad_norm": 34.43053856299178, "learning_rate": 1.2758566582044235e-07, "logits/chosen": -0.8743783235549927, "logits/rejected": -0.8185849189758301, "logps/chosen": -904.7877807617188, "logps/rejected": -1013.7088012695312, "loss": 0.4482, "rewards/accuracies": 0.78125, "rewards/chosen": -4.368615627288818, "rewards/margins": 1.3164438009262085, "rewards/rejected": -5.685059547424316, "step": 720 }, { "epoch": 0.4708955833129235, "grad_norm": 17.2479400408533, "learning_rate": 1.273662990072083e-07, "logits/chosen": -0.5357871651649475, "logits/rejected": -0.5141369700431824, "logps/chosen": -915.4850463867188, "logps/rejected": -1039.861328125, "loss": 0.452, "rewards/accuracies": 0.75, "rewards/chosen": -4.497086524963379, "rewards/margins": 1.1182925701141357, "rewards/rejected": -5.615379333496094, "step": 721 }, { "epoch": 0.47154869785288595, "grad_norm": 30.52690046250129, "learning_rate": 1.2714678974879885e-07, "logits/chosen": -0.8510643839836121, "logits/rejected": -0.7935106754302979, "logps/chosen": -922.787841796875, "logps/rejected": -1031.25537109375, "loss": 0.4458, "rewards/accuracies": 0.75, "rewards/chosen": -4.576708793640137, "rewards/margins": 1.3589658737182617, "rewards/rejected": -5.935675144195557, "step": 722 }, { "epoch": 0.4722018123928484, "grad_norm": 12.51511568238771, "learning_rate": 1.2692713918778846e-07, "logits/chosen": -0.9695079326629639, "logits/rejected": -0.9029305577278137, "logps/chosen": -849.284423828125, "logps/rejected": -983.3785400390625, "loss": 0.3653, "rewards/accuracies": 0.9375, "rewards/chosen": -3.8184423446655273, "rewards/margins": 1.5116703510284424, "rewards/rejected": -5.330112457275391, "step": 723 }, { "epoch": 0.47285492693281084, "grad_norm": 21.528089592243187, "learning_rate": 1.2670734846748716e-07, "logits/chosen": -0.7152475714683533, "logits/rejected": -0.6749838590621948, "logps/chosen": -837.8988037109375, "logps/rejected": -995.8895263671875, "loss": 0.4078, "rewards/accuracies": 0.84375, "rewards/chosen": -3.6210410594940186, "rewards/margins": 1.5628869533538818, "rewards/rejected": -5.183927536010742, "step": 724 }, { "epoch": 0.4735080414727733, "grad_norm": 25.426263169248593, "learning_rate": 1.2648741873193445e-07, "logits/chosen": -0.9211537837982178, "logits/rejected": -0.8691627383232117, "logps/chosen": -927.373779296875, "logps/rejected": -1008.0757446289062, "loss": 0.4983, "rewards/accuracies": 0.75, "rewards/chosen": -4.5448899269104, "rewards/margins": 0.8503549695014954, "rewards/rejected": -5.395245552062988, "step": 725 }, { "epoch": 0.47416115601273573, "grad_norm": 13.836360831145551, "learning_rate": 1.2626735112589345e-07, "logits/chosen": -0.5572523474693298, "logits/rejected": -0.577164351940155, "logps/chosen": -775.7666625976562, "logps/rejected": -1003.75634765625, "loss": 0.3885, "rewards/accuracies": 0.84375, "rewards/chosen": -3.658046245574951, "rewards/margins": 1.7806352376937866, "rewards/rejected": -5.438681125640869, "step": 726 }, { "epoch": 0.4748142705526982, "grad_norm": 19.00950665056247, "learning_rate": 1.2604714679484488e-07, "logits/chosen": -0.9352363348007202, "logits/rejected": -0.9275373220443726, "logps/chosen": -850.5507202148438, "logps/rejected": -979.1115112304688, "loss": 0.4245, "rewards/accuracies": 0.875, "rewards/chosen": -4.22627067565918, "rewards/margins": 1.3016259670257568, "rewards/rejected": -5.527897357940674, "step": 727 }, { "epoch": 0.4754673850926606, "grad_norm": 16.93055884467082, "learning_rate": 1.2582680688498123e-07, "logits/chosen": -0.7490636706352234, "logits/rejected": -0.7202102541923523, "logps/chosen": -831.2103271484375, "logps/rejected": -961.669921875, "loss": 0.4218, "rewards/accuracies": 0.875, "rewards/chosen": -4.167855739593506, "rewards/margins": 1.4452309608459473, "rewards/rejected": -5.613086700439453, "step": 728 }, { "epoch": 0.4761204996326231, "grad_norm": 16.291144939826697, "learning_rate": 1.2560633254320057e-07, "logits/chosen": -0.7099978923797607, "logits/rejected": -0.9346813559532166, "logps/chosen": -838.0257568359375, "logps/rejected": -1143.6383056640625, "loss": 0.409, "rewards/accuracies": 0.8125, "rewards/chosen": -4.113926887512207, "rewards/margins": 1.7861765623092651, "rewards/rejected": -5.9001030921936035, "step": 729 }, { "epoch": 0.4767736141725855, "grad_norm": 27.67917554312354, "learning_rate": 1.2538572491710077e-07, "logits/chosen": -0.7133294939994812, "logits/rejected": -0.6676253080368042, "logps/chosen": -820.7550048828125, "logps/rejected": -966.079345703125, "loss": 0.4067, "rewards/accuracies": 0.84375, "rewards/chosen": -3.8138749599456787, "rewards/margins": 1.4336824417114258, "rewards/rejected": -5.247557163238525, "step": 730 }, { "epoch": 0.477426728712548, "grad_norm": 25.703958623986306, "learning_rate": 1.251649851549735e-07, "logits/chosen": -0.7251565456390381, "logits/rejected": -0.6387436985969543, "logps/chosen": -896.2338256835938, "logps/rejected": -1028.1199951171875, "loss": 0.4196, "rewards/accuracies": 0.875, "rewards/chosen": -4.099024295806885, "rewards/margins": 1.6823163032531738, "rewards/rejected": -5.781341075897217, "step": 731 }, { "epoch": 0.4780798432525104, "grad_norm": 37.02785000532457, "learning_rate": 1.2494411440579813e-07, "logits/chosen": -0.9833545088768005, "logits/rejected": -0.9329172968864441, "logps/chosen": -935.530517578125, "logps/rejected": -1039.677978515625, "loss": 0.3888, "rewards/accuracies": 0.8125, "rewards/chosen": -4.771054744720459, "rewards/margins": 1.2584686279296875, "rewards/rejected": -6.0295233726501465, "step": 732 }, { "epoch": 0.4787329577924729, "grad_norm": 25.79246674335453, "learning_rate": 1.2472311381923588e-07, "logits/chosen": -0.9473874568939209, "logits/rejected": -0.9712878465652466, "logps/chosen": -969.2967529296875, "logps/rejected": -1192.9619140625, "loss": 0.3645, "rewards/accuracies": 0.84375, "rewards/chosen": -3.9531495571136475, "rewards/margins": 2.2883496284484863, "rewards/rejected": -6.241498947143555, "step": 733 }, { "epoch": 0.4793860723324353, "grad_norm": 20.659869144712403, "learning_rate": 1.245019845456238e-07, "logits/chosen": -0.51751708984375, "logits/rejected": -0.42947906255722046, "logps/chosen": -836.533203125, "logps/rejected": -946.0396728515625, "loss": 0.4573, "rewards/accuracies": 0.6875, "rewards/chosen": -4.261209487915039, "rewards/margins": 1.200287103652954, "rewards/rejected": -5.461496829986572, "step": 734 }, { "epoch": 0.48003918687239777, "grad_norm": 21.72711267793662, "learning_rate": 1.2428072773596873e-07, "logits/chosen": -0.7008222937583923, "logits/rejected": -0.6094821691513062, "logps/chosen": -864.21435546875, "logps/rejected": -954.5537109375, "loss": 0.4959, "rewards/accuracies": 0.59375, "rewards/chosen": -4.461493015289307, "rewards/margins": 0.9662419557571411, "rewards/rejected": -5.427734375, "step": 735 }, { "epoch": 0.4806923014123602, "grad_norm": 15.65230950114833, "learning_rate": 1.2405934454194144e-07, "logits/chosen": -0.899868369102478, "logits/rejected": -0.8498630523681641, "logps/chosen": -852.909912109375, "logps/rejected": -932.8972778320312, "loss": 0.4457, "rewards/accuracies": 0.75, "rewards/chosen": -4.355042934417725, "rewards/margins": 0.8693320751190186, "rewards/rejected": -5.224374771118164, "step": 736 }, { "epoch": 0.48134541595232266, "grad_norm": 17.108068590790964, "learning_rate": 1.2383783611587044e-07, "logits/chosen": -0.8432599902153015, "logits/rejected": -0.843549907207489, "logps/chosen": -936.5597534179688, "logps/rejected": -1126.088134765625, "loss": 0.4673, "rewards/accuracies": 0.84375, "rewards/chosen": -4.594806671142578, "rewards/margins": 1.714801549911499, "rewards/rejected": -6.309607982635498, "step": 737 }, { "epoch": 0.4819985304922851, "grad_norm": 20.363289207881063, "learning_rate": 1.2361620361073617e-07, "logits/chosen": -1.0172386169433594, "logits/rejected": -0.8159488439559937, "logps/chosen": -1016.598388671875, "logps/rejected": -1100.7061767578125, "loss": 0.5102, "rewards/accuracies": 0.71875, "rewards/chosen": -5.072577476501465, "rewards/margins": 1.0971565246582031, "rewards/rejected": -6.169734001159668, "step": 738 }, { "epoch": 0.48265164503224756, "grad_norm": 14.671715900291483, "learning_rate": 1.2339444818016487e-07, "logits/chosen": -0.6325951218605042, "logits/rejected": -0.7422337532043457, "logps/chosen": -930.2354736328125, "logps/rejected": -1154.2427978515625, "loss": 0.4045, "rewards/accuracies": 0.8125, "rewards/chosen": -4.799300193786621, "rewards/margins": 1.4765405654907227, "rewards/rejected": -6.2758402824401855, "step": 739 }, { "epoch": 0.48330475957221, "grad_norm": 34.521623507158026, "learning_rate": 1.2317257097842262e-07, "logits/chosen": -0.9213210344314575, "logits/rejected": -0.7929114103317261, "logps/chosen": -853.6707763671875, "logps/rejected": -988.0006103515625, "loss": 0.5518, "rewards/accuracies": 0.75, "rewards/chosen": -4.203425407409668, "rewards/margins": 1.2507836818695068, "rewards/rejected": -5.454209327697754, "step": 740 }, { "epoch": 0.48395787411217245, "grad_norm": 43.15485782986652, "learning_rate": 1.2295057316040937e-07, "logits/chosen": -0.8311377763748169, "logits/rejected": -0.7532038688659668, "logps/chosen": -920.2929077148438, "logps/rejected": -970.626953125, "loss": 0.5301, "rewards/accuracies": 0.8125, "rewards/chosen": -4.7200517654418945, "rewards/margins": 0.9582325220108032, "rewards/rejected": -5.678284645080566, "step": 741 }, { "epoch": 0.48461098865213487, "grad_norm": 19.92124882456644, "learning_rate": 1.2272845588165287e-07, "logits/chosen": -0.7741049528121948, "logits/rejected": -0.9612672328948975, "logps/chosen": -770.1849365234375, "logps/rejected": -975.554931640625, "loss": 0.4058, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5785880088806152, "rewards/margins": 1.356839895248413, "rewards/rejected": -4.935428142547607, "step": 742 }, { "epoch": 0.48526410319209734, "grad_norm": 14.002013827452883, "learning_rate": 1.2250622029830272e-07, "logits/chosen": -0.8265233635902405, "logits/rejected": -0.7270675897598267, "logps/chosen": -918.1268310546875, "logps/rejected": -1094.3868408203125, "loss": 0.406, "rewards/accuracies": 0.8125, "rewards/chosen": -4.770752906799316, "rewards/margins": 1.5041320323944092, "rewards/rejected": -6.274885177612305, "step": 743 }, { "epoch": 0.48591721773205976, "grad_norm": 13.769152189829311, "learning_rate": 1.2228386756712425e-07, "logits/chosen": -0.9900491833686829, "logits/rejected": -0.978319525718689, "logps/chosen": -960.6400756835938, "logps/rejected": -1180.7225341796875, "loss": 0.4032, "rewards/accuracies": 0.875, "rewards/chosen": -4.634914398193359, "rewards/margins": 1.8906114101409912, "rewards/rejected": -6.5255255699157715, "step": 744 }, { "epoch": 0.48657033227202223, "grad_norm": 17.539591551900347, "learning_rate": 1.2206139884549258e-07, "logits/chosen": -0.8892086744308472, "logits/rejected": -0.9011514186859131, "logps/chosen": -989.2440185546875, "logps/rejected": -1094.5552978515625, "loss": 0.4855, "rewards/accuracies": 0.78125, "rewards/chosen": -4.743755340576172, "rewards/margins": 1.1824637651443481, "rewards/rejected": -5.926219940185547, "step": 745 }, { "epoch": 0.48722344681198465, "grad_norm": 16.1789735995074, "learning_rate": 1.218388152913866e-07, "logits/chosen": -0.6589198112487793, "logits/rejected": -0.5492499470710754, "logps/chosen": -767.7838745117188, "logps/rejected": -982.4749145507812, "loss": 0.4377, "rewards/accuracies": 0.6875, "rewards/chosen": -3.803617000579834, "rewards/margins": 1.8227593898773193, "rewards/rejected": -5.626376628875732, "step": 746 }, { "epoch": 0.4878765613519471, "grad_norm": 18.517762291790852, "learning_rate": 1.2161611806338287e-07, "logits/chosen": -0.8017712831497192, "logits/rejected": -0.7148299813270569, "logps/chosen": -857.5178833007812, "logps/rejected": -951.9879150390625, "loss": 0.407, "rewards/accuracies": 0.84375, "rewards/chosen": -3.7201151847839355, "rewards/margins": 1.41351318359375, "rewards/rejected": -5.1336283683776855, "step": 747 }, { "epoch": 0.48852967589190954, "grad_norm": 17.3742032384711, "learning_rate": 1.2139330832064973e-07, "logits/chosen": -0.933401346206665, "logits/rejected": -0.8776005506515503, "logps/chosen": -845.0396728515625, "logps/rejected": -936.0737915039062, "loss": 0.3894, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5093374252319336, "rewards/margins": 1.1748614311218262, "rewards/rejected": -4.68419885635376, "step": 748 }, { "epoch": 0.489182790431872, "grad_norm": 20.0598159118841, "learning_rate": 1.2117038722294108e-07, "logits/chosen": -0.8245954513549805, "logits/rejected": -0.8754929900169373, "logps/chosen": -804.7047729492188, "logps/rejected": -986.4611206054688, "loss": 0.4746, "rewards/accuracies": 0.8125, "rewards/chosen": -3.710789680480957, "rewards/margins": 1.3969413042068481, "rewards/rejected": -5.107730865478516, "step": 749 }, { "epoch": 0.48983590497183443, "grad_norm": 31.59240574933894, "learning_rate": 1.2094735593059044e-07, "logits/chosen": -0.9009556174278259, "logits/rejected": -0.8687411546707153, "logps/chosen": -878.0067749023438, "logps/rejected": -1024.1734619140625, "loss": 0.3774, "rewards/accuracies": 0.8125, "rewards/chosen": -4.03043270111084, "rewards/margins": 1.4627373218536377, "rewards/rejected": -5.493169784545898, "step": 750 }, { "epoch": 0.4904890195117969, "grad_norm": 24.791969285714874, "learning_rate": 1.2072421560450497e-07, "logits/chosen": -0.8229708075523376, "logits/rejected": -0.8028020262718201, "logps/chosen": -848.9623413085938, "logps/rejected": -948.7738037109375, "loss": 0.428, "rewards/accuracies": 0.8125, "rewards/chosen": -3.879707098007202, "rewards/margins": 1.1067575216293335, "rewards/rejected": -4.986464977264404, "step": 751 }, { "epoch": 0.4911421340517593, "grad_norm": 12.606071497414911, "learning_rate": 1.2050096740615933e-07, "logits/chosen": -0.8053447008132935, "logits/rejected": -0.8125733137130737, "logps/chosen": -849.5568237304688, "logps/rejected": -1000.6946411132812, "loss": 0.3875, "rewards/accuracies": 0.8125, "rewards/chosen": -4.221886157989502, "rewards/margins": 1.2606518268585205, "rewards/rejected": -5.482537269592285, "step": 752 }, { "epoch": 0.4917952485917218, "grad_norm": 14.52811675410469, "learning_rate": 1.2027761249758962e-07, "logits/chosen": -0.7513640522956848, "logits/rejected": -0.8120062351226807, "logps/chosen": -892.8887329101562, "logps/rejected": -1004.2350463867188, "loss": 0.449, "rewards/accuracies": 0.71875, "rewards/chosen": -4.481434345245361, "rewards/margins": 0.953445553779602, "rewards/rejected": -5.434879779815674, "step": 753 }, { "epoch": 0.4924483631316842, "grad_norm": 13.38995547174789, "learning_rate": 1.200541520413875e-07, "logits/chosen": -0.9067294597625732, "logits/rejected": -0.9803101420402527, "logps/chosen": -822.230224609375, "logps/rejected": -969.9388427734375, "loss": 0.3983, "rewards/accuracies": 0.90625, "rewards/chosen": -3.3725507259368896, "rewards/margins": 1.4337201118469238, "rewards/rejected": -4.806271076202393, "step": 754 }, { "epoch": 0.49310147767164664, "grad_norm": 19.628709746070033, "learning_rate": 1.1983058720069397e-07, "logits/chosen": -0.769875705242157, "logits/rejected": -0.6702077984809875, "logps/chosen": -857.1516723632812, "logps/rejected": -938.2532958984375, "loss": 0.4094, "rewards/accuracies": 0.78125, "rewards/chosen": -4.118803977966309, "rewards/margins": 1.0416544675827026, "rewards/rejected": -5.160459041595459, "step": 755 }, { "epoch": 0.4937545922116091, "grad_norm": 18.81970358020589, "learning_rate": 1.1960691913919326e-07, "logits/chosen": -0.7793917655944824, "logits/rejected": -0.7675676345825195, "logps/chosen": -877.9605102539062, "logps/rejected": -989.585205078125, "loss": 0.4474, "rewards/accuracies": 0.8125, "rewards/chosen": -4.07187032699585, "rewards/margins": 1.2758498191833496, "rewards/rejected": -5.347719192504883, "step": 756 }, { "epoch": 0.49440770675157153, "grad_norm": 20.09325182686205, "learning_rate": 1.19383149021107e-07, "logits/chosen": -0.8387940526008606, "logits/rejected": -0.8090571165084839, "logps/chosen": -885.7142944335938, "logps/rejected": -980.8731689453125, "loss": 0.3542, "rewards/accuracies": 0.875, "rewards/chosen": -3.5571885108947754, "rewards/margins": 1.6131629943847656, "rewards/rejected": -5.170351982116699, "step": 757 }, { "epoch": 0.495060821291534, "grad_norm": 27.816476529936935, "learning_rate": 1.1915927801118804e-07, "logits/chosen": -0.9003801345825195, "logits/rejected": -0.8655527830123901, "logps/chosen": -904.9478759765625, "logps/rejected": -1061.704345703125, "loss": 0.4283, "rewards/accuracies": 0.84375, "rewards/chosen": -4.322434902191162, "rewards/margins": 1.379239559173584, "rewards/rejected": -5.701674461364746, "step": 758 }, { "epoch": 0.4957139358314964, "grad_norm": 20.61033743233212, "learning_rate": 1.1893530727471428e-07, "logits/chosen": -0.9334514737129211, "logits/rejected": -0.8155126571655273, "logps/chosen": -946.715576171875, "logps/rejected": -1062.2564697265625, "loss": 0.4412, "rewards/accuracies": 0.8125, "rewards/chosen": -4.529923915863037, "rewards/margins": 1.363791823387146, "rewards/rejected": -5.8937153816223145, "step": 759 }, { "epoch": 0.4963670503714589, "grad_norm": 33.77412475553499, "learning_rate": 1.1871123797748283e-07, "logits/chosen": -0.6742445230484009, "logits/rejected": -0.5909000635147095, "logps/chosen": -949.4371337890625, "logps/rejected": -1071.842529296875, "loss": 0.4351, "rewards/accuracies": 0.78125, "rewards/chosen": -4.558448791503906, "rewards/margins": 1.3982633352279663, "rewards/rejected": -5.956711292266846, "step": 760 }, { "epoch": 0.4970201649114213, "grad_norm": 20.54838651893162, "learning_rate": 1.1848707128580375e-07, "logits/chosen": -0.7726311087608337, "logits/rejected": -0.775484561920166, "logps/chosen": -927.5745239257812, "logps/rejected": -1105.466552734375, "loss": 0.3525, "rewards/accuracies": 0.90625, "rewards/chosen": -4.482267379760742, "rewards/margins": 1.805901050567627, "rewards/rejected": -6.288168430328369, "step": 761 }, { "epoch": 0.4976732794513838, "grad_norm": 16.777204352831102, "learning_rate": 1.1826280836649409e-07, "logits/chosen": -0.7363873720169067, "logits/rejected": -0.6501478552818298, "logps/chosen": -897.8953857421875, "logps/rejected": -1098.8282470703125, "loss": 0.401, "rewards/accuracies": 0.9375, "rewards/chosen": -4.353043556213379, "rewards/margins": 1.7584478855133057, "rewards/rejected": -6.1114912033081055, "step": 762 }, { "epoch": 0.4983263939913462, "grad_norm": 24.636393241823914, "learning_rate": 1.180384503868717e-07, "logits/chosen": -0.7493852376937866, "logits/rejected": -0.723925769329071, "logps/chosen": -953.0501098632812, "logps/rejected": -1146.88134765625, "loss": 0.4202, "rewards/accuracies": 0.84375, "rewards/chosen": -4.643695831298828, "rewards/margins": 2.2372117042541504, "rewards/rejected": -6.8809075355529785, "step": 763 }, { "epoch": 0.4989795085313087, "grad_norm": 19.058482760823487, "learning_rate": 1.1781399851474931e-07, "logits/chosen": -0.9877965450286865, "logits/rejected": -0.9070134162902832, "logps/chosen": -930.2177734375, "logps/rejected": -1031.239501953125, "loss": 0.4575, "rewards/accuracies": 0.8125, "rewards/chosen": -4.326931476593018, "rewards/margins": 1.2556382417678833, "rewards/rejected": -5.582569599151611, "step": 764 }, { "epoch": 0.4996326230712711, "grad_norm": 16.17557117170361, "learning_rate": 1.175894539184284e-07, "logits/chosen": -0.9283071160316467, "logits/rejected": -0.8941454887390137, "logps/chosen": -929.899169921875, "logps/rejected": -1084.6240234375, "loss": 0.4407, "rewards/accuracies": 0.84375, "rewards/chosen": -4.056011199951172, "rewards/margins": 1.622609257698059, "rewards/rejected": -5.678620338439941, "step": 765 }, { "epoch": 0.5002857376112335, "grad_norm": 23.91162098054986, "learning_rate": 1.1736481776669305e-07, "logits/chosen": -0.8212779760360718, "logits/rejected": -0.8001962304115295, "logps/chosen": -878.6630859375, "logps/rejected": -988.41845703125, "loss": 0.4294, "rewards/accuracies": 0.71875, "rewards/chosen": -4.468136310577393, "rewards/margins": 1.1635451316833496, "rewards/rejected": -5.6316819190979, "step": 766 }, { "epoch": 0.500938852151196, "grad_norm": 27.218732486708504, "learning_rate": 1.171400912288038e-07, "logits/chosen": -0.8495869636535645, "logits/rejected": -0.702357292175293, "logps/chosen": -831.1826171875, "logps/rejected": -921.25390625, "loss": 0.4513, "rewards/accuracies": 0.8125, "rewards/chosen": -3.789102554321289, "rewards/margins": 1.2873955965042114, "rewards/rejected": -5.076498508453369, "step": 767 }, { "epoch": 0.5015919666911585, "grad_norm": 20.90437936826142, "learning_rate": 1.169152754744918e-07, "logits/chosen": -0.7195597290992737, "logits/rejected": -0.6627518534660339, "logps/chosen": -953.954345703125, "logps/rejected": -1121.614013671875, "loss": 0.4843, "rewards/accuracies": 0.84375, "rewards/chosen": -4.850507736206055, "rewards/margins": 1.3082853555679321, "rewards/rejected": -6.1587934494018555, "step": 768 }, { "epoch": 0.5022450812311209, "grad_norm": 13.808361407338461, "learning_rate": 1.1669037167395254e-07, "logits/chosen": -0.7798606157302856, "logits/rejected": -0.7846609354019165, "logps/chosen": -884.9368896484375, "logps/rejected": -1020.4482421875, "loss": 0.3611, "rewards/accuracies": 0.84375, "rewards/chosen": -4.367555618286133, "rewards/margins": 1.3237645626068115, "rewards/rejected": -5.691320419311523, "step": 769 }, { "epoch": 0.5028981957710833, "grad_norm": 15.626358450737365, "learning_rate": 1.164653809978398e-07, "logits/chosen": -0.8251121640205383, "logits/rejected": -0.8357376456260681, "logps/chosen": -813.7039794921875, "logps/rejected": -1000.0850830078125, "loss": 0.3555, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9221858978271484, "rewards/margins": 1.5360300540924072, "rewards/rejected": -5.458215713500977, "step": 770 }, { "epoch": 0.5035513103110458, "grad_norm": 16.252415595810294, "learning_rate": 1.1624030461725956e-07, "logits/chosen": -0.6500051021575928, "logits/rejected": -0.8026775121688843, "logps/chosen": -844.816162109375, "logps/rejected": -1006.4044799804688, "loss": 0.3957, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9311153888702393, "rewards/margins": 1.383944034576416, "rewards/rejected": -5.315058708190918, "step": 771 }, { "epoch": 0.5042044248510082, "grad_norm": 22.459666056011624, "learning_rate": 1.1601514370376389e-07, "logits/chosen": -0.5780275464057922, "logits/rejected": -0.5432345867156982, "logps/chosen": -926.1326904296875, "logps/rejected": -1077.6756591796875, "loss": 0.3715, "rewards/accuracies": 0.78125, "rewards/chosen": -4.724125862121582, "rewards/margins": 1.5845980644226074, "rewards/rejected": -6.3087239265441895, "step": 772 }, { "epoch": 0.5048575393909707, "grad_norm": 23.163130085012735, "learning_rate": 1.1578989942934488e-07, "logits/chosen": -0.9420453310012817, "logits/rejected": -0.7490870356559753, "logps/chosen": -1003.92822265625, "logps/rejected": -1117.533447265625, "loss": 0.3998, "rewards/accuracies": 0.78125, "rewards/chosen": -5.345033168792725, "rewards/margins": 1.4133516550064087, "rewards/rejected": -6.75838565826416, "step": 773 }, { "epoch": 0.5055106539309331, "grad_norm": 21.394993393127642, "learning_rate": 1.1556457296642847e-07, "logits/chosen": -0.7638024091720581, "logits/rejected": -0.85248202085495, "logps/chosen": -988.4688720703125, "logps/rejected": -1274.0184326171875, "loss": 0.4004, "rewards/accuracies": 0.8125, "rewards/chosen": -5.5560994148254395, "rewards/margins": 2.0841054916381836, "rewards/rejected": -7.640205383300781, "step": 774 }, { "epoch": 0.5061637684708956, "grad_norm": 18.81833935631505, "learning_rate": 1.1533916548786855e-07, "logits/chosen": -0.8116979002952576, "logits/rejected": -0.778827965259552, "logps/chosen": -870.8009643554688, "logps/rejected": -1005.8507080078125, "loss": 0.4318, "rewards/accuracies": 0.90625, "rewards/chosen": -4.558528423309326, "rewards/margins": 1.3536608219146729, "rewards/rejected": -5.912189483642578, "step": 775 }, { "epoch": 0.506816883010858, "grad_norm": 16.198840020111252, "learning_rate": 1.1511367816694051e-07, "logits/chosen": -0.7797116041183472, "logits/rejected": -0.7683050036430359, "logps/chosen": -939.5609741210938, "logps/rejected": -1083.218994140625, "loss": 0.4244, "rewards/accuracies": 0.84375, "rewards/chosen": -4.802543640136719, "rewards/margins": 1.5838314294815063, "rewards/rejected": -6.386374473571777, "step": 776 }, { "epoch": 0.5074699975508205, "grad_norm": 18.701770107685814, "learning_rate": 1.1488811217733549e-07, "logits/chosen": -0.977311372756958, "logits/rejected": -0.9297751188278198, "logps/chosen": -917.08642578125, "logps/rejected": -1083.6258544921875, "loss": 0.3714, "rewards/accuracies": 0.78125, "rewards/chosen": -4.588201999664307, "rewards/margins": 1.63888680934906, "rewards/rejected": -6.227088928222656, "step": 777 }, { "epoch": 0.5081231120907829, "grad_norm": 16.1923946331088, "learning_rate": 1.1466246869315406e-07, "logits/chosen": -1.0482027530670166, "logits/rejected": -0.9430612325668335, "logps/chosen": -1009.8977661132812, "logps/rejected": -1169.45556640625, "loss": 0.3695, "rewards/accuracies": 0.78125, "rewards/chosen": -5.51522970199585, "rewards/margins": 1.5812039375305176, "rewards/rejected": -7.096433639526367, "step": 778 }, { "epoch": 0.5087762266307454, "grad_norm": 18.00057572408695, "learning_rate": 1.1443674888890007e-07, "logits/chosen": -0.7985199093818665, "logits/rejected": -0.7626904249191284, "logps/chosen": -925.5953979492188, "logps/rejected": -1146.2274169921875, "loss": 0.3768, "rewards/accuracies": 0.90625, "rewards/chosen": -4.9369001388549805, "rewards/margins": 2.3211519718170166, "rewards/rejected": -7.258052349090576, "step": 779 }, { "epoch": 0.5094293411707078, "grad_norm": 15.115454811528636, "learning_rate": 1.1421095393947478e-07, "logits/chosen": -1.0339407920837402, "logits/rejected": -0.9772629737854004, "logps/chosen": -945.53662109375, "logps/rejected": -1116.2100830078125, "loss": 0.3776, "rewards/accuracies": 0.84375, "rewards/chosen": -4.286327838897705, "rewards/margins": 1.8986492156982422, "rewards/rejected": -6.184976577758789, "step": 780 }, { "epoch": 0.5100824557106702, "grad_norm": 20.665297297366077, "learning_rate": 1.1398508502017046e-07, "logits/chosen": -0.8846578001976013, "logits/rejected": -0.850710391998291, "logps/chosen": -868.7977294921875, "logps/rejected": -975.1135864257812, "loss": 0.4766, "rewards/accuracies": 0.75, "rewards/chosen": -4.497041702270508, "rewards/margins": 1.3014904260635376, "rewards/rejected": -5.798532485961914, "step": 781 }, { "epoch": 0.5107355702506327, "grad_norm": 18.340311788522833, "learning_rate": 1.1375914330666449e-07, "logits/chosen": -0.87464439868927, "logits/rejected": -0.9072346687316895, "logps/chosen": -1002.248291015625, "logps/rejected": -1136.2291259765625, "loss": 0.4377, "rewards/accuracies": 0.84375, "rewards/chosen": -4.940828800201416, "rewards/margins": 1.2929599285125732, "rewards/rejected": -6.233788013458252, "step": 782 }, { "epoch": 0.5113886847905952, "grad_norm": 17.06781548774511, "learning_rate": 1.1353312997501312e-07, "logits/chosen": -0.8003891110420227, "logits/rejected": -0.630307674407959, "logps/chosen": -902.5821533203125, "logps/rejected": -1034.3380126953125, "loss": 0.4738, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8484323024749756, "rewards/margins": 1.414013147354126, "rewards/rejected": -5.262445449829102, "step": 783 }, { "epoch": 0.5120417993305576, "grad_norm": 20.70545186944479, "learning_rate": 1.1330704620164537e-07, "logits/chosen": -0.901751697063446, "logits/rejected": -0.9353833198547363, "logps/chosen": -814.7222900390625, "logps/rejected": -914.0795288085938, "loss": 0.4576, "rewards/accuracies": 0.84375, "rewards/chosen": -4.048853397369385, "rewards/margins": 0.867429256439209, "rewards/rejected": -4.916282653808594, "step": 784 }, { "epoch": 0.51269491387052, "grad_norm": 19.132193339224816, "learning_rate": 1.1308089316335694e-07, "logits/chosen": -0.7908254265785217, "logits/rejected": -0.7592061161994934, "logps/chosen": -898.7520751953125, "logps/rejected": -1104.9476318359375, "loss": 0.3427, "rewards/accuracies": 0.9375, "rewards/chosen": -4.353885650634766, "rewards/margins": 1.955952763557434, "rewards/rejected": -6.309837818145752, "step": 785 }, { "epoch": 0.5133480284104824, "grad_norm": 19.766383065845503, "learning_rate": 1.1285467203730403e-07, "logits/chosen": -0.9346864819526672, "logits/rejected": -0.9409589171409607, "logps/chosen": -894.0338134765625, "logps/rejected": -956.7890014648438, "loss": 0.3966, "rewards/accuracies": 0.78125, "rewards/chosen": -4.301855087280273, "rewards/margins": 0.8575952649116516, "rewards/rejected": -5.159451007843018, "step": 786 }, { "epoch": 0.514001142950445, "grad_norm": 23.39755783333431, "learning_rate": 1.1262838400099733e-07, "logits/chosen": -0.7014452815055847, "logits/rejected": -0.716223418712616, "logps/chosen": -890.46142578125, "logps/rejected": -1055.3876953125, "loss": 0.4387, "rewards/accuracies": 0.875, "rewards/chosen": -4.431001663208008, "rewards/margins": 1.5780439376831055, "rewards/rejected": -6.009045600891113, "step": 787 }, { "epoch": 0.5146542574904074, "grad_norm": 21.934271494714906, "learning_rate": 1.1240203023229568e-07, "logits/chosen": -0.829107403755188, "logits/rejected": -0.712059497833252, "logps/chosen": -971.6460571289062, "logps/rejected": -1099.4808349609375, "loss": 0.4365, "rewards/accuracies": 0.75, "rewards/chosen": -4.3892669677734375, "rewards/margins": 1.1161062717437744, "rewards/rejected": -5.505373477935791, "step": 788 }, { "epoch": 0.5153073720303698, "grad_norm": 19.574727119246504, "learning_rate": 1.121756119094002e-07, "logits/chosen": -1.0405265092849731, "logits/rejected": -0.9297416806221008, "logps/chosen": -885.945556640625, "logps/rejected": -1020.0201416015625, "loss": 0.4007, "rewards/accuracies": 0.75, "rewards/chosen": -4.058424949645996, "rewards/margins": 1.41481351852417, "rewards/rejected": -5.473237991333008, "step": 789 }, { "epoch": 0.5159604865703322, "grad_norm": 18.852117751022757, "learning_rate": 1.1194913021084789e-07, "logits/chosen": -0.6187543272972107, "logits/rejected": -0.7156034111976624, "logps/chosen": -872.1130981445312, "logps/rejected": -1123.46875, "loss": 0.447, "rewards/accuracies": 0.8125, "rewards/chosen": -3.998103618621826, "rewards/margins": 1.690637469291687, "rewards/rejected": -5.688741683959961, "step": 790 }, { "epoch": 0.5166136011102948, "grad_norm": 32.281580806874125, "learning_rate": 1.1172258631550571e-07, "logits/chosen": -0.7386258244514465, "logits/rejected": -0.647053599357605, "logps/chosen": -860.7993774414062, "logps/rejected": -970.7279052734375, "loss": 0.4532, "rewards/accuracies": 0.8125, "rewards/chosen": -3.5283761024475098, "rewards/margins": 1.3366488218307495, "rewards/rejected": -4.865025043487549, "step": 791 }, { "epoch": 0.5172667156502572, "grad_norm": 26.745299451857544, "learning_rate": 1.1149598140256435e-07, "logits/chosen": -0.8861040472984314, "logits/rejected": -0.8817246556282043, "logps/chosen": -845.762451171875, "logps/rejected": -937.815185546875, "loss": 0.4173, "rewards/accuracies": 0.78125, "rewards/chosen": -3.8758373260498047, "rewards/margins": 0.9351273775100708, "rewards/rejected": -4.810964584350586, "step": 792 }, { "epoch": 0.5179198301902196, "grad_norm": 11.575282148376893, "learning_rate": 1.1126931665153212e-07, "logits/chosen": -0.7514525651931763, "logits/rejected": -0.7448426485061646, "logps/chosen": -910.6206665039062, "logps/rejected": -1089.5885009765625, "loss": 0.3624, "rewards/accuracies": 0.875, "rewards/chosen": -4.288670539855957, "rewards/margins": 1.8873575925827026, "rewards/rejected": -6.176028251647949, "step": 793 }, { "epoch": 0.518572944730182, "grad_norm": 16.52311325074282, "learning_rate": 1.1104259324222875e-07, "logits/chosen": -0.9778417944908142, "logits/rejected": -0.9302371144294739, "logps/chosen": -965.8187255859375, "logps/rejected": -1182.9871826171875, "loss": 0.3429, "rewards/accuracies": 0.96875, "rewards/chosen": -4.435500144958496, "rewards/margins": 2.29146671295166, "rewards/rejected": -6.7269673347473145, "step": 794 }, { "epoch": 0.5192260592701445, "grad_norm": 17.244378755919463, "learning_rate": 1.1081581235477935e-07, "logits/chosen": -0.9682803153991699, "logits/rejected": -0.9343181848526001, "logps/chosen": -969.009765625, "logps/rejected": -1048.1226806640625, "loss": 0.4333, "rewards/accuracies": 0.78125, "rewards/chosen": -4.81222677230835, "rewards/margins": 1.035292148590088, "rewards/rejected": -5.847518444061279, "step": 795 }, { "epoch": 0.519879173810107, "grad_norm": 16.66478304693279, "learning_rate": 1.1058897516960814e-07, "logits/chosen": -1.0090668201446533, "logits/rejected": -0.9655574560165405, "logps/chosen": -897.3173217773438, "logps/rejected": -1005.6980590820312, "loss": 0.3305, "rewards/accuracies": 0.9375, "rewards/chosen": -4.381283283233643, "rewards/margins": 1.1640375852584839, "rewards/rejected": -5.545321464538574, "step": 796 }, { "epoch": 0.5205322883500694, "grad_norm": 20.808507419653285, "learning_rate": 1.1036208286743245e-07, "logits/chosen": -0.7843279242515564, "logits/rejected": -0.7574375867843628, "logps/chosen": -872.26416015625, "logps/rejected": -1070.68798828125, "loss": 0.4145, "rewards/accuracies": 0.71875, "rewards/chosen": -4.312079429626465, "rewards/margins": 1.5701541900634766, "rewards/rejected": -5.882233619689941, "step": 797 }, { "epoch": 0.5211854028900318, "grad_norm": 20.86240755584441, "learning_rate": 1.1013513662925647e-07, "logits/chosen": -0.7959500551223755, "logits/rejected": -0.8046637773513794, "logps/chosen": -928.501953125, "logps/rejected": -1132.994140625, "loss": 0.4094, "rewards/accuracies": 0.875, "rewards/chosen": -4.8640923500061035, "rewards/margins": 1.7896808385849, "rewards/rejected": -6.653773307800293, "step": 798 }, { "epoch": 0.5218385174299943, "grad_norm": 17.250116169588317, "learning_rate": 1.0990813763636511e-07, "logits/chosen": -0.9496831297874451, "logits/rejected": -0.8449395895004272, "logps/chosen": -855.71630859375, "logps/rejected": -891.7100219726562, "loss": 0.4311, "rewards/accuracies": 0.65625, "rewards/chosen": -4.203281879425049, "rewards/margins": 0.8583158254623413, "rewards/rejected": -5.061598300933838, "step": 799 }, { "epoch": 0.5224916319699567, "grad_norm": 23.966712460917016, "learning_rate": 1.0968108707031791e-07, "logits/chosen": -0.9248429536819458, "logits/rejected": -0.8824765682220459, "logps/chosen": -1019.092041015625, "logps/rejected": -1253.128662109375, "loss": 0.35, "rewards/accuracies": 0.8125, "rewards/chosen": -4.74540376663208, "rewards/margins": 1.9570448398590088, "rewards/rejected": -6.70244836807251, "step": 800 }, { "epoch": 0.5224916319699567, "eval_logits/chosen": -0.6346727013587952, "eval_logits/rejected": -0.5778316855430603, "eval_logps/chosen": -945.1963500976562, "eval_logps/rejected": -1081.7843017578125, "eval_loss": 0.418577641248703, "eval_rewards/accuracies": 0.8009999990463257, "eval_rewards/chosen": -4.712891578674316, "eval_rewards/margins": 1.4532641172409058, "eval_rewards/rejected": -6.16615629196167, "eval_runtime": 620.2119, "eval_samples_per_second": 6.449, "eval_steps_per_second": 0.403, "step": 800 }, { "epoch": 0.5231447465099192, "grad_norm": 26.85233093827759, "learning_rate": 1.0945398611294285e-07, "logits/chosen": -0.7998533248901367, "logits/rejected": -0.5648460388183594, "logps/chosen": -1003.441162109375, "logps/rejected": -1087.8489990234375, "loss": 0.4996, "rewards/accuracies": 0.6875, "rewards/chosen": -5.341720104217529, "rewards/margins": 1.3586604595184326, "rewards/rejected": -6.700381278991699, "step": 801 }, { "epoch": 0.5237978610498816, "grad_norm": 17.38645527294449, "learning_rate": 1.092268359463302e-07, "logits/chosen": -0.7134398818016052, "logits/rejected": -0.6134901642799377, "logps/chosen": -907.7614135742188, "logps/rejected": -1068.71337890625, "loss": 0.4722, "rewards/accuracies": 0.8125, "rewards/chosen": -4.564774990081787, "rewards/margins": 1.775583267211914, "rewards/rejected": -6.340358734130859, "step": 802 }, { "epoch": 0.5244509755898441, "grad_norm": 19.71290467024556, "learning_rate": 1.0899963775282634e-07, "logits/chosen": -0.8641142249107361, "logits/rejected": -0.897304892539978, "logps/chosen": -932.3726196289062, "logps/rejected": -1118.64599609375, "loss": 0.4241, "rewards/accuracies": 0.78125, "rewards/chosen": -4.4526777267456055, "rewards/margins": 1.5689259767532349, "rewards/rejected": -6.021603584289551, "step": 803 }, { "epoch": 0.5251040901298065, "grad_norm": 18.33626618763272, "learning_rate": 1.0877239271502772e-07, "logits/chosen": -0.8803704977035522, "logits/rejected": -0.6588385105133057, "logps/chosen": -873.22021484375, "logps/rejected": -1023.843994140625, "loss": 0.3665, "rewards/accuracies": 0.9375, "rewards/chosen": -4.044747352600098, "rewards/margins": 1.8748221397399902, "rewards/rejected": -5.919569492340088, "step": 804 }, { "epoch": 0.5257572046697689, "grad_norm": 17.680349628821233, "learning_rate": 1.0854510201577449e-07, "logits/chosen": -0.8897794485092163, "logits/rejected": -0.9353486895561218, "logps/chosen": -917.898193359375, "logps/rejected": -1087.0511474609375, "loss": 0.4349, "rewards/accuracies": 0.78125, "rewards/chosen": -4.276698589324951, "rewards/margins": 1.452713131904602, "rewards/rejected": -5.729411602020264, "step": 805 }, { "epoch": 0.5264103192097314, "grad_norm": 21.11052512501825, "learning_rate": 1.0831776683814464e-07, "logits/chosen": -0.9259909987449646, "logits/rejected": -0.8548303842544556, "logps/chosen": -880.1243286132812, "logps/rejected": -971.2887573242188, "loss": 0.4331, "rewards/accuracies": 0.71875, "rewards/chosen": -4.141092300415039, "rewards/margins": 1.3644092082977295, "rewards/rejected": -5.5055012702941895, "step": 806 }, { "epoch": 0.5270634337496939, "grad_norm": 19.97601485174499, "learning_rate": 1.0809038836544751e-07, "logits/chosen": -0.7640643119812012, "logits/rejected": -0.7135207056999207, "logps/chosen": -939.5198974609375, "logps/rejected": -1051.8248291015625, "loss": 0.4668, "rewards/accuracies": 0.84375, "rewards/chosen": -4.619152069091797, "rewards/margins": 1.2952475547790527, "rewards/rejected": -5.914400100708008, "step": 807 }, { "epoch": 0.5277165482896563, "grad_norm": 23.39884997243589, "learning_rate": 1.0786296778121786e-07, "logits/chosen": -0.6651238203048706, "logits/rejected": -0.6047101020812988, "logps/chosen": -918.5106201171875, "logps/rejected": -1037.3143310546875, "loss": 0.3461, "rewards/accuracies": 0.84375, "rewards/chosen": -4.469816207885742, "rewards/margins": 1.4443023204803467, "rewards/rejected": -5.914118766784668, "step": 808 }, { "epoch": 0.5283696628296187, "grad_norm": 22.14266673501406, "learning_rate": 1.0763550626920972e-07, "logits/chosen": -0.8417829275131226, "logits/rejected": -0.6736387014389038, "logps/chosen": -863.1395874023438, "logps/rejected": -891.3818359375, "loss": 0.4795, "rewards/accuracies": 0.65625, "rewards/chosen": -4.130784034729004, "rewards/margins": 0.5567496418952942, "rewards/rejected": -4.687533378601074, "step": 809 }, { "epoch": 0.5290227773695811, "grad_norm": 16.303972428695925, "learning_rate": 1.0740800501339007e-07, "logits/chosen": -0.7094486951828003, "logits/rejected": -0.6242579221725464, "logps/chosen": -942.656005859375, "logps/rejected": -1013.3216552734375, "loss": 0.4109, "rewards/accuracies": 0.71875, "rewards/chosen": -4.7234578132629395, "rewards/margins": 1.2442445755004883, "rewards/rejected": -5.967702865600586, "step": 810 }, { "epoch": 0.5296758919095437, "grad_norm": 14.747642988422188, "learning_rate": 1.0718046519793276e-07, "logits/chosen": -0.995162308216095, "logits/rejected": -0.9179979562759399, "logps/chosen": -830.9282836914062, "logps/rejected": -976.679931640625, "loss": 0.4707, "rewards/accuracies": 0.90625, "rewards/chosen": -3.731590986251831, "rewards/margins": 1.6209721565246582, "rewards/rejected": -5.35256290435791, "step": 811 }, { "epoch": 0.5303290064495061, "grad_norm": 15.35127575032279, "learning_rate": 1.0695288800721239e-07, "logits/chosen": -0.7637011408805847, "logits/rejected": -0.7296819090843201, "logps/chosen": -851.900146484375, "logps/rejected": -968.2127685546875, "loss": 0.4265, "rewards/accuracies": 0.84375, "rewards/chosen": -3.94958758354187, "rewards/margins": 1.1997019052505493, "rewards/rejected": -5.149289131164551, "step": 812 }, { "epoch": 0.5309821209894685, "grad_norm": 13.572049988213855, "learning_rate": 1.0672527462579808e-07, "logits/chosen": -1.0328404903411865, "logits/rejected": -0.9988901019096375, "logps/chosen": -1081.924560546875, "logps/rejected": -1306.794189453125, "loss": 0.4245, "rewards/accuracies": 0.9375, "rewards/chosen": -5.272036552429199, "rewards/margins": 2.137697219848633, "rewards/rejected": -7.409733772277832, "step": 813 }, { "epoch": 0.5316352355294309, "grad_norm": 19.156616594792975, "learning_rate": 1.0649762623844732e-07, "logits/chosen": -0.9236339330673218, "logits/rejected": -0.8460508584976196, "logps/chosen": -954.2943115234375, "logps/rejected": -1075.7880859375, "loss": 0.3284, "rewards/accuracies": 0.84375, "rewards/chosen": -4.132830619812012, "rewards/margins": 1.6366045475006104, "rewards/rejected": -5.769435405731201, "step": 814 }, { "epoch": 0.5322883500693935, "grad_norm": 23.48249212079967, "learning_rate": 1.0626994403009984e-07, "logits/chosen": -0.8499413728713989, "logits/rejected": -0.8164854645729065, "logps/chosen": -830.4996337890625, "logps/rejected": -956.6808471679688, "loss": 0.4009, "rewards/accuracies": 0.84375, "rewards/chosen": -4.115767478942871, "rewards/margins": 1.3039559125900269, "rewards/rejected": -5.419723033905029, "step": 815 }, { "epoch": 0.5329414646093559, "grad_norm": 16.588524539188985, "learning_rate": 1.0604222918587138e-07, "logits/chosen": -0.8046386241912842, "logits/rejected": -0.8380962610244751, "logps/chosen": -883.6956787109375, "logps/rejected": -1103.479248046875, "loss": 0.3659, "rewards/accuracies": 0.84375, "rewards/chosen": -4.380067825317383, "rewards/margins": 1.518940806388855, "rewards/rejected": -5.899008750915527, "step": 816 }, { "epoch": 0.5335945791493183, "grad_norm": 37.47838079132424, "learning_rate": 1.0581448289104757e-07, "logits/chosen": -0.8544302582740784, "logits/rejected": -0.8431679010391235, "logps/chosen": -943.7468872070312, "logps/rejected": -1091.618896484375, "loss": 0.4921, "rewards/accuracies": 0.90625, "rewards/chosen": -4.296459674835205, "rewards/margins": 1.5792046785354614, "rewards/rejected": -5.875664234161377, "step": 817 }, { "epoch": 0.5342476936892807, "grad_norm": 17.35937964327749, "learning_rate": 1.0558670633107778e-07, "logits/chosen": -0.8644323348999023, "logits/rejected": -0.7818232774734497, "logps/chosen": -911.686279296875, "logps/rejected": -1052.9893798828125, "loss": 0.3812, "rewards/accuracies": 0.875, "rewards/chosen": -4.129489898681641, "rewards/margins": 1.676690697669983, "rewards/rejected": -5.806180477142334, "step": 818 }, { "epoch": 0.5349008082292432, "grad_norm": 22.2328867221953, "learning_rate": 1.0535890069156883e-07, "logits/chosen": -0.7671269178390503, "logits/rejected": -0.7240158319473267, "logps/chosen": -901.3548583984375, "logps/rejected": -1148.04248046875, "loss": 0.3978, "rewards/accuracies": 0.875, "rewards/chosen": -4.508139133453369, "rewards/margins": 1.8734978437423706, "rewards/rejected": -6.381637096405029, "step": 819 }, { "epoch": 0.5355539227692057, "grad_norm": 18.971039193243993, "learning_rate": 1.0513106715827895e-07, "logits/chosen": -0.9428325891494751, "logits/rejected": -0.9358526468276978, "logps/chosen": -1081.2198486328125, "logps/rejected": -1319.913330078125, "loss": 0.4345, "rewards/accuracies": 0.8125, "rewards/chosen": -6.017473220825195, "rewards/margins": 1.9838080406188965, "rewards/rejected": -8.00128173828125, "step": 820 }, { "epoch": 0.5362070373091681, "grad_norm": 20.059049388946285, "learning_rate": 1.0490320691711161e-07, "logits/chosen": -0.8310408592224121, "logits/rejected": -0.679469108581543, "logps/chosen": -968.2921752929688, "logps/rejected": -1083.9666748046875, "loss": 0.4387, "rewards/accuracies": 0.84375, "rewards/chosen": -4.922040939331055, "rewards/margins": 1.4484070539474487, "rewards/rejected": -6.370449066162109, "step": 821 }, { "epoch": 0.5368601518491305, "grad_norm": 16.817851734320218, "learning_rate": 1.046753211541092e-07, "logits/chosen": -0.7916433811187744, "logits/rejected": -0.8427475094795227, "logps/chosen": -929.83447265625, "logps/rejected": -1193.4378662109375, "loss": 0.3673, "rewards/accuracies": 0.875, "rewards/chosen": -4.488502502441406, "rewards/margins": 2.309746265411377, "rewards/rejected": -6.798249244689941, "step": 822 }, { "epoch": 0.537513266389093, "grad_norm": 17.875343156058488, "learning_rate": 1.0444741105544703e-07, "logits/chosen": -0.7578955888748169, "logits/rejected": -0.6925256252288818, "logps/chosen": -952.1182861328125, "logps/rejected": -1144.278076171875, "loss": 0.411, "rewards/accuracies": 0.8125, "rewards/chosen": -4.83486270904541, "rewards/margins": 1.6570872068405151, "rewards/rejected": -6.491950035095215, "step": 823 }, { "epoch": 0.5381663809290554, "grad_norm": 18.053703232762427, "learning_rate": 1.0421947780742703e-07, "logits/chosen": -0.8508166670799255, "logits/rejected": -0.7643336653709412, "logps/chosen": -934.5198974609375, "logps/rejected": -1070.933349609375, "loss": 0.3955, "rewards/accuracies": 0.78125, "rewards/chosen": -4.615598678588867, "rewards/margins": 1.5782887935638428, "rewards/rejected": -6.193887233734131, "step": 824 }, { "epoch": 0.5388194954690179, "grad_norm": 19.898275287550767, "learning_rate": 1.0399152259647168e-07, "logits/chosen": -0.894473671913147, "logits/rejected": -0.8692827224731445, "logps/chosen": -901.361328125, "logps/rejected": -1145.5904541015625, "loss": 0.4026, "rewards/accuracies": 0.84375, "rewards/chosen": -3.9130971431732178, "rewards/margins": 2.1608455181121826, "rewards/rejected": -6.0739426612854, "step": 825 }, { "epoch": 0.5394726100089803, "grad_norm": 62.75164418514141, "learning_rate": 1.0376354660911771e-07, "logits/chosen": -0.9447248578071594, "logits/rejected": -0.8414398431777954, "logps/chosen": -960.39453125, "logps/rejected": -1077.34814453125, "loss": 0.3594, "rewards/accuracies": 0.875, "rewards/chosen": -4.909175395965576, "rewards/margins": 1.574775218963623, "rewards/rejected": -6.483950614929199, "step": 826 }, { "epoch": 0.5401257245489428, "grad_norm": 21.232910166545164, "learning_rate": 1.0353555103201006e-07, "logits/chosen": -0.8699998259544373, "logits/rejected": -0.9011096358299255, "logps/chosen": -966.4214477539062, "logps/rejected": -1145.947021484375, "loss": 0.3985, "rewards/accuracies": 0.875, "rewards/chosen": -4.497706890106201, "rewards/margins": 1.7452514171600342, "rewards/rejected": -6.242958068847656, "step": 827 }, { "epoch": 0.5407788390889052, "grad_norm": 25.240205454629475, "learning_rate": 1.0330753705189561e-07, "logits/chosen": -0.6747693419456482, "logits/rejected": -0.7031739354133606, "logps/chosen": -914.111083984375, "logps/rejected": -1141.014892578125, "loss": 0.4238, "rewards/accuracies": 0.8125, "rewards/chosen": -4.844748497009277, "rewards/margins": 1.6366426944732666, "rewards/rejected": -6.481391429901123, "step": 828 }, { "epoch": 0.5414319536288676, "grad_norm": 18.613133740822978, "learning_rate": 1.0307950585561704e-07, "logits/chosen": -0.7960176467895508, "logits/rejected": -0.5775362253189087, "logps/chosen": -913.5531616210938, "logps/rejected": -983.9921875, "loss": 0.4677, "rewards/accuracies": 0.6875, "rewards/chosen": -4.511989593505859, "rewards/margins": 1.0901737213134766, "rewards/rejected": -5.602163314819336, "step": 829 }, { "epoch": 0.5420850681688301, "grad_norm": 14.709016520023226, "learning_rate": 1.028514586301066e-07, "logits/chosen": -0.9919070601463318, "logits/rejected": -0.8352288007736206, "logps/chosen": -917.8429565429688, "logps/rejected": -978.2846069335938, "loss": 0.4225, "rewards/accuracies": 0.8125, "rewards/chosen": -4.310373306274414, "rewards/margins": 1.0810041427612305, "rewards/rejected": -5.391376972198486, "step": 830 }, { "epoch": 0.5427381827087926, "grad_norm": 16.588064198046702, "learning_rate": 1.0262339656238003e-07, "logits/chosen": -0.7326082587242126, "logits/rejected": -0.7825813889503479, "logps/chosen": -828.0997314453125, "logps/rejected": -912.7554931640625, "loss": 0.399, "rewards/accuracies": 0.78125, "rewards/chosen": -4.009592056274414, "rewards/margins": 0.8472993969917297, "rewards/rejected": -4.856891632080078, "step": 831 }, { "epoch": 0.543391297248755, "grad_norm": 20.963473622896114, "learning_rate": 1.0239532083953031e-07, "logits/chosen": -0.7908737659454346, "logits/rejected": -0.7778758406639099, "logps/chosen": -886.2369995117188, "logps/rejected": -959.3201904296875, "loss": 0.4719, "rewards/accuracies": 0.75, "rewards/chosen": -4.217853546142578, "rewards/margins": 0.9841310381889343, "rewards/rejected": -5.201984405517578, "step": 832 }, { "epoch": 0.5440444117887174, "grad_norm": 22.538640197716454, "learning_rate": 1.0216723264872145e-07, "logits/chosen": -0.633493185043335, "logits/rejected": -0.6329817771911621, "logps/chosen": -1038.0047607421875, "logps/rejected": -1209.1646728515625, "loss": 0.3923, "rewards/accuracies": 0.875, "rewards/chosen": -4.912166595458984, "rewards/margins": 1.709618330001831, "rewards/rejected": -6.6217851638793945, "step": 833 }, { "epoch": 0.5446975263286798, "grad_norm": 17.514305805925105, "learning_rate": 1.0193913317718243e-07, "logits/chosen": -0.8764938116073608, "logits/rejected": -0.9298416376113892, "logps/chosen": -828.791748046875, "logps/rejected": -1008.34912109375, "loss": 0.4711, "rewards/accuracies": 0.8125, "rewards/chosen": -4.27457857131958, "rewards/margins": 1.551622986793518, "rewards/rejected": -5.826201438903809, "step": 834 }, { "epoch": 0.5453506408686424, "grad_norm": 14.456609289815074, "learning_rate": 1.0171102361220092e-07, "logits/chosen": -0.9871069192886353, "logits/rejected": -0.9372472167015076, "logps/chosen": -894.3827514648438, "logps/rejected": -1032.8778076171875, "loss": 0.4152, "rewards/accuracies": 0.8125, "rewards/chosen": -3.961792469024658, "rewards/margins": 1.1698249578475952, "rewards/rejected": -5.131617069244385, "step": 835 }, { "epoch": 0.5460037554086048, "grad_norm": 17.467326646690836, "learning_rate": 1.014829051411171e-07, "logits/chosen": -0.7733380794525146, "logits/rejected": -0.7617398500442505, "logps/chosen": -855.3818969726562, "logps/rejected": -965.3425903320312, "loss": 0.4027, "rewards/accuracies": 0.8125, "rewards/chosen": -3.709343910217285, "rewards/margins": 1.39662504196167, "rewards/rejected": -5.105968952178955, "step": 836 }, { "epoch": 0.5466568699485672, "grad_norm": 17.238889166022133, "learning_rate": 1.0125477895131756e-07, "logits/chosen": -0.7189229130744934, "logits/rejected": -0.7574179172515869, "logps/chosen": -897.20361328125, "logps/rejected": -1149.623291015625, "loss": 0.3897, "rewards/accuracies": 0.875, "rewards/chosen": -4.643250465393066, "rewards/margins": 1.7560412883758545, "rewards/rejected": -6.399292469024658, "step": 837 }, { "epoch": 0.5473099844885296, "grad_norm": 27.297747074907825, "learning_rate": 1.0102664623022899e-07, "logits/chosen": -0.8005959391593933, "logits/rejected": -0.8152067065238953, "logps/chosen": -887.7992553710938, "logps/rejected": -983.334716796875, "loss": 0.5078, "rewards/accuracies": 0.75, "rewards/chosen": -4.507983207702637, "rewards/margins": 1.1464667320251465, "rewards/rejected": -5.654450416564941, "step": 838 }, { "epoch": 0.5479630990284922, "grad_norm": 19.71863457458606, "learning_rate": 1.0079850816531214e-07, "logits/chosen": -0.9236647486686707, "logits/rejected": -0.9478015899658203, "logps/chosen": -817.2036743164062, "logps/rejected": -914.030517578125, "loss": 0.3951, "rewards/accuracies": 0.8125, "rewards/chosen": -3.605985164642334, "rewards/margins": 1.0325517654418945, "rewards/rejected": -4.6385369300842285, "step": 839 }, { "epoch": 0.5486162135684546, "grad_norm": 14.089508088400418, "learning_rate": 1.0057036594405562e-07, "logits/chosen": -0.8286216259002686, "logits/rejected": -0.8908025026321411, "logps/chosen": -869.789306640625, "logps/rejected": -1080.75537109375, "loss": 0.4289, "rewards/accuracies": 0.75, "rewards/chosen": -3.9381942749023438, "rewards/margins": 1.8179608583450317, "rewards/rejected": -5.756155014038086, "step": 840 }, { "epoch": 0.549269328108417, "grad_norm": 20.274743213765152, "learning_rate": 1.0034222075396952e-07, "logits/chosen": -0.8915444016456604, "logits/rejected": -0.8345527052879333, "logps/chosen": -882.3980712890625, "logps/rejected": -1007.1998291015625, "loss": 0.4294, "rewards/accuracies": 0.84375, "rewards/chosen": -4.207620143890381, "rewards/margins": 1.7206785678863525, "rewards/rejected": -5.928298473358154, "step": 841 }, { "epoch": 0.5499224426483794, "grad_norm": 13.551649707228977, "learning_rate": 1.001140737825795e-07, "logits/chosen": -1.013258695602417, "logits/rejected": -0.9069733023643494, "logps/chosen": -881.470947265625, "logps/rejected": -1054.0950927734375, "loss": 0.333, "rewards/accuracies": 0.90625, "rewards/chosen": -4.0731520652771, "rewards/margins": 1.6127002239227295, "rewards/rejected": -5.68585205078125, "step": 842 }, { "epoch": 0.550575557188342, "grad_norm": 18.23035900921133, "learning_rate": 9.988592621742053e-08, "logits/chosen": -0.8569754958152771, "logits/rejected": -0.7348583340644836, "logps/chosen": -887.21044921875, "logps/rejected": -1064.620849609375, "loss": 0.3786, "rewards/accuracies": 0.84375, "rewards/chosen": -4.200411796569824, "rewards/margins": 1.6168982982635498, "rewards/rejected": -5.817309856414795, "step": 843 }, { "epoch": 0.5512286717283044, "grad_norm": 25.70182139027448, "learning_rate": 9.965777924603051e-08, "logits/chosen": -0.907223105430603, "logits/rejected": -0.9123849868774414, "logps/chosen": -909.4808959960938, "logps/rejected": -1078.512939453125, "loss": 0.4288, "rewards/accuracies": 0.84375, "rewards/chosen": -4.969564437866211, "rewards/margins": 1.5746479034423828, "rewards/rejected": -6.544212341308594, "step": 844 }, { "epoch": 0.5518817862682668, "grad_norm": 30.303009196449445, "learning_rate": 9.942963405594442e-08, "logits/chosen": -0.9817805290222168, "logits/rejected": -0.8983163833618164, "logps/chosen": -1022.9886474609375, "logps/rejected": -1152.95849609375, "loss": 0.4492, "rewards/accuracies": 0.875, "rewards/chosen": -4.9740095138549805, "rewards/margins": 1.571298360824585, "rewards/rejected": -6.545307636260986, "step": 845 }, { "epoch": 0.5525349008082292, "grad_norm": 31.55977335805332, "learning_rate": 9.920149183468785e-08, "logits/chosen": -0.7902059555053711, "logits/rejected": -0.7529444694519043, "logps/chosen": -986.4070434570312, "logps/rejected": -1099.160888671875, "loss": 0.4789, "rewards/accuracies": 0.78125, "rewards/chosen": -5.369345188140869, "rewards/margins": 1.2646675109863281, "rewards/rejected": -6.6340131759643555, "step": 846 }, { "epoch": 0.5531880153481917, "grad_norm": 27.81516473825978, "learning_rate": 9.897335376977101e-08, "logits/chosen": -0.8680393099784851, "logits/rejected": -0.8680979609489441, "logps/chosen": -784.2938232421875, "logps/rejected": -914.517333984375, "loss": 0.3675, "rewards/accuracies": 0.78125, "rewards/chosen": -3.907463550567627, "rewards/margins": 1.699120044708252, "rewards/rejected": -5.606583595275879, "step": 847 }, { "epoch": 0.5538411298881541, "grad_norm": 21.703345546628697, "learning_rate": 9.874522104868246e-08, "logits/chosen": -0.8262298703193665, "logits/rejected": -0.8438596725463867, "logps/chosen": -984.2900390625, "logps/rejected": -1112.5946044921875, "loss": 0.3453, "rewards/accuracies": 0.84375, "rewards/chosen": -4.441863536834717, "rewards/margins": 1.5138037204742432, "rewards/rejected": -5.955667972564697, "step": 848 }, { "epoch": 0.5544942444281166, "grad_norm": 16.37096708792487, "learning_rate": 9.85170948588829e-08, "logits/chosen": -0.8089182376861572, "logits/rejected": -0.7880966663360596, "logps/chosen": -907.8336791992188, "logps/rejected": -1043.2205810546875, "loss": 0.4129, "rewards/accuracies": 0.84375, "rewards/chosen": -4.8086628913879395, "rewards/margins": 1.415217638015747, "rewards/rejected": -6.223879814147949, "step": 849 }, { "epoch": 0.555147358968079, "grad_norm": 19.64590736722594, "learning_rate": 9.828897638779909e-08, "logits/chosen": -0.8265779614448547, "logits/rejected": -0.7707447409629822, "logps/chosen": -885.2160034179688, "logps/rejected": -1009.6092529296875, "loss": 0.4157, "rewards/accuracies": 0.875, "rewards/chosen": -4.476744174957275, "rewards/margins": 1.478574514389038, "rewards/rejected": -5.955318927764893, "step": 850 }, { "epoch": 0.5558004735080415, "grad_norm": 16.589320867103968, "learning_rate": 9.806086682281757e-08, "logits/chosen": -0.9139161109924316, "logits/rejected": -0.8683348298072815, "logps/chosen": -938.354248046875, "logps/rejected": -1108.0430908203125, "loss": 0.4426, "rewards/accuracies": 0.78125, "rewards/chosen": -4.8656134605407715, "rewards/margins": 1.4425888061523438, "rewards/rejected": -6.308202266693115, "step": 851 }, { "epoch": 0.5564535880480039, "grad_norm": 29.62901282273577, "learning_rate": 9.783276735127854e-08, "logits/chosen": -0.8697648644447327, "logits/rejected": -0.9227592945098877, "logps/chosen": -899.2152099609375, "logps/rejected": -1116.580810546875, "loss": 0.4923, "rewards/accuracies": 0.84375, "rewards/chosen": -4.2547454833984375, "rewards/margins": 1.4750183820724487, "rewards/rejected": -5.729763984680176, "step": 852 }, { "epoch": 0.5571067025879664, "grad_norm": 30.236006209514965, "learning_rate": 9.760467916046971e-08, "logits/chosen": -0.7272669076919556, "logits/rejected": -0.7988542318344116, "logps/chosen": -901.4462890625, "logps/rejected": -1169.47900390625, "loss": 0.3836, "rewards/accuracies": 0.84375, "rewards/chosen": -4.271294593811035, "rewards/margins": 2.1608831882476807, "rewards/rejected": -6.4321770668029785, "step": 853 }, { "epoch": 0.5577598171279288, "grad_norm": 14.287731139104126, "learning_rate": 9.737660343761998e-08, "logits/chosen": -0.7816422581672668, "logits/rejected": -0.7482393383979797, "logps/chosen": -966.7650756835938, "logps/rejected": -1164.88818359375, "loss": 0.3908, "rewards/accuracies": 0.875, "rewards/chosen": -5.111935615539551, "rewards/margins": 1.6342039108276367, "rewards/rejected": -6.7461395263671875, "step": 854 }, { "epoch": 0.5584129316678913, "grad_norm": 22.423108939531772, "learning_rate": 9.71485413698934e-08, "logits/chosen": -0.8687413930892944, "logits/rejected": -0.8252356052398682, "logps/chosen": -926.5127563476562, "logps/rejected": -1134.5728759765625, "loss": 0.4262, "rewards/accuracies": 0.84375, "rewards/chosen": -4.845583438873291, "rewards/margins": 1.908933401107788, "rewards/rejected": -6.754517078399658, "step": 855 }, { "epoch": 0.5590660462078537, "grad_norm": 33.852533096509646, "learning_rate": 9.692049414438298e-08, "logits/chosen": -0.7711067795753479, "logits/rejected": -0.7694792747497559, "logps/chosen": -851.6452026367188, "logps/rejected": -1003.3096313476562, "loss": 0.4085, "rewards/accuracies": 0.84375, "rewards/chosen": -4.205625057220459, "rewards/margins": 1.1972630023956299, "rewards/rejected": -5.402888298034668, "step": 856 }, { "epoch": 0.5597191607478161, "grad_norm": 20.80420263521599, "learning_rate": 9.66924629481044e-08, "logits/chosen": -0.6256682872772217, "logits/rejected": -0.6322759985923767, "logps/chosen": -906.5489501953125, "logps/rejected": -1021.0867309570312, "loss": 0.4294, "rewards/accuracies": 0.84375, "rewards/chosen": -4.7905402183532715, "rewards/margins": 1.0824882984161377, "rewards/rejected": -5.87302827835083, "step": 857 }, { "epoch": 0.5603722752877786, "grad_norm": 32.72902729562608, "learning_rate": 9.646444896798995e-08, "logits/chosen": -0.7554073333740234, "logits/rejected": -0.7180957198143005, "logps/chosen": -828.9449462890625, "logps/rejected": -949.5023193359375, "loss": 0.4937, "rewards/accuracies": 0.78125, "rewards/chosen": -4.3675971031188965, "rewards/margins": 1.031946063041687, "rewards/rejected": -5.399543762207031, "step": 858 }, { "epoch": 0.5610253898277411, "grad_norm": 18.554744408873283, "learning_rate": 9.623645339088228e-08, "logits/chosen": -0.9316878318786621, "logits/rejected": -0.744033932685852, "logps/chosen": -991.4241333007812, "logps/rejected": -1106.7291259765625, "loss": 0.435, "rewards/accuracies": 0.6875, "rewards/chosen": -4.765175819396973, "rewards/margins": 1.3670111894607544, "rewards/rejected": -6.1321868896484375, "step": 859 }, { "epoch": 0.5616785043677035, "grad_norm": 19.31098701896201, "learning_rate": 9.600847740352833e-08, "logits/chosen": -0.8235379457473755, "logits/rejected": -0.8043882846832275, "logps/chosen": -1014.3311157226562, "logps/rejected": -1210.599853515625, "loss": 0.4316, "rewards/accuracies": 0.75, "rewards/chosen": -5.12346887588501, "rewards/margins": 1.4316790103912354, "rewards/rejected": -6.555147647857666, "step": 860 }, { "epoch": 0.5623316189076659, "grad_norm": 19.831850467702388, "learning_rate": 9.578052219257297e-08, "logits/chosen": -0.874383807182312, "logits/rejected": -0.8897905349731445, "logps/chosen": -888.3280639648438, "logps/rejected": -1030.50244140625, "loss": 0.3993, "rewards/accuracies": 0.84375, "rewards/chosen": -3.6704063415527344, "rewards/margins": 1.5759150981903076, "rewards/rejected": -5.246321678161621, "step": 861 }, { "epoch": 0.5629847334476283, "grad_norm": 25.583440042617287, "learning_rate": 9.555258894455297e-08, "logits/chosen": -0.9648239612579346, "logits/rejected": -0.9095680713653564, "logps/chosen": -939.7294921875, "logps/rejected": -1098.48388671875, "loss": 0.4398, "rewards/accuracies": 0.71875, "rewards/chosen": -4.5690765380859375, "rewards/margins": 1.7343909740447998, "rewards/rejected": -6.303468227386475, "step": 862 }, { "epoch": 0.5636378479875909, "grad_norm": 23.956822440993626, "learning_rate": 9.532467884589079e-08, "logits/chosen": -0.8250060081481934, "logits/rejected": -0.8016010522842407, "logps/chosen": -864.0713500976562, "logps/rejected": -1081.85986328125, "loss": 0.4364, "rewards/accuracies": 0.84375, "rewards/chosen": -4.301777362823486, "rewards/margins": 1.6754717826843262, "rewards/rejected": -5.977249622344971, "step": 863 }, { "epoch": 0.5642909625275533, "grad_norm": 15.98416872196485, "learning_rate": 9.509679308288838e-08, "logits/chosen": -0.7682183980941772, "logits/rejected": -0.7244136929512024, "logps/chosen": -967.8103637695312, "logps/rejected": -1283.90380859375, "loss": 0.3473, "rewards/accuracies": 0.78125, "rewards/chosen": -4.515521049499512, "rewards/margins": 2.6907920837402344, "rewards/rejected": -7.206313133239746, "step": 864 }, { "epoch": 0.5649440770675157, "grad_norm": 26.277805017545777, "learning_rate": 9.486893284172101e-08, "logits/chosen": -0.9032926559448242, "logits/rejected": -0.7865481376647949, "logps/chosen": -837.1546630859375, "logps/rejected": -1026.15185546875, "loss": 0.4214, "rewards/accuracies": 0.78125, "rewards/chosen": -3.893397331237793, "rewards/margins": 1.903401494026184, "rewards/rejected": -5.796799182891846, "step": 865 }, { "epoch": 0.5655971916074781, "grad_norm": 18.34731751923566, "learning_rate": 9.464109930843119e-08, "logits/chosen": -0.7677436470985413, "logits/rejected": -0.7806234955787659, "logps/chosen": -898.8002319335938, "logps/rejected": -1071.4962158203125, "loss": 0.4379, "rewards/accuracies": 0.75, "rewards/chosen": -4.56309700012207, "rewards/margins": 1.2050151824951172, "rewards/rejected": -5.7681121826171875, "step": 866 }, { "epoch": 0.5662503061474407, "grad_norm": 20.18205861838406, "learning_rate": 9.441329366892222e-08, "logits/chosen": -0.9723510146141052, "logits/rejected": -0.8582167625427246, "logps/chosen": -932.4115600585938, "logps/rejected": -1021.81494140625, "loss": 0.4462, "rewards/accuracies": 0.8125, "rewards/chosen": -4.6146979331970215, "rewards/margins": 1.169451355934143, "rewards/rejected": -5.784149169921875, "step": 867 }, { "epoch": 0.5669034206874031, "grad_norm": 18.457641315699057, "learning_rate": 9.418551710895242e-08, "logits/chosen": -0.908024787902832, "logits/rejected": -0.8809236288070679, "logps/chosen": -964.94140625, "logps/rejected": -1143.9808349609375, "loss": 0.392, "rewards/accuracies": 0.78125, "rewards/chosen": -4.553819179534912, "rewards/margins": 1.8509615659713745, "rewards/rejected": -6.404780864715576, "step": 868 }, { "epoch": 0.5675565352273655, "grad_norm": 14.028941998053218, "learning_rate": 9.395777081412863e-08, "logits/chosen": -0.9205148220062256, "logits/rejected": -0.8955953121185303, "logps/chosen": -1036.298828125, "logps/rejected": -1205.27880859375, "loss": 0.3663, "rewards/accuracies": 0.84375, "rewards/chosen": -5.52469539642334, "rewards/margins": 1.6343746185302734, "rewards/rejected": -7.159069538116455, "step": 869 }, { "epoch": 0.5682096497673279, "grad_norm": 15.207559565633286, "learning_rate": 9.373005596990017e-08, "logits/chosen": -0.9882926940917969, "logits/rejected": -0.8943915367126465, "logps/chosen": -1000.8101196289062, "logps/rejected": -1069.3223876953125, "loss": 0.3789, "rewards/accuracies": 0.75, "rewards/chosen": -4.538582801818848, "rewards/margins": 1.1359279155731201, "rewards/rejected": -5.674510955810547, "step": 870 }, { "epoch": 0.5688627643072904, "grad_norm": 17.924430093356584, "learning_rate": 9.350237376155267e-08, "logits/chosen": -0.7669613361358643, "logits/rejected": -0.7748525738716125, "logps/chosen": -969.697265625, "logps/rejected": -1114.1466064453125, "loss": 0.4385, "rewards/accuracies": 0.84375, "rewards/chosen": -4.298352241516113, "rewards/margins": 1.4008281230926514, "rewards/rejected": -5.699180603027344, "step": 871 }, { "epoch": 0.5695158788472529, "grad_norm": 15.756432200133906, "learning_rate": 9.327472537420193e-08, "logits/chosen": -0.9131944179534912, "logits/rejected": -0.9345240592956543, "logps/chosen": -950.0737915039062, "logps/rejected": -1141.3934326171875, "loss": 0.4235, "rewards/accuracies": 0.84375, "rewards/chosen": -4.154352188110352, "rewards/margins": 1.5505449771881104, "rewards/rejected": -5.704896926879883, "step": 872 }, { "epoch": 0.5701689933872153, "grad_norm": 13.680882524525893, "learning_rate": 9.30471119927876e-08, "logits/chosen": -0.7933511734008789, "logits/rejected": -0.758368968963623, "logps/chosen": -911.67529296875, "logps/rejected": -1185.8714599609375, "loss": 0.3754, "rewards/accuracies": 0.84375, "rewards/chosen": -4.790996074676514, "rewards/margins": 2.2015562057495117, "rewards/rejected": -6.992551803588867, "step": 873 }, { "epoch": 0.5708221079271777, "grad_norm": 18.923502793123205, "learning_rate": 9.281953480206723e-08, "logits/chosen": -0.7157567143440247, "logits/rejected": -0.7758267521858215, "logps/chosen": -849.9529418945312, "logps/rejected": -1070.7559814453125, "loss": 0.4422, "rewards/accuracies": 0.78125, "rewards/chosen": -4.099109649658203, "rewards/margins": 1.9049654006958008, "rewards/rejected": -6.004075050354004, "step": 874 }, { "epoch": 0.5714752224671402, "grad_norm": 22.82247720714276, "learning_rate": 9.259199498660993e-08, "logits/chosen": -1.0105547904968262, "logits/rejected": -0.8255823850631714, "logps/chosen": -944.2193603515625, "logps/rejected": -1074.181884765625, "loss": 0.3415, "rewards/accuracies": 0.8125, "rewards/chosen": -4.413623332977295, "rewards/margins": 1.8328722715377808, "rewards/rejected": -6.246495246887207, "step": 875 }, { "epoch": 0.5721283370071026, "grad_norm": 35.206675729481915, "learning_rate": 9.236449373079026e-08, "logits/chosen": -0.9094786643981934, "logits/rejected": -0.778907060623169, "logps/chosen": -933.5310668945312, "logps/rejected": -1074.3388671875, "loss": 0.475, "rewards/accuracies": 0.71875, "rewards/chosen": -4.948477745056152, "rewards/margins": 1.522892951965332, "rewards/rejected": -6.471371173858643, "step": 876 }, { "epoch": 0.5727814515470651, "grad_norm": 27.421712157237085, "learning_rate": 9.213703221878215e-08, "logits/chosen": -0.7551379799842834, "logits/rejected": -0.7433313131332397, "logps/chosen": -939.6011962890625, "logps/rejected": -1161.37548828125, "loss": 0.3851, "rewards/accuracies": 0.8125, "rewards/chosen": -4.847814559936523, "rewards/margins": 1.955463171005249, "rewards/rejected": -6.803277969360352, "step": 877 }, { "epoch": 0.5734345660870275, "grad_norm": 16.61141184713571, "learning_rate": 9.190961163455253e-08, "logits/chosen": -0.8657856583595276, "logits/rejected": -0.8066181540489197, "logps/chosen": -973.77490234375, "logps/rejected": -1163.808349609375, "loss": 0.3738, "rewards/accuracies": 0.84375, "rewards/chosen": -4.766665458679199, "rewards/margins": 1.7785844802856445, "rewards/rejected": -6.545250415802002, "step": 878 }, { "epoch": 0.57408768062699, "grad_norm": 33.86515631869608, "learning_rate": 9.168223316185538e-08, "logits/chosen": -0.8339582681655884, "logits/rejected": -0.7555098533630371, "logps/chosen": -1008.7633056640625, "logps/rejected": -1209.2352294921875, "loss": 0.4745, "rewards/accuracies": 0.84375, "rewards/chosen": -5.217278480529785, "rewards/margins": 1.7660671472549438, "rewards/rejected": -6.983345985412598, "step": 879 }, { "epoch": 0.5747407951669524, "grad_norm": 23.838839628578985, "learning_rate": 9.14548979842255e-08, "logits/chosen": -0.8603442311286926, "logits/rejected": -0.7912033796310425, "logps/chosen": -1011.2332763671875, "logps/rejected": -1169.10205078125, "loss": 0.4029, "rewards/accuracies": 0.8125, "rewards/chosen": -5.176833152770996, "rewards/margins": 1.4864170551300049, "rewards/rejected": -6.663249969482422, "step": 880 }, { "epoch": 0.5753939097069148, "grad_norm": 24.08255938340959, "learning_rate": 9.12276072849723e-08, "logits/chosen": -0.9084462523460388, "logits/rejected": -0.9800143241882324, "logps/chosen": -992.2244873046875, "logps/rejected": -1158.918701171875, "loss": 0.3761, "rewards/accuracies": 0.875, "rewards/chosen": -4.683312892913818, "rewards/margins": 1.6894042491912842, "rewards/rejected": -6.372716903686523, "step": 881 }, { "epoch": 0.5760470242468773, "grad_norm": 20.741267761943313, "learning_rate": 9.100036224717366e-08, "logits/chosen": -0.8111915588378906, "logits/rejected": -0.8897655010223389, "logps/chosen": -938.0897216796875, "logps/rejected": -1127.0625, "loss": 0.4212, "rewards/accuracies": 0.75, "rewards/chosen": -4.022097110748291, "rewards/margins": 1.2158722877502441, "rewards/rejected": -5.237969875335693, "step": 882 }, { "epoch": 0.5767001387868398, "grad_norm": 17.520051364033677, "learning_rate": 9.077316405366981e-08, "logits/chosen": -0.8745783567428589, "logits/rejected": -0.867775559425354, "logps/chosen": -1060.55078125, "logps/rejected": -1236.429443359375, "loss": 0.3522, "rewards/accuracies": 0.875, "rewards/chosen": -5.311179161071777, "rewards/margins": 1.8504433631896973, "rewards/rejected": -7.161622047424316, "step": 883 }, { "epoch": 0.5773532533268022, "grad_norm": 17.560718564711973, "learning_rate": 9.054601388705715e-08, "logits/chosen": -0.8407562375068665, "logits/rejected": -0.7107415199279785, "logps/chosen": -1017.076416015625, "logps/rejected": -1125.1346435546875, "loss": 0.4278, "rewards/accuracies": 0.6875, "rewards/chosen": -5.200894832611084, "rewards/margins": 1.2635349035263062, "rewards/rejected": -6.464428901672363, "step": 884 }, { "epoch": 0.5780063678667646, "grad_norm": 17.42515113495744, "learning_rate": 9.03189129296821e-08, "logits/chosen": -0.598199725151062, "logits/rejected": -0.5980876684188843, "logps/chosen": -890.3635864257812, "logps/rejected": -1140.5980224609375, "loss": 0.3686, "rewards/accuracies": 0.875, "rewards/chosen": -4.948042869567871, "rewards/margins": 2.095231533050537, "rewards/rejected": -7.043274879455566, "step": 885 }, { "epoch": 0.578659482406727, "grad_norm": 16.370071766658146, "learning_rate": 9.00918623636349e-08, "logits/chosen": -0.8857473134994507, "logits/rejected": -0.7719743847846985, "logps/chosen": -1002.3225708007812, "logps/rejected": -1073.412353515625, "loss": 0.4162, "rewards/accuracies": 0.71875, "rewards/chosen": -5.398256778717041, "rewards/margins": 1.0608148574829102, "rewards/rejected": -6.459071636199951, "step": 886 }, { "epoch": 0.5793125969466896, "grad_norm": 20.019533693747015, "learning_rate": 8.986486337074354e-08, "logits/chosen": -0.8915666341781616, "logits/rejected": -0.9138790965080261, "logps/chosen": -909.7576904296875, "logps/rejected": -976.8128051757812, "loss": 0.476, "rewards/accuracies": 0.71875, "rewards/chosen": -4.364135265350342, "rewards/margins": 0.7550020217895508, "rewards/rejected": -5.119136810302734, "step": 887 }, { "epoch": 0.579965711486652, "grad_norm": 18.672808344529344, "learning_rate": 8.963791713256754e-08, "logits/chosen": -0.9769740104675293, "logits/rejected": -0.852337121963501, "logps/chosen": -977.1018676757812, "logps/rejected": -1132.941162109375, "loss": 0.3811, "rewards/accuracies": 0.8125, "rewards/chosen": -4.742093086242676, "rewards/margins": 1.554567813873291, "rewards/rejected": -6.296660423278809, "step": 888 }, { "epoch": 0.5806188260266144, "grad_norm": 27.233632516341583, "learning_rate": 8.941102483039187e-08, "logits/chosen": -0.9475686550140381, "logits/rejected": -0.8408269882202148, "logps/chosen": -985.3871459960938, "logps/rejected": -1131.696533203125, "loss": 0.3782, "rewards/accuracies": 0.84375, "rewards/chosen": -4.957531929016113, "rewards/margins": 1.5096138715744019, "rewards/rejected": -6.4671454429626465, "step": 889 }, { "epoch": 0.5812719405665768, "grad_norm": 19.66330179420604, "learning_rate": 8.918418764522068e-08, "logits/chosen": -0.7873989939689636, "logits/rejected": -0.8866724371910095, "logps/chosen": -841.0829467773438, "logps/rejected": -1159.834228515625, "loss": 0.34, "rewards/accuracies": 0.84375, "rewards/chosen": -3.916111469268799, "rewards/margins": 2.1629438400268555, "rewards/rejected": -6.079055309295654, "step": 890 }, { "epoch": 0.5819250551065394, "grad_norm": 17.46307037445428, "learning_rate": 8.895740675777126e-08, "logits/chosen": -1.0209726095199585, "logits/rejected": -1.0288503170013428, "logps/chosen": -938.6088256835938, "logps/rejected": -1139.0146484375, "loss": 0.3924, "rewards/accuracies": 0.9375, "rewards/chosen": -4.702238082885742, "rewards/margins": 2.0684337615966797, "rewards/rejected": -6.770671844482422, "step": 891 }, { "epoch": 0.5825781696465018, "grad_norm": 22.798831643296747, "learning_rate": 8.873068334846789e-08, "logits/chosen": -0.8137722015380859, "logits/rejected": -0.8584403395652771, "logps/chosen": -811.7307739257812, "logps/rejected": -1032.501953125, "loss": 0.4058, "rewards/accuracies": 0.78125, "rewards/chosen": -3.759009599685669, "rewards/margins": 1.7417285442352295, "rewards/rejected": -5.500738143920898, "step": 892 }, { "epoch": 0.5832312841864642, "grad_norm": 23.030536411916117, "learning_rate": 8.850401859743565e-08, "logits/chosen": -0.8594922423362732, "logits/rejected": -0.8479385375976562, "logps/chosen": -990.1287231445312, "logps/rejected": -1060.483642578125, "loss": 0.3786, "rewards/accuracies": 0.8125, "rewards/chosen": -4.541996479034424, "rewards/margins": 1.1046607494354248, "rewards/rejected": -5.646656513214111, "step": 893 }, { "epoch": 0.5838843987264266, "grad_norm": 32.01001320363752, "learning_rate": 8.82774136844943e-08, "logits/chosen": -0.620232343673706, "logits/rejected": -0.672800600528717, "logps/chosen": -959.5262451171875, "logps/rejected": -1130.987060546875, "loss": 0.4791, "rewards/accuracies": 0.8125, "rewards/chosen": -5.572679042816162, "rewards/margins": 1.3316614627838135, "rewards/rejected": -6.9043402671813965, "step": 894 }, { "epoch": 0.5845375132663891, "grad_norm": 17.256513174943777, "learning_rate": 8.805086978915213e-08, "logits/chosen": -0.8671358227729797, "logits/rejected": -0.9369436502456665, "logps/chosen": -885.9805297851562, "logps/rejected": -1093.339111328125, "loss": 0.3705, "rewards/accuracies": 0.9375, "rewards/chosen": -4.081684112548828, "rewards/margins": 2.167947292327881, "rewards/rejected": -6.249631881713867, "step": 895 }, { "epoch": 0.5851906278063516, "grad_norm": 31.92461284768268, "learning_rate": 8.78243880905998e-08, "logits/chosen": -0.7277215123176575, "logits/rejected": -0.8077613115310669, "logps/chosen": -877.326416015625, "logps/rejected": -1197.5010986328125, "loss": 0.3874, "rewards/accuracies": 0.9375, "rewards/chosen": -4.212238311767578, "rewards/margins": 2.691312789916992, "rewards/rejected": -6.90355110168457, "step": 896 }, { "epoch": 0.585843742346314, "grad_norm": 29.64600849223627, "learning_rate": 8.75979697677043e-08, "logits/chosen": -0.6099945902824402, "logits/rejected": -0.6647940874099731, "logps/chosen": -855.3746948242188, "logps/rejected": -1087.214599609375, "loss": 0.4806, "rewards/accuracies": 0.84375, "rewards/chosen": -4.427003860473633, "rewards/margins": 1.8464815616607666, "rewards/rejected": -6.27348518371582, "step": 897 }, { "epoch": 0.5864968568862764, "grad_norm": 22.474704857385515, "learning_rate": 8.737161599900265e-08, "logits/chosen": -0.8131806254386902, "logits/rejected": -0.715324878692627, "logps/chosen": -1068.5166015625, "logps/rejected": -1100.4036865234375, "loss": 0.4832, "rewards/accuracies": 0.65625, "rewards/chosen": -5.5754594802856445, "rewards/margins": 0.7880738377571106, "rewards/rejected": -6.3635334968566895, "step": 898 }, { "epoch": 0.5871499714262389, "grad_norm": 26.409266204419403, "learning_rate": 8.714532796269593e-08, "logits/chosen": -0.7550494074821472, "logits/rejected": -0.6975759863853455, "logps/chosen": -944.514404296875, "logps/rejected": -1085.607421875, "loss": 0.4551, "rewards/accuracies": 0.84375, "rewards/chosen": -5.10006046295166, "rewards/margins": 1.5463776588439941, "rewards/rejected": -6.646438121795654, "step": 899 }, { "epoch": 0.5878030859662013, "grad_norm": 19.847329227514567, "learning_rate": 8.69191068366431e-08, "logits/chosen": -0.8874033689498901, "logits/rejected": -0.8347741961479187, "logps/chosen": -916.2412109375, "logps/rejected": -1101.5413818359375, "loss": 0.4153, "rewards/accuracies": 0.75, "rewards/chosen": -4.8446855545043945, "rewards/margins": 1.8519930839538574, "rewards/rejected": -6.69667911529541, "step": 900 }, { "epoch": 0.5878030859662013, "eval_logits/chosen": -0.6474034786224365, "eval_logits/rejected": -0.5894980430603027, "eval_logps/chosen": -972.2631225585938, "eval_logps/rejected": -1118.36767578125, "eval_loss": 0.41075676679611206, "eval_rewards/accuracies": 0.796999990940094, "eval_rewards/chosen": -4.983559608459473, "eval_rewards/margins": 1.5484305620193481, "eval_rewards/rejected": -6.531990051269531, "eval_runtime": 615.3366, "eval_samples_per_second": 6.501, "eval_steps_per_second": 0.406, "step": 900 }, { "epoch": 0.5884562005061638, "grad_norm": 14.925997798234508, "learning_rate": 8.669295379835466e-08, "logits/chosen": -0.8691277503967285, "logits/rejected": -0.8496584296226501, "logps/chosen": -1043.7869873046875, "logps/rejected": -1212.6915283203125, "loss": 0.34, "rewards/accuracies": 0.875, "rewards/chosen": -5.405606746673584, "rewards/margins": 1.7067140340805054, "rewards/rejected": -7.112320899963379, "step": 901 }, { "epoch": 0.5891093150461262, "grad_norm": 25.753626241410412, "learning_rate": 8.646687002498691e-08, "logits/chosen": -0.9647932052612305, "logits/rejected": -0.9126352667808533, "logps/chosen": -969.6204833984375, "logps/rejected": -1194.13623046875, "loss": 0.3428, "rewards/accuracies": 0.84375, "rewards/chosen": -4.46538782119751, "rewards/margins": 2.263078212738037, "rewards/rejected": -6.728465557098389, "step": 902 }, { "epoch": 0.5897624295860887, "grad_norm": 17.703375309858618, "learning_rate": 8.624085669333552e-08, "logits/chosen": -0.8007317781448364, "logits/rejected": -0.7886674404144287, "logps/chosen": -835.8764038085938, "logps/rejected": -917.6867065429688, "loss": 0.3663, "rewards/accuracies": 0.75, "rewards/chosen": -4.808894157409668, "rewards/margins": 0.7904772162437439, "rewards/rejected": -5.599371433258057, "step": 903 }, { "epoch": 0.5904155441260511, "grad_norm": 23.835014759230322, "learning_rate": 8.601491497982954e-08, "logits/chosen": -0.9031449556350708, "logits/rejected": -0.7340150475502014, "logps/chosen": -1016.9129638671875, "logps/rejected": -1168.0784912109375, "loss": 0.4244, "rewards/accuracies": 0.84375, "rewards/chosen": -4.812582492828369, "rewards/margins": 1.9187612533569336, "rewards/rejected": -6.731344223022461, "step": 904 }, { "epoch": 0.5910686586660135, "grad_norm": 46.41920197800403, "learning_rate": 8.578904606052524e-08, "logits/chosen": -0.8163060545921326, "logits/rejected": -0.7480596899986267, "logps/chosen": -867.8343505859375, "logps/rejected": -1006.3372192382812, "loss": 0.4485, "rewards/accuracies": 0.78125, "rewards/chosen": -4.025778770446777, "rewards/margins": 1.435265064239502, "rewards/rejected": -5.461042881011963, "step": 905 }, { "epoch": 0.591721773205976, "grad_norm": 22.00085431680691, "learning_rate": 8.556325111109993e-08, "logits/chosen": -0.8558377027511597, "logits/rejected": -0.7511030435562134, "logps/chosen": -935.3119506835938, "logps/rejected": -1052.9794921875, "loss": 0.3499, "rewards/accuracies": 0.90625, "rewards/chosen": -3.992286205291748, "rewards/margins": 1.536756157875061, "rewards/rejected": -5.5290422439575195, "step": 906 }, { "epoch": 0.5923748877459385, "grad_norm": 16.976980001349833, "learning_rate": 8.533753130684596e-08, "logits/chosen": -0.8626459240913391, "logits/rejected": -0.8605834245681763, "logps/chosen": -867.1327514648438, "logps/rejected": -1003.730712890625, "loss": 0.419, "rewards/accuracies": 0.90625, "rewards/chosen": -4.049777984619141, "rewards/margins": 1.4001483917236328, "rewards/rejected": -5.449926376342773, "step": 907 }, { "epoch": 0.5930280022859009, "grad_norm": 20.35692870029299, "learning_rate": 8.51118878226645e-08, "logits/chosen": -0.8889980316162109, "logits/rejected": -0.9109795093536377, "logps/chosen": -891.5820922851562, "logps/rejected": -1061.819580078125, "loss": 0.3608, "rewards/accuracies": 0.84375, "rewards/chosen": -4.176605701446533, "rewards/margins": 1.7289620637893677, "rewards/rejected": -5.9055681228637695, "step": 908 }, { "epoch": 0.5936811168258633, "grad_norm": 30.804138303718446, "learning_rate": 8.488632183305945e-08, "logits/chosen": -0.9131035804748535, "logits/rejected": -0.8490291237831116, "logps/chosen": -907.41015625, "logps/rejected": -1025.7835693359375, "loss": 0.4051, "rewards/accuracies": 0.90625, "rewards/chosen": -4.207481384277344, "rewards/margins": 1.614747166633606, "rewards/rejected": -5.822227954864502, "step": 909 }, { "epoch": 0.5943342313658257, "grad_norm": 23.681532592721254, "learning_rate": 8.466083451213145e-08, "logits/chosen": -0.7877532839775085, "logits/rejected": -0.632519006729126, "logps/chosen": -966.0323486328125, "logps/rejected": -1077.8875732421875, "loss": 0.4482, "rewards/accuracies": 0.6875, "rewards/chosen": -4.879648685455322, "rewards/margins": 1.4281139373779297, "rewards/rejected": -6.307762145996094, "step": 910 }, { "epoch": 0.5949873459057883, "grad_norm": 21.37738164632679, "learning_rate": 8.443542703357154e-08, "logits/chosen": -0.9703052639961243, "logits/rejected": -0.9703279733657837, "logps/chosen": -891.2740478515625, "logps/rejected": -1010.065673828125, "loss": 0.3599, "rewards/accuracies": 0.90625, "rewards/chosen": -3.771116018295288, "rewards/margins": 1.7163093090057373, "rewards/rejected": -5.487425327301025, "step": 911 }, { "epoch": 0.5956404604457507, "grad_norm": 21.36892588322395, "learning_rate": 8.421010057065517e-08, "logits/chosen": -0.8273603320121765, "logits/rejected": -0.6571228504180908, "logps/chosen": -909.6326904296875, "logps/rejected": -1141.2235107421875, "loss": 0.359, "rewards/accuracies": 0.84375, "rewards/chosen": -4.4580841064453125, "rewards/margins": 2.422499179840088, "rewards/rejected": -6.880582809448242, "step": 912 }, { "epoch": 0.5962935749857131, "grad_norm": 13.636371400634598, "learning_rate": 8.398485629623613e-08, "logits/chosen": -0.9461863040924072, "logits/rejected": -0.9167419075965881, "logps/chosen": -945.4471435546875, "logps/rejected": -1142.0771484375, "loss": 0.365, "rewards/accuracies": 0.90625, "rewards/chosen": -4.211487293243408, "rewards/margins": 2.108157157897949, "rewards/rejected": -6.319644451141357, "step": 913 }, { "epoch": 0.5969466895256755, "grad_norm": 19.50909886317978, "learning_rate": 8.375969538274046e-08, "logits/chosen": -0.9539211988449097, "logits/rejected": -0.881746768951416, "logps/chosen": -859.5111083984375, "logps/rejected": -1007.7662353515625, "loss": 0.4434, "rewards/accuracies": 0.71875, "rewards/chosen": -4.13993501663208, "rewards/margins": 1.4645112752914429, "rewards/rejected": -5.604445457458496, "step": 914 }, { "epoch": 0.5975998040656381, "grad_norm": 13.53092257072935, "learning_rate": 8.35346190021602e-08, "logits/chosen": -0.8023240566253662, "logits/rejected": -0.6279884576797485, "logps/chosen": -903.838623046875, "logps/rejected": -1045.2021484375, "loss": 0.4114, "rewards/accuracies": 0.8125, "rewards/chosen": -4.067113876342773, "rewards/margins": 1.7872202396392822, "rewards/rejected": -5.854333877563477, "step": 915 }, { "epoch": 0.5982529186056005, "grad_norm": 32.33399522874252, "learning_rate": 8.330962832604746e-08, "logits/chosen": -0.7911917567253113, "logits/rejected": -0.8615321516990662, "logps/chosen": -974.4556884765625, "logps/rejected": -1305.1280517578125, "loss": 0.3299, "rewards/accuracies": 0.78125, "rewards/chosen": -4.3269877433776855, "rewards/margins": 2.0870394706726074, "rewards/rejected": -6.414027214050293, "step": 916 }, { "epoch": 0.5989060331455629, "grad_norm": 19.159991853562595, "learning_rate": 8.308472452550821e-08, "logits/chosen": -0.7687807083129883, "logits/rejected": -0.5947264432907104, "logps/chosen": -989.5430908203125, "logps/rejected": -1158.9034423828125, "loss": 0.3795, "rewards/accuracies": 0.78125, "rewards/chosen": -5.157406806945801, "rewards/margins": 2.0121400356292725, "rewards/rejected": -7.169547080993652, "step": 917 }, { "epoch": 0.5995591476855253, "grad_norm": 16.7182763656914, "learning_rate": 8.285990877119621e-08, "logits/chosen": -0.833372950553894, "logits/rejected": -0.8309823274612427, "logps/chosen": -887.657958984375, "logps/rejected": -1025.5283203125, "loss": 0.4022, "rewards/accuracies": 0.78125, "rewards/chosen": -4.440640926361084, "rewards/margins": 1.293958067893982, "rewards/rejected": -5.7345991134643555, "step": 918 }, { "epoch": 0.6002122622254878, "grad_norm": 17.517071068178463, "learning_rate": 8.263518223330696e-08, "logits/chosen": -0.9249385595321655, "logits/rejected": -0.8931978940963745, "logps/chosen": -952.4149780273438, "logps/rejected": -1138.89453125, "loss": 0.3452, "rewards/accuracies": 0.96875, "rewards/chosen": -4.342203617095947, "rewards/margins": 1.7767517566680908, "rewards/rejected": -6.118955612182617, "step": 919 }, { "epoch": 0.6008653767654503, "grad_norm": 14.896081107369003, "learning_rate": 8.241054608157157e-08, "logits/chosen": -0.7339697480201721, "logits/rejected": -0.7770588994026184, "logps/chosen": -865.361083984375, "logps/rejected": -1034.2215576171875, "loss": 0.3847, "rewards/accuracies": 0.84375, "rewards/chosen": -4.295354843139648, "rewards/margins": 1.5795997381210327, "rewards/rejected": -5.874954700469971, "step": 920 }, { "epoch": 0.6015184913054127, "grad_norm": 22.1021955531778, "learning_rate": 8.218600148525065e-08, "logits/chosen": -0.6869848966598511, "logits/rejected": -0.7026206254959106, "logps/chosen": -830.2351684570312, "logps/rejected": -978.03564453125, "loss": 0.4343, "rewards/accuracies": 0.90625, "rewards/chosen": -4.30990743637085, "rewards/margins": 1.3018380403518677, "rewards/rejected": -5.611745357513428, "step": 921 }, { "epoch": 0.6021716058453751, "grad_norm": 15.381420621498501, "learning_rate": 8.19615496131283e-08, "logits/chosen": -0.9402379989624023, "logits/rejected": -0.8162367343902588, "logps/chosen": -948.3676147460938, "logps/rejected": -1116.4700927734375, "loss": 0.3934, "rewards/accuracies": 1.0, "rewards/chosen": -4.638570308685303, "rewards/margins": 1.7800812721252441, "rewards/rejected": -6.418651103973389, "step": 922 }, { "epoch": 0.6028247203853376, "grad_norm": 18.845537370733666, "learning_rate": 8.173719163350594e-08, "logits/chosen": -0.8794003129005432, "logits/rejected": -0.8930718898773193, "logps/chosen": -948.6639404296875, "logps/rejected": -1154.9439697265625, "loss": 0.4172, "rewards/accuracies": 0.78125, "rewards/chosen": -4.331446647644043, "rewards/margins": 1.726813554763794, "rewards/rejected": -6.058260440826416, "step": 923 }, { "epoch": 0.6034778349253, "grad_norm": 34.811506098824175, "learning_rate": 8.151292871419626e-08, "logits/chosen": -0.7502395510673523, "logits/rejected": -0.7458022832870483, "logps/chosen": -881.787841796875, "logps/rejected": -1068.427734375, "loss": 0.364, "rewards/accuracies": 0.84375, "rewards/chosen": -4.579074382781982, "rewards/margins": 1.974166989326477, "rewards/rejected": -6.553241729736328, "step": 924 }, { "epoch": 0.6041309494652625, "grad_norm": 19.25584833056271, "learning_rate": 8.128876202251717e-08, "logits/chosen": -1.001344084739685, "logits/rejected": -0.937218427658081, "logps/chosen": -830.0806274414062, "logps/rejected": -987.154052734375, "loss": 0.3544, "rewards/accuracies": 0.875, "rewards/chosen": -4.078459739685059, "rewards/margins": 1.5838062763214111, "rewards/rejected": -5.662266254425049, "step": 925 }, { "epoch": 0.6047840640052249, "grad_norm": 17.416222678203805, "learning_rate": 8.106469272528572e-08, "logits/chosen": -1.023118257522583, "logits/rejected": -0.8872838616371155, "logps/chosen": -935.068115234375, "logps/rejected": -1081.85595703125, "loss": 0.3868, "rewards/accuracies": 0.90625, "rewards/chosen": -4.559538841247559, "rewards/margins": 1.8961910009384155, "rewards/rejected": -6.4557294845581055, "step": 926 }, { "epoch": 0.6054371785451874, "grad_norm": 15.520574577955848, "learning_rate": 8.084072198881198e-08, "logits/chosen": -0.8037633299827576, "logits/rejected": -0.6210764050483704, "logps/chosen": -917.8544921875, "logps/rejected": -1018.9097900390625, "loss": 0.4153, "rewards/accuracies": 0.78125, "rewards/chosen": -4.491122245788574, "rewards/margins": 1.4294805526733398, "rewards/rejected": -5.920601844787598, "step": 927 }, { "epoch": 0.6060902930851498, "grad_norm": 28.746922467088186, "learning_rate": 8.061685097889299e-08, "logits/chosen": -0.9267802238464355, "logits/rejected": -0.788466215133667, "logps/chosen": -1069.5301513671875, "logps/rejected": -1141.6458740234375, "loss": 0.4574, "rewards/accuracies": 0.75, "rewards/chosen": -5.574659824371338, "rewards/margins": 1.0332306623458862, "rewards/rejected": -6.607890605926514, "step": 928 }, { "epoch": 0.6067434076251123, "grad_norm": 14.23380808630684, "learning_rate": 8.039308086080674e-08, "logits/chosen": -0.9022722244262695, "logits/rejected": -0.8536574244499207, "logps/chosen": -1080.463623046875, "logps/rejected": -1230.5780029296875, "loss": 0.3791, "rewards/accuracies": 0.71875, "rewards/chosen": -5.848098278045654, "rewards/margins": 1.53517746925354, "rewards/rejected": -7.383275985717773, "step": 929 }, { "epoch": 0.6073965221650747, "grad_norm": 17.403206785758336, "learning_rate": 8.016941279930605e-08, "logits/chosen": -0.919601321220398, "logits/rejected": -0.9383385181427002, "logps/chosen": -880.8705444335938, "logps/rejected": -1069.1361083984375, "loss": 0.363, "rewards/accuracies": 0.875, "rewards/chosen": -4.793572425842285, "rewards/margins": 1.5130126476287842, "rewards/rejected": -6.306585311889648, "step": 930 }, { "epoch": 0.6080496367050372, "grad_norm": 18.009932005684178, "learning_rate": 7.994584795861247e-08, "logits/chosen": -0.7678056359291077, "logits/rejected": -0.7451735138893127, "logps/chosen": -1034.4462890625, "logps/rejected": -1221.044677734375, "loss": 0.3192, "rewards/accuracies": 0.9375, "rewards/chosen": -5.145386219024658, "rewards/margins": 2.1374313831329346, "rewards/rejected": -7.282817363739014, "step": 931 }, { "epoch": 0.6087027512449996, "grad_norm": 49.34545548945588, "learning_rate": 7.972238750241036e-08, "logits/chosen": -0.7207657694816589, "logits/rejected": -0.7844063639640808, "logps/chosen": -1049.498779296875, "logps/rejected": -1199.9368896484375, "loss": 0.6142, "rewards/accuracies": 0.6875, "rewards/chosen": -5.626737594604492, "rewards/margins": 1.12619948387146, "rewards/rejected": -6.752936840057373, "step": 932 }, { "epoch": 0.609355865784962, "grad_norm": 18.136351511937786, "learning_rate": 7.949903259384068e-08, "logits/chosen": -0.9160058498382568, "logits/rejected": -0.951209545135498, "logps/chosen": -978.7283325195312, "logps/rejected": -1158.1009521484375, "loss": 0.4027, "rewards/accuracies": 0.875, "rewards/chosen": -4.956260681152344, "rewards/margins": 1.780358076095581, "rewards/rejected": -6.736618995666504, "step": 933 }, { "epoch": 0.6100089803249245, "grad_norm": 14.171668974961465, "learning_rate": 7.927578439549506e-08, "logits/chosen": -0.8602361083030701, "logits/rejected": -0.8363439440727234, "logps/chosen": -1078.1002197265625, "logps/rejected": -1262.086669921875, "loss": 0.3364, "rewards/accuracies": 0.84375, "rewards/chosen": -5.762195110321045, "rewards/margins": 1.9282588958740234, "rewards/rejected": -7.690454483032227, "step": 934 }, { "epoch": 0.610662094864887, "grad_norm": 21.95332133574316, "learning_rate": 7.905264406940959e-08, "logits/chosen": -0.814610481262207, "logits/rejected": -0.8668062090873718, "logps/chosen": -996.294921875, "logps/rejected": -1317.03955078125, "loss": 0.4424, "rewards/accuracies": 0.84375, "rewards/chosen": -5.210824966430664, "rewards/margins": 2.820859432220459, "rewards/rejected": -8.031684875488281, "step": 935 }, { "epoch": 0.6113152094048494, "grad_norm": 23.10823799153079, "learning_rate": 7.882961277705895e-08, "logits/chosen": -0.6385756134986877, "logits/rejected": -0.6616299152374268, "logps/chosen": -852.9818115234375, "logps/rejected": -1010.8091430664062, "loss": 0.4107, "rewards/accuracies": 0.75, "rewards/chosen": -4.4981513023376465, "rewards/margins": 1.3144606351852417, "rewards/rejected": -5.8126115798950195, "step": 936 }, { "epoch": 0.6119683239448118, "grad_norm": 13.356378969333702, "learning_rate": 7.860669167935028e-08, "logits/chosen": -0.6723610758781433, "logits/rejected": -0.5586845874786377, "logps/chosen": -911.2410278320312, "logps/rejected": -1077.91552734375, "loss": 0.3322, "rewards/accuracies": 0.875, "rewards/chosen": -4.328841209411621, "rewards/margins": 1.6360496282577515, "rewards/rejected": -5.964890003204346, "step": 937 }, { "epoch": 0.6126214384847742, "grad_norm": 17.64088440541406, "learning_rate": 7.838388193661711e-08, "logits/chosen": -1.0395886898040771, "logits/rejected": -0.9543710947036743, "logps/chosen": -994.0137329101562, "logps/rejected": -1132.4892578125, "loss": 0.4228, "rewards/accuracies": 0.84375, "rewards/chosen": -5.503838062286377, "rewards/margins": 1.5002458095550537, "rewards/rejected": -7.004083156585693, "step": 938 }, { "epoch": 0.6132745530247368, "grad_norm": 41.76222226846645, "learning_rate": 7.816118470861342e-08, "logits/chosen": -0.7007467150688171, "logits/rejected": -0.7458237409591675, "logps/chosen": -859.9625244140625, "logps/rejected": -1088.14013671875, "loss": 0.3553, "rewards/accuracies": 0.8125, "rewards/chosen": -4.435876369476318, "rewards/margins": 1.9407938718795776, "rewards/rejected": -6.376670837402344, "step": 939 }, { "epoch": 0.6139276675646992, "grad_norm": 16.41466968357761, "learning_rate": 7.793860115450743e-08, "logits/chosen": -0.8894115686416626, "logits/rejected": -0.7774127125740051, "logps/chosen": -998.6695556640625, "logps/rejected": -1176.7144775390625, "loss": 0.4081, "rewards/accuracies": 0.84375, "rewards/chosen": -5.242660999298096, "rewards/margins": 1.6881160736083984, "rewards/rejected": -6.930777072906494, "step": 940 }, { "epoch": 0.6145807821046616, "grad_norm": 28.81321193957898, "learning_rate": 7.771613243287573e-08, "logits/chosen": -0.8349412083625793, "logits/rejected": -0.6980231404304504, "logps/chosen": -1037.77783203125, "logps/rejected": -1159.787353515625, "loss": 0.5375, "rewards/accuracies": 0.78125, "rewards/chosen": -5.490978240966797, "rewards/margins": 1.5088858604431152, "rewards/rejected": -6.99986457824707, "step": 941 }, { "epoch": 0.615233896644624, "grad_norm": 15.47414082361199, "learning_rate": 7.749377970169726e-08, "logits/chosen": -0.7294608354568481, "logits/rejected": -0.8300274014472961, "logps/chosen": -881.7943725585938, "logps/rejected": -1087.314208984375, "loss": 0.3835, "rewards/accuracies": 0.90625, "rewards/chosen": -4.590490818023682, "rewards/margins": 1.7875473499298096, "rewards/rejected": -6.37803840637207, "step": 942 }, { "epoch": 0.6158870111845864, "grad_norm": 19.63090785781211, "learning_rate": 7.72715441183471e-08, "logits/chosen": -0.8396845459938049, "logits/rejected": -0.8975608348846436, "logps/chosen": -935.793701171875, "logps/rejected": -1181.52392578125, "loss": 0.4221, "rewards/accuracies": 0.84375, "rewards/chosen": -4.497322082519531, "rewards/margins": 2.063370704650879, "rewards/rejected": -6.56069278717041, "step": 943 }, { "epoch": 0.616540125724549, "grad_norm": 19.55448604762308, "learning_rate": 7.704942683959061e-08, "logits/chosen": -0.7600312829017639, "logits/rejected": -0.6722344160079956, "logps/chosen": -876.5004272460938, "logps/rejected": -1015.899169921875, "loss": 0.4024, "rewards/accuracies": 0.875, "rewards/chosen": -3.6736412048339844, "rewards/margins": 1.7954384088516235, "rewards/rejected": -5.469079494476318, "step": 944 }, { "epoch": 0.6171932402645114, "grad_norm": 22.577034429164247, "learning_rate": 7.682742902157742e-08, "logits/chosen": -0.8105038404464722, "logits/rejected": -0.8246266841888428, "logps/chosen": -981.1633911132812, "logps/rejected": -1142.673828125, "loss": 0.3782, "rewards/accuracies": 0.84375, "rewards/chosen": -5.0762176513671875, "rewards/margins": 1.5619633197784424, "rewards/rejected": -6.638180732727051, "step": 945 }, { "epoch": 0.6178463548044738, "grad_norm": 16.850655460651723, "learning_rate": 7.660555181983517e-08, "logits/chosen": -0.8345729112625122, "logits/rejected": -0.8672900199890137, "logps/chosen": -996.8236083984375, "logps/rejected": -1235.411865234375, "loss": 0.34, "rewards/accuracies": 0.8125, "rewards/chosen": -4.766516208648682, "rewards/margins": 1.9924037456512451, "rewards/rejected": -6.758920669555664, "step": 946 }, { "epoch": 0.6184994693444362, "grad_norm": 24.42884479602425, "learning_rate": 7.638379638926384e-08, "logits/chosen": -0.9110671281814575, "logits/rejected": -0.9710690975189209, "logps/chosen": -972.607177734375, "logps/rejected": -1323.8133544921875, "loss": 0.355, "rewards/accuracies": 0.9375, "rewards/chosen": -5.0181565284729, "rewards/margins": 3.4316680431365967, "rewards/rejected": -8.449824333190918, "step": 947 }, { "epoch": 0.6191525838843988, "grad_norm": 32.661617699447575, "learning_rate": 7.616216388412956e-08, "logits/chosen": -0.9173276424407959, "logits/rejected": -0.784736692905426, "logps/chosen": -906.2161865234375, "logps/rejected": -1020.6433715820312, "loss": 0.4303, "rewards/accuracies": 0.78125, "rewards/chosen": -4.6049885749816895, "rewards/margins": 1.6103415489196777, "rewards/rejected": -6.215329647064209, "step": 948 }, { "epoch": 0.6198056984243612, "grad_norm": 22.777809128273034, "learning_rate": 7.594065545805857e-08, "logits/chosen": -0.6866058707237244, "logits/rejected": -0.8154290318489075, "logps/chosen": -895.670654296875, "logps/rejected": -1135.50048828125, "loss": 0.4758, "rewards/accuracies": 0.90625, "rewards/chosen": -4.395349979400635, "rewards/margins": 1.9171158075332642, "rewards/rejected": -6.312465667724609, "step": 949 }, { "epoch": 0.6204588129643236, "grad_norm": 23.234837418589642, "learning_rate": 7.571927226403126e-08, "logits/chosen": -0.749639093875885, "logits/rejected": -0.7934308052062988, "logps/chosen": -928.8316040039062, "logps/rejected": -1088.88525390625, "loss": 0.3238, "rewards/accuracies": 0.8125, "rewards/chosen": -4.648177146911621, "rewards/margins": 1.5695174932479858, "rewards/rejected": -6.217695236206055, "step": 950 }, { "epoch": 0.621111927504286, "grad_norm": 38.340388409455954, "learning_rate": 7.549801545437621e-08, "logits/chosen": -0.7972779870033264, "logits/rejected": -0.8462767004966736, "logps/chosen": -829.4715576171875, "logps/rejected": -1028.169921875, "loss": 0.3906, "rewards/accuracies": 0.78125, "rewards/chosen": -4.178791046142578, "rewards/margins": 1.6456745862960815, "rewards/rejected": -5.824464797973633, "step": 951 }, { "epoch": 0.6217650420442485, "grad_norm": 17.06008923511345, "learning_rate": 7.527688618076413e-08, "logits/chosen": -0.8404926657676697, "logits/rejected": -0.8594713807106018, "logps/chosen": -826.0974731445312, "logps/rejected": -1009.733642578125, "loss": 0.4399, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9912056922912598, "rewards/margins": 1.581747055053711, "rewards/rejected": -5.5729522705078125, "step": 952 }, { "epoch": 0.622418156584211, "grad_norm": 26.890220915174563, "learning_rate": 7.505588559420187e-08, "logits/chosen": -0.8127776384353638, "logits/rejected": -0.8098663091659546, "logps/chosen": -986.1820678710938, "logps/rejected": -1091.328369140625, "loss": 0.3669, "rewards/accuracies": 0.71875, "rewards/chosen": -5.3212432861328125, "rewards/margins": 1.011331558227539, "rewards/rejected": -6.332574844360352, "step": 953 }, { "epoch": 0.6230712711241734, "grad_norm": 16.457932917397315, "learning_rate": 7.48350148450265e-08, "logits/chosen": -0.9597660303115845, "logits/rejected": -0.8716091513633728, "logps/chosen": -1003.8367919921875, "logps/rejected": -1141.11669921875, "loss": 0.3672, "rewards/accuracies": 0.8125, "rewards/chosen": -5.11099910736084, "rewards/margins": 1.655037522315979, "rewards/rejected": -6.7660369873046875, "step": 954 }, { "epoch": 0.6237243856641358, "grad_norm": 36.48894968020559, "learning_rate": 7.461427508289921e-08, "logits/chosen": -0.8966237306594849, "logits/rejected": -0.7957350015640259, "logps/chosen": -832.1514892578125, "logps/rejected": -917.5252075195312, "loss": 0.3699, "rewards/accuracies": 0.90625, "rewards/chosen": -3.9978244304656982, "rewards/margins": 1.4576671123504639, "rewards/rejected": -5.455491065979004, "step": 955 }, { "epoch": 0.6243775002040983, "grad_norm": 19.6017580958855, "learning_rate": 7.439366745679942e-08, "logits/chosen": -0.8043563365936279, "logits/rejected": -0.8075878620147705, "logps/chosen": -941.8458251953125, "logps/rejected": -1092.9459228515625, "loss": 0.3917, "rewards/accuracies": 0.84375, "rewards/chosen": -4.635620594024658, "rewards/margins": 1.3564602136611938, "rewards/rejected": -5.992081165313721, "step": 956 }, { "epoch": 0.6250306147440607, "grad_norm": 25.477482249295566, "learning_rate": 7.417319311501879e-08, "logits/chosen": -0.7736403942108154, "logits/rejected": -0.7841091752052307, "logps/chosen": -850.4437255859375, "logps/rejected": -1102.8319091796875, "loss": 0.4188, "rewards/accuracies": 0.8125, "rewards/chosen": -4.24493408203125, "rewards/margins": 1.8479266166687012, "rewards/rejected": -6.092860221862793, "step": 957 }, { "epoch": 0.6256837292840232, "grad_norm": 22.907968350007085, "learning_rate": 7.395285320515512e-08, "logits/chosen": -0.8162537217140198, "logits/rejected": -0.8792285919189453, "logps/chosen": -913.6036987304688, "logps/rejected": -1090.5911865234375, "loss": 0.3582, "rewards/accuracies": 0.78125, "rewards/chosen": -4.4979424476623535, "rewards/margins": 1.1419354677200317, "rewards/rejected": -5.639878273010254, "step": 958 }, { "epoch": 0.6263368438239856, "grad_norm": 31.225301502154736, "learning_rate": 7.373264887410656e-08, "logits/chosen": -0.6841857433319092, "logits/rejected": -0.7074832320213318, "logps/chosen": -835.9268798828125, "logps/rejected": -1149.3931884765625, "loss": 0.3721, "rewards/accuracies": 0.71875, "rewards/chosen": -3.8488359451293945, "rewards/margins": 2.7709360122680664, "rewards/rejected": -6.619771957397461, "step": 959 }, { "epoch": 0.6269899583639481, "grad_norm": 25.565009032820456, "learning_rate": 7.351258126806555e-08, "logits/chosen": -0.7442363500595093, "logits/rejected": -0.754153847694397, "logps/chosen": -1003.4380493164062, "logps/rejected": -1236.912841796875, "loss": 0.3945, "rewards/accuracies": 0.84375, "rewards/chosen": -5.113544940948486, "rewards/margins": 2.3746109008789062, "rewards/rejected": -7.488155364990234, "step": 960 }, { "epoch": 0.6276430729039105, "grad_norm": 20.213696097403353, "learning_rate": 7.329265153251284e-08, "logits/chosen": -0.9148849844932556, "logits/rejected": -0.8240453600883484, "logps/chosen": -995.921630859375, "logps/rejected": -1149.3841552734375, "loss": 0.3667, "rewards/accuracies": 0.8125, "rewards/chosen": -5.002110481262207, "rewards/margins": 1.7089855670928955, "rewards/rejected": -6.711096286773682, "step": 961 }, { "epoch": 0.6282961874438729, "grad_norm": 21.830265430877628, "learning_rate": 7.307286081221153e-08, "logits/chosen": -0.8627299070358276, "logits/rejected": -0.879044771194458, "logps/chosen": -893.7263793945312, "logps/rejected": -1017.5081787109375, "loss": 0.3958, "rewards/accuracies": 0.84375, "rewards/chosen": -4.613065242767334, "rewards/margins": 1.1169685125350952, "rewards/rejected": -5.730034351348877, "step": 962 }, { "epoch": 0.6289493019838354, "grad_norm": 21.34761586639017, "learning_rate": 7.285321025120116e-08, "logits/chosen": -1.028733253479004, "logits/rejected": -0.9963964223861694, "logps/chosen": -1049.7506103515625, "logps/rejected": -1225.5938720703125, "loss": 0.412, "rewards/accuracies": 0.84375, "rewards/chosen": -5.158174514770508, "rewards/margins": 1.789759635925293, "rewards/rejected": -6.947933673858643, "step": 963 }, { "epoch": 0.6296024165237979, "grad_norm": 25.298717035276482, "learning_rate": 7.263370099279171e-08, "logits/chosen": -0.9898463487625122, "logits/rejected": -0.9164149761199951, "logps/chosen": -857.2523193359375, "logps/rejected": -1091.1043701171875, "loss": 0.4432, "rewards/accuracies": 0.78125, "rewards/chosen": -4.275208473205566, "rewards/margins": 2.2709662914276123, "rewards/rejected": -6.546175956726074, "step": 964 }, { "epoch": 0.6302555310637603, "grad_norm": 21.913629938148368, "learning_rate": 7.241433417955764e-08, "logits/chosen": -0.8013566732406616, "logits/rejected": -0.7411233186721802, "logps/chosen": -979.8931884765625, "logps/rejected": -1252.721923828125, "loss": 0.3631, "rewards/accuracies": 0.84375, "rewards/chosen": -5.143740653991699, "rewards/margins": 2.6635663509368896, "rewards/rejected": -7.807306289672852, "step": 965 }, { "epoch": 0.6309086456037227, "grad_norm": 23.61954289841313, "learning_rate": 7.219511095333199e-08, "logits/chosen": -0.9983224272727966, "logits/rejected": -0.8240174651145935, "logps/chosen": -971.7218017578125, "logps/rejected": -1114.72509765625, "loss": 0.4529, "rewards/accuracies": 0.78125, "rewards/chosen": -4.7123494148254395, "rewards/margins": 1.8094121217727661, "rewards/rejected": -6.521760940551758, "step": 966 }, { "epoch": 0.6315617601436851, "grad_norm": 23.349877754997273, "learning_rate": 7.197603245520041e-08, "logits/chosen": -0.6674496531486511, "logits/rejected": -0.654395341873169, "logps/chosen": -986.275146484375, "logps/rejected": -1218.1419677734375, "loss": 0.4591, "rewards/accuracies": 0.90625, "rewards/chosen": -5.584812641143799, "rewards/margins": 2.1112468242645264, "rewards/rejected": -7.69605827331543, "step": 967 }, { "epoch": 0.6322148746836477, "grad_norm": 19.46883241188175, "learning_rate": 7.175709982549524e-08, "logits/chosen": -0.8957158327102661, "logits/rejected": -0.8326559066772461, "logps/chosen": -923.7268676757812, "logps/rejected": -1083.7965087890625, "loss": 0.3943, "rewards/accuracies": 0.84375, "rewards/chosen": -4.729306221008301, "rewards/margins": 1.6854844093322754, "rewards/rejected": -6.414790630340576, "step": 968 }, { "epoch": 0.6328679892236101, "grad_norm": 22.593182793628216, "learning_rate": 7.153831420378949e-08, "logits/chosen": -0.9131090044975281, "logits/rejected": -0.8579813241958618, "logps/chosen": -1036.234130859375, "logps/rejected": -1180.869140625, "loss": 0.3677, "rewards/accuracies": 0.84375, "rewards/chosen": -4.513719081878662, "rewards/margins": 2.031284809112549, "rewards/rejected": -6.545003414154053, "step": 969 }, { "epoch": 0.6335211037635725, "grad_norm": 30.292860588793435, "learning_rate": 7.1319676728891e-08, "logits/chosen": -0.9997818470001221, "logits/rejected": -0.9115554094314575, "logps/chosen": -881.9102783203125, "logps/rejected": -1037.227783203125, "loss": 0.455, "rewards/accuracies": 0.90625, "rewards/chosen": -4.3664350509643555, "rewards/margins": 1.9524322748184204, "rewards/rejected": -6.3188676834106445, "step": 970 }, { "epoch": 0.6341742183035349, "grad_norm": 25.7144274108187, "learning_rate": 7.110118853883653e-08, "logits/chosen": -0.9433892965316772, "logits/rejected": -0.878379225730896, "logps/chosen": -957.4013061523438, "logps/rejected": -1089.672607421875, "loss": 0.4026, "rewards/accuracies": 0.75, "rewards/chosen": -4.62063455581665, "rewards/margins": 1.3547346591949463, "rewards/rejected": -5.975369453430176, "step": 971 }, { "epoch": 0.6348273328434975, "grad_norm": 14.06727763907239, "learning_rate": 7.088285077088576e-08, "logits/chosen": -0.7802984714508057, "logits/rejected": -0.7867769598960876, "logps/chosen": -916.9561767578125, "logps/rejected": -1012.1802978515625, "loss": 0.3995, "rewards/accuracies": 0.78125, "rewards/chosen": -4.424203395843506, "rewards/margins": 1.1160019636154175, "rewards/rejected": -5.540205478668213, "step": 972 }, { "epoch": 0.6354804473834599, "grad_norm": 17.405294616197406, "learning_rate": 7.06646645615154e-08, "logits/chosen": -0.8531831502914429, "logits/rejected": -0.7872010469436646, "logps/chosen": -914.7217407226562, "logps/rejected": -1074.9007568359375, "loss": 0.3495, "rewards/accuracies": 0.875, "rewards/chosen": -4.552631855010986, "rewards/margins": 1.3901065587997437, "rewards/rejected": -5.942738056182861, "step": 973 }, { "epoch": 0.6361335619234223, "grad_norm": 44.55104973891953, "learning_rate": 7.044663104641331e-08, "logits/chosen": -0.7868108153343201, "logits/rejected": -0.807988166809082, "logps/chosen": -941.859375, "logps/rejected": -1273.0975341796875, "loss": 0.3725, "rewards/accuracies": 0.96875, "rewards/chosen": -4.062593936920166, "rewards/margins": 2.6936287879943848, "rewards/rejected": -6.756222724914551, "step": 974 }, { "epoch": 0.6367866764633847, "grad_norm": 14.671664046221297, "learning_rate": 7.02287513604725e-08, "logits/chosen": -0.7551882863044739, "logits/rejected": -0.7513316869735718, "logps/chosen": -807.0482177734375, "logps/rejected": -928.9945068359375, "loss": 0.4005, "rewards/accuracies": 0.78125, "rewards/chosen": -4.388478755950928, "rewards/margins": 1.168333649635315, "rewards/rejected": -5.556812286376953, "step": 975 }, { "epoch": 0.6374397910033472, "grad_norm": 20.1405977104016, "learning_rate": 7.001102663778532e-08, "logits/chosen": -0.7532812356948853, "logits/rejected": -0.7765889167785645, "logps/chosen": -839.454345703125, "logps/rejected": -991.3602294921875, "loss": 0.3791, "rewards/accuracies": 0.8125, "rewards/chosen": -3.9657483100891113, "rewards/margins": 1.6194977760314941, "rewards/rejected": -5.585245609283447, "step": 976 }, { "epoch": 0.6380929055433097, "grad_norm": 16.410114359901613, "learning_rate": 6.979345801163751e-08, "logits/chosen": -0.7125265002250671, "logits/rejected": -0.7120476365089417, "logps/chosen": -922.1001586914062, "logps/rejected": -1161.8427734375, "loss": 0.2998, "rewards/accuracies": 0.8125, "rewards/chosen": -4.680972576141357, "rewards/margins": 1.9333194494247437, "rewards/rejected": -6.614292144775391, "step": 977 }, { "epoch": 0.6387460200832721, "grad_norm": 18.172957147197614, "learning_rate": 6.957604661450228e-08, "logits/chosen": -1.0247219800949097, "logits/rejected": -0.9776769280433655, "logps/chosen": -901.0526123046875, "logps/rejected": -1040.6275634765625, "loss": 0.3487, "rewards/accuracies": 0.90625, "rewards/chosen": -4.152916431427002, "rewards/margins": 1.5070492029190063, "rewards/rejected": -5.659965991973877, "step": 978 }, { "epoch": 0.6393991346232345, "grad_norm": 19.453149679054654, "learning_rate": 6.935879357803451e-08, "logits/chosen": -0.8311464786529541, "logits/rejected": -0.8162705898284912, "logps/chosen": -808.68359375, "logps/rejected": -955.904541015625, "loss": 0.4156, "rewards/accuracies": 0.84375, "rewards/chosen": -3.762153148651123, "rewards/margins": 1.4650424718856812, "rewards/rejected": -5.227195739746094, "step": 979 }, { "epoch": 0.640052249163197, "grad_norm": 18.263500694059907, "learning_rate": 6.914170003306476e-08, "logits/chosen": -1.0470964908599854, "logits/rejected": -0.9374945163726807, "logps/chosen": -939.5408935546875, "logps/rejected": -1130.5428466796875, "loss": 0.3755, "rewards/accuracies": 0.8125, "rewards/chosen": -4.667804718017578, "rewards/margins": 2.05245041847229, "rewards/rejected": -6.720254898071289, "step": 980 }, { "epoch": 0.6407053637031594, "grad_norm": 20.87710669272072, "learning_rate": 6.892476710959334e-08, "logits/chosen": -0.7399060726165771, "logits/rejected": -0.7495934367179871, "logps/chosen": -881.5730590820312, "logps/rejected": -1082.3707275390625, "loss": 0.3691, "rewards/accuracies": 0.9375, "rewards/chosen": -4.176002502441406, "rewards/margins": 2.2363686561584473, "rewards/rejected": -6.412371635437012, "step": 981 }, { "epoch": 0.6413584782431219, "grad_norm": 23.889366608981074, "learning_rate": 6.870799593678458e-08, "logits/chosen": -0.8247918486595154, "logits/rejected": -0.8469498157501221, "logps/chosen": -935.6553344726562, "logps/rejected": -1030.1029052734375, "loss": 0.3968, "rewards/accuracies": 0.71875, "rewards/chosen": -4.512282848358154, "rewards/margins": 1.1883516311645508, "rewards/rejected": -5.700634956359863, "step": 982 }, { "epoch": 0.6420115927830843, "grad_norm": 23.550902347456475, "learning_rate": 6.849138764296087e-08, "logits/chosen": -0.8264920711517334, "logits/rejected": -0.8020291328430176, "logps/chosen": -953.135498046875, "logps/rejected": -1134.365966796875, "loss": 0.4937, "rewards/accuracies": 0.78125, "rewards/chosen": -4.6887664794921875, "rewards/margins": 1.7499239444732666, "rewards/rejected": -6.438691139221191, "step": 983 }, { "epoch": 0.6426647073230468, "grad_norm": 22.7648671670516, "learning_rate": 6.827494335559682e-08, "logits/chosen": -0.9459264278411865, "logits/rejected": -0.9424221515655518, "logps/chosen": -923.2305297851562, "logps/rejected": -1065.76806640625, "loss": 0.4346, "rewards/accuracies": 0.8125, "rewards/chosen": -5.051098346710205, "rewards/margins": 1.6530250310897827, "rewards/rejected": -6.704123497009277, "step": 984 }, { "epoch": 0.6433178218630092, "grad_norm": 18.62264187081726, "learning_rate": 6.805866420131329e-08, "logits/chosen": -0.9513512253761292, "logits/rejected": -0.9166187047958374, "logps/chosen": -910.2726440429688, "logps/rejected": -1088.285888671875, "loss": 0.4295, "rewards/accuracies": 0.65625, "rewards/chosen": -4.5494465827941895, "rewards/margins": 1.8071138858795166, "rewards/rejected": -6.356560230255127, "step": 985 }, { "epoch": 0.6439709364029716, "grad_norm": 32.6686778226223, "learning_rate": 6.784255130587166e-08, "logits/chosen": -0.5947114825248718, "logits/rejected": -0.578256368637085, "logps/chosen": -806.2866821289062, "logps/rejected": -951.0847778320312, "loss": 0.4845, "rewards/accuracies": 0.75, "rewards/chosen": -4.234209060668945, "rewards/margins": 1.5783112049102783, "rewards/rejected": -5.8125200271606445, "step": 986 }, { "epoch": 0.6446240509429341, "grad_norm": 22.385129535074565, "learning_rate": 6.76266057941679e-08, "logits/chosen": -0.8183972239494324, "logits/rejected": -0.7740908265113831, "logps/chosen": -880.7390747070312, "logps/rejected": -1058.571044921875, "loss": 0.3731, "rewards/accuracies": 0.8125, "rewards/chosen": -4.1473612785339355, "rewards/margins": 1.603851079940796, "rewards/rejected": -5.751212120056152, "step": 987 }, { "epoch": 0.6452771654828966, "grad_norm": 17.849948544570488, "learning_rate": 6.74108287902267e-08, "logits/chosen": -0.9178858995437622, "logits/rejected": -0.8500401377677917, "logps/chosen": -923.59228515625, "logps/rejected": -1101.4404296875, "loss": 0.4397, "rewards/accuracies": 0.84375, "rewards/chosen": -4.897286415100098, "rewards/margins": 1.5341837406158447, "rewards/rejected": -6.431469917297363, "step": 988 }, { "epoch": 0.645930280022859, "grad_norm": 20.377859242404433, "learning_rate": 6.71952214171957e-08, "logits/chosen": -0.6903858780860901, "logits/rejected": -0.6488080620765686, "logps/chosen": -924.1713256835938, "logps/rejected": -1020.035888671875, "loss": 0.3853, "rewards/accuracies": 0.90625, "rewards/chosen": -5.061565399169922, "rewards/margins": 1.2257417440414429, "rewards/rejected": -6.287306785583496, "step": 989 }, { "epoch": 0.6465833945628214, "grad_norm": 21.58412096751822, "learning_rate": 6.697978479733951e-08, "logits/chosen": -0.9614883661270142, "logits/rejected": -0.9037103056907654, "logps/chosen": -965.2286987304688, "logps/rejected": -1166.6185302734375, "loss": 0.3806, "rewards/accuracies": 0.8125, "rewards/chosen": -4.743613243103027, "rewards/margins": 1.9612386226654053, "rewards/rejected": -6.704852104187012, "step": 990 }, { "epoch": 0.6472365091027839, "grad_norm": 23.814036679350696, "learning_rate": 6.676452005203405e-08, "logits/chosen": -0.7713127136230469, "logits/rejected": -0.9046941995620728, "logps/chosen": -868.4608154296875, "logps/rejected": -1109.4259033203125, "loss": 0.3486, "rewards/accuracies": 0.875, "rewards/chosen": -4.386026382446289, "rewards/margins": 1.9416440725326538, "rewards/rejected": -6.327670097351074, "step": 991 }, { "epoch": 0.6478896236427464, "grad_norm": 15.283217602370875, "learning_rate": 6.654942830176046e-08, "logits/chosen": -0.8953443169593811, "logits/rejected": -0.8187096118927002, "logps/chosen": -857.76708984375, "logps/rejected": -1050.30029296875, "loss": 0.3335, "rewards/accuracies": 0.875, "rewards/chosen": -3.9873547554016113, "rewards/margins": 1.8355342149734497, "rewards/rejected": -5.82288932800293, "step": 992 }, { "epoch": 0.6485427381827088, "grad_norm": 24.833610729179064, "learning_rate": 6.63345106660996e-08, "logits/chosen": -0.8883509039878845, "logits/rejected": -0.7819935083389282, "logps/chosen": -1011.8300170898438, "logps/rejected": -1155.8167724609375, "loss": 0.4468, "rewards/accuracies": 0.875, "rewards/chosen": -5.484269618988037, "rewards/margins": 1.5660336017608643, "rewards/rejected": -7.0503034591674805, "step": 993 }, { "epoch": 0.6491958527226712, "grad_norm": 18.970078980831484, "learning_rate": 6.61197682637259e-08, "logits/chosen": -0.9100611209869385, "logits/rejected": -0.9906662702560425, "logps/chosen": -935.2700805664062, "logps/rejected": -1161.526123046875, "loss": 0.3892, "rewards/accuracies": 0.75, "rewards/chosen": -4.428016662597656, "rewards/margins": 1.8627903461456299, "rewards/rejected": -6.290807247161865, "step": 994 }, { "epoch": 0.6498489672626336, "grad_norm": 28.497491815155342, "learning_rate": 6.590520221240173e-08, "logits/chosen": -0.8032527565956116, "logits/rejected": -0.755669116973877, "logps/chosen": -940.073974609375, "logps/rejected": -1132.3983154296875, "loss": 0.4247, "rewards/accuracies": 0.8125, "rewards/chosen": -4.985255241394043, "rewards/margins": 1.579092025756836, "rewards/rejected": -6.564347267150879, "step": 995 }, { "epoch": 0.6505020818025962, "grad_norm": 19.59715751506617, "learning_rate": 6.569081362897154e-08, "logits/chosen": -0.8742798566818237, "logits/rejected": -0.8021313548088074, "logps/chosen": -972.17919921875, "logps/rejected": -1167.32373046875, "loss": 0.4422, "rewards/accuracies": 0.84375, "rewards/chosen": -5.003599166870117, "rewards/margins": 1.8019216060638428, "rewards/rejected": -6.805520534515381, "step": 996 }, { "epoch": 0.6511551963425586, "grad_norm": 19.152498975684075, "learning_rate": 6.547660362935602e-08, "logits/chosen": -0.8598069548606873, "logits/rejected": -0.8882841467857361, "logps/chosen": -964.857666015625, "logps/rejected": -1174.9244384765625, "loss": 0.4068, "rewards/accuracies": 0.9375, "rewards/chosen": -4.544558048248291, "rewards/margins": 1.9304155111312866, "rewards/rejected": -6.474973678588867, "step": 997 }, { "epoch": 0.651808310882521, "grad_norm": 20.733434029636296, "learning_rate": 6.526257332854631e-08, "logits/chosen": -0.815421462059021, "logits/rejected": -0.817630410194397, "logps/chosen": -898.12744140625, "logps/rejected": -1068.466796875, "loss": 0.4547, "rewards/accuracies": 0.8125, "rewards/chosen": -4.393612861633301, "rewards/margins": 1.5290162563323975, "rewards/rejected": -5.922628879547119, "step": 998 }, { "epoch": 0.6524614254224834, "grad_norm": 39.67458708618184, "learning_rate": 6.504872384059821e-08, "logits/chosen": -0.986212968826294, "logits/rejected": -0.7506334781646729, "logps/chosen": -941.7611083984375, "logps/rejected": -980.0782470703125, "loss": 0.5037, "rewards/accuracies": 0.75, "rewards/chosen": -4.185219764709473, "rewards/margins": 1.1642541885375977, "rewards/rejected": -5.34947395324707, "step": 999 }, { "epoch": 0.653114539962446, "grad_norm": 28.590369004082522, "learning_rate": 6.483505627862632e-08, "logits/chosen": -0.7975764870643616, "logits/rejected": -0.8433130383491516, "logps/chosen": -905.9865112304688, "logps/rejected": -1150.0699462890625, "loss": 0.3935, "rewards/accuracies": 0.8125, "rewards/chosen": -4.6011528968811035, "rewards/margins": 2.136965274810791, "rewards/rejected": -6.7381181716918945, "step": 1000 }, { "epoch": 0.653114539962446, "eval_logits/chosen": -0.6597719192504883, "eval_logits/rejected": -0.6015712022781372, "eval_logps/chosen": -916.9378662109375, "eval_logps/rejected": -1058.8646240234375, "eval_loss": 0.39993926882743835, "eval_rewards/accuracies": 0.8109999895095825, "eval_rewards/chosen": -4.430306911468506, "eval_rewards/margins": 1.506652593612671, "eval_rewards/rejected": -5.936959743499756, "eval_runtime": 611.597, "eval_samples_per_second": 6.54, "eval_steps_per_second": 0.409, "step": 1000 }, { "epoch": 0.6537676545024084, "grad_norm": 18.88782766889396, "learning_rate": 6.462157175479834e-08, "logits/chosen": -0.7251522541046143, "logits/rejected": -0.7726633548736572, "logps/chosen": -888.7282104492188, "logps/rejected": -1134.8662109375, "loss": 0.4294, "rewards/accuracies": 0.84375, "rewards/chosen": -4.475460052490234, "rewards/margins": 2.2871503829956055, "rewards/rejected": -6.76261043548584, "step": 1001 }, { "epoch": 0.6544207690423708, "grad_norm": 17.103388735372086, "learning_rate": 6.440827138032926e-08, "logits/chosen": -0.8073115944862366, "logits/rejected": -0.8252954483032227, "logps/chosen": -954.5016479492188, "logps/rejected": -1213.746826171875, "loss": 0.3381, "rewards/accuracies": 0.84375, "rewards/chosen": -4.250032424926758, "rewards/margins": 2.545926809310913, "rewards/rejected": -6.79595947265625, "step": 1002 }, { "epoch": 0.6550738835823332, "grad_norm": 20.252892967215743, "learning_rate": 6.419515626547542e-08, "logits/chosen": -0.9170666337013245, "logits/rejected": -0.830635130405426, "logps/chosen": -868.954833984375, "logps/rejected": -1118.8985595703125, "loss": 0.3554, "rewards/accuracies": 1.0, "rewards/chosen": -4.489101886749268, "rewards/margins": 2.7545058727264404, "rewards/rejected": -7.243607997894287, "step": 1003 }, { "epoch": 0.6557269981222957, "grad_norm": 21.94769150248843, "learning_rate": 6.398222751952898e-08, "logits/chosen": -0.97673499584198, "logits/rejected": -0.8297156691551208, "logps/chosen": -961.0098876953125, "logps/rejected": -1063.3936767578125, "loss": 0.4405, "rewards/accuracies": 0.78125, "rewards/chosen": -4.749734878540039, "rewards/margins": 1.4654510021209717, "rewards/rejected": -6.215185642242432, "step": 1004 }, { "epoch": 0.6563801126622582, "grad_norm": 17.236789737719608, "learning_rate": 6.376948625081197e-08, "logits/chosen": -0.9749749302864075, "logits/rejected": -0.8019654750823975, "logps/chosen": -841.8432006835938, "logps/rejected": -1004.3468017578125, "loss": 0.3637, "rewards/accuracies": 0.84375, "rewards/chosen": -3.680497646331787, "rewards/margins": 2.00964093208313, "rewards/rejected": -5.69013786315918, "step": 1005 }, { "epoch": 0.6570332272022206, "grad_norm": 17.819653232599666, "learning_rate": 6.355693356667064e-08, "logits/chosen": -0.9714920520782471, "logits/rejected": -0.8096312284469604, "logps/chosen": -877.8084106445312, "logps/rejected": -976.0759887695312, "loss": 0.3838, "rewards/accuracies": 0.75, "rewards/chosen": -4.433545112609863, "rewards/margins": 1.3440808057785034, "rewards/rejected": -5.777626037597656, "step": 1006 }, { "epoch": 0.657686341742183, "grad_norm": 21.829219375237, "learning_rate": 6.334457057346955e-08, "logits/chosen": -1.0377883911132812, "logits/rejected": -0.8932283520698547, "logps/chosen": -975.3641357421875, "logps/rejected": -1123.2431640625, "loss": 0.4111, "rewards/accuracies": 0.875, "rewards/chosen": -4.581691741943359, "rewards/margins": 2.190666437149048, "rewards/rejected": -6.772358417510986, "step": 1007 }, { "epoch": 0.6583394562821455, "grad_norm": 25.870601002660127, "learning_rate": 6.313239837658595e-08, "logits/chosen": -0.794097900390625, "logits/rejected": -0.7714889645576477, "logps/chosen": -843.1420288085938, "logps/rejected": -1065.3768310546875, "loss": 0.3803, "rewards/accuracies": 0.78125, "rewards/chosen": -4.312726020812988, "rewards/margins": 1.87660813331604, "rewards/rejected": -6.189334392547607, "step": 1008 }, { "epoch": 0.6589925708221079, "grad_norm": 18.443080252151734, "learning_rate": 6.292041808040392e-08, "logits/chosen": -0.8890469074249268, "logits/rejected": -0.8851238489151001, "logps/chosen": -943.687255859375, "logps/rejected": -1115.829345703125, "loss": 0.3903, "rewards/accuracies": 0.90625, "rewards/chosen": -4.130290985107422, "rewards/margins": 1.4789390563964844, "rewards/rejected": -5.609230041503906, "step": 1009 }, { "epoch": 0.6596456853620704, "grad_norm": 18.127254021458366, "learning_rate": 6.270863078830875e-08, "logits/chosen": -0.8616389036178589, "logits/rejected": -0.8308084607124329, "logps/chosen": -1043.859619140625, "logps/rejected": -1196.02734375, "loss": 0.3193, "rewards/accuracies": 0.8125, "rewards/chosen": -4.978635787963867, "rewards/margins": 1.3679126501083374, "rewards/rejected": -6.346548080444336, "step": 1010 }, { "epoch": 0.6602987999020328, "grad_norm": 15.263520577363048, "learning_rate": 6.249703760268102e-08, "logits/chosen": -0.8907433748245239, "logits/rejected": -0.7941842079162598, "logps/chosen": -902.1382446289062, "logps/rejected": -1041.628173828125, "loss": 0.4291, "rewards/accuracies": 0.78125, "rewards/chosen": -4.760441303253174, "rewards/margins": 1.240327000617981, "rewards/rejected": -6.000767707824707, "step": 1011 }, { "epoch": 0.6609519144419953, "grad_norm": 16.88956803814998, "learning_rate": 6.228563962489105e-08, "logits/chosen": -0.7089736461639404, "logits/rejected": -0.8286032676696777, "logps/chosen": -887.5546875, "logps/rejected": -1206.897705078125, "loss": 0.3464, "rewards/accuracies": 0.90625, "rewards/chosen": -4.500287055969238, "rewards/margins": 2.4705374240875244, "rewards/rejected": -6.970824241638184, "step": 1012 }, { "epoch": 0.6616050289819577, "grad_norm": 20.507417708250284, "learning_rate": 6.207443795529302e-08, "logits/chosen": -0.9035853147506714, "logits/rejected": -0.8546502590179443, "logps/chosen": -930.5029296875, "logps/rejected": -1015.2005004882812, "loss": 0.3733, "rewards/accuracies": 0.75, "rewards/chosen": -4.440670490264893, "rewards/margins": 1.2674169540405273, "rewards/rejected": -5.708087921142578, "step": 1013 }, { "epoch": 0.6622581435219201, "grad_norm": 23.30558964260641, "learning_rate": 6.186343369321936e-08, "logits/chosen": -0.866624116897583, "logits/rejected": -0.8739579916000366, "logps/chosen": -961.4056396484375, "logps/rejected": -1113.17529296875, "loss": 0.413, "rewards/accuracies": 0.875, "rewards/chosen": -4.258022308349609, "rewards/margins": 1.6140351295471191, "rewards/rejected": -5.8720574378967285, "step": 1014 }, { "epoch": 0.6629112580618826, "grad_norm": 27.951902968353572, "learning_rate": 6.165262793697485e-08, "logits/chosen": -0.7021287083625793, "logits/rejected": -0.7541577816009521, "logps/chosen": -975.0057373046875, "logps/rejected": -1180.75830078125, "loss": 0.3428, "rewards/accuracies": 0.90625, "rewards/chosen": -4.649076461791992, "rewards/margins": 1.4899348020553589, "rewards/rejected": -6.139011383056641, "step": 1015 }, { "epoch": 0.6635643726018451, "grad_norm": 17.65162459087189, "learning_rate": 6.144202178383116e-08, "logits/chosen": -0.8119497299194336, "logits/rejected": -0.7789148688316345, "logps/chosen": -1021.3302001953125, "logps/rejected": -1172.5645751953125, "loss": 0.3729, "rewards/accuracies": 0.59375, "rewards/chosen": -5.71751070022583, "rewards/margins": 1.3930636644363403, "rewards/rejected": -7.110573768615723, "step": 1016 }, { "epoch": 0.6642174871418075, "grad_norm": 21.54542033666558, "learning_rate": 6.12316163300209e-08, "logits/chosen": -1.0835685729980469, "logits/rejected": -0.9866104125976562, "logps/chosen": -963.4041137695312, "logps/rejected": -1078.0098876953125, "loss": 0.387, "rewards/accuracies": 0.84375, "rewards/chosen": -4.319831371307373, "rewards/margins": 1.408236026763916, "rewards/rejected": -5.7280683517456055, "step": 1017 }, { "epoch": 0.6648706016817699, "grad_norm": 28.278042534457644, "learning_rate": 6.102141267073207e-08, "logits/chosen": -0.7278181314468384, "logits/rejected": -0.6452523469924927, "logps/chosen": -1027.501708984375, "logps/rejected": -1116.961181640625, "loss": 0.4252, "rewards/accuracies": 0.71875, "rewards/chosen": -5.605171203613281, "rewards/margins": 1.059931993484497, "rewards/rejected": -6.665103435516357, "step": 1018 }, { "epoch": 0.6655237162217323, "grad_norm": 21.588692051121384, "learning_rate": 6.081141190010228e-08, "logits/chosen": -0.9027915596961975, "logits/rejected": -0.9593067169189453, "logps/chosen": -929.5646362304688, "logps/rejected": -1081.724365234375, "loss": 0.3564, "rewards/accuracies": 0.875, "rewards/chosen": -5.074166774749756, "rewards/margins": 1.5641591548919678, "rewards/rejected": -6.638326644897461, "step": 1019 }, { "epoch": 0.6661768307616949, "grad_norm": 29.52749993277838, "learning_rate": 6.06016151112131e-08, "logits/chosen": -0.8214589953422546, "logits/rejected": -0.7484840750694275, "logps/chosen": -941.7957763671875, "logps/rejected": -1127.7822265625, "loss": 0.3624, "rewards/accuracies": 0.875, "rewards/chosen": -5.040858745574951, "rewards/margins": 1.8832162618637085, "rewards/rejected": -6.924075126647949, "step": 1020 }, { "epoch": 0.6668299453016573, "grad_norm": 35.80802512812045, "learning_rate": 6.039202339608431e-08, "logits/chosen": -0.9403936266899109, "logits/rejected": -0.9254910349845886, "logps/chosen": -966.054443359375, "logps/rejected": -1181.616455078125, "loss": 0.3783, "rewards/accuracies": 0.75, "rewards/chosen": -4.613440036773682, "rewards/margins": 1.987739086151123, "rewards/rejected": -6.601179122924805, "step": 1021 }, { "epoch": 0.6674830598416197, "grad_norm": 32.33080558442622, "learning_rate": 6.01826378456683e-08, "logits/chosen": -0.8391031622886658, "logits/rejected": -0.6777211427688599, "logps/chosen": -1084.9720458984375, "logps/rejected": -1229.69140625, "loss": 0.4813, "rewards/accuracies": 0.90625, "rewards/chosen": -5.708389759063721, "rewards/margins": 2.226646661758423, "rewards/rejected": -7.9350361824035645, "step": 1022 }, { "epoch": 0.6681361743815821, "grad_norm": 26.323254452592845, "learning_rate": 5.997345954984428e-08, "logits/chosen": -0.8788707852363586, "logits/rejected": -0.7895121574401855, "logps/chosen": -980.4774169921875, "logps/rejected": -1198.721435546875, "loss": 0.4067, "rewards/accuracies": 0.875, "rewards/chosen": -5.896998882293701, "rewards/margins": 1.8806391954421997, "rewards/rejected": -7.777637958526611, "step": 1023 }, { "epoch": 0.6687892889215447, "grad_norm": 29.774523819635025, "learning_rate": 5.976448959741274e-08, "logits/chosen": -0.8680253624916077, "logits/rejected": -0.8557955026626587, "logps/chosen": -1033.0860595703125, "logps/rejected": -1219.666748046875, "loss": 0.4247, "rewards/accuracies": 0.84375, "rewards/chosen": -5.772690773010254, "rewards/margins": 1.7076537609100342, "rewards/rejected": -7.480345249176025, "step": 1024 }, { "epoch": 0.6694424034615071, "grad_norm": 23.95120311809926, "learning_rate": 5.95557290760897e-08, "logits/chosen": -0.8246513605117798, "logits/rejected": -0.7980694770812988, "logps/chosen": -1030.458251953125, "logps/rejected": -1224.46826171875, "loss": 0.4175, "rewards/accuracies": 0.9375, "rewards/chosen": -5.804914474487305, "rewards/margins": 1.9084160327911377, "rewards/rejected": -7.71333122253418, "step": 1025 }, { "epoch": 0.6700955180014695, "grad_norm": 15.030718046861146, "learning_rate": 5.934717907250103e-08, "logits/chosen": -0.9304713010787964, "logits/rejected": -0.8270508050918579, "logps/chosen": -1171.826416015625, "logps/rejected": -1419.8758544921875, "loss": 0.3204, "rewards/accuracies": 0.875, "rewards/chosen": -6.460097312927246, "rewards/margins": 2.5480339527130127, "rewards/rejected": -9.00813102722168, "step": 1026 }, { "epoch": 0.6707486325414319, "grad_norm": 38.89653299484151, "learning_rate": 5.9138840672176845e-08, "logits/chosen": -1.040481448173523, "logits/rejected": -0.9256024360656738, "logps/chosen": -1069.36767578125, "logps/rejected": -1228.0535888671875, "loss": 0.3857, "rewards/accuracies": 0.78125, "rewards/chosen": -5.9183573722839355, "rewards/margins": 1.9786078929901123, "rewards/rejected": -7.896965026855469, "step": 1027 }, { "epoch": 0.6714017470813944, "grad_norm": 36.203980499152635, "learning_rate": 5.893071495954587e-08, "logits/chosen": -0.9186846017837524, "logits/rejected": -0.6426994800567627, "logps/chosen": -1027.7308349609375, "logps/rejected": -1147.070068359375, "loss": 0.4639, "rewards/accuracies": 0.8125, "rewards/chosen": -5.392332077026367, "rewards/margins": 1.7910363674163818, "rewards/rejected": -7.183367729187012, "step": 1028 }, { "epoch": 0.6720548616213569, "grad_norm": 49.257481169429646, "learning_rate": 5.87228030179297e-08, "logits/chosen": -0.8563072681427002, "logits/rejected": -0.8698632121086121, "logps/chosen": -940.2181396484375, "logps/rejected": -1112.7574462890625, "loss": 0.5029, "rewards/accuracies": 0.8125, "rewards/chosen": -5.377842426300049, "rewards/margins": 1.112206220626831, "rewards/rejected": -6.490049362182617, "step": 1029 }, { "epoch": 0.6727079761613193, "grad_norm": 27.724815910490978, "learning_rate": 5.851510592953728e-08, "logits/chosen": -0.705344557762146, "logits/rejected": -0.6405034065246582, "logps/chosen": -1008.1170654296875, "logps/rejected": -1307.7806396484375, "loss": 0.3799, "rewards/accuracies": 0.9375, "rewards/chosen": -5.663532257080078, "rewards/margins": 2.7474920749664307, "rewards/rejected": -8.41102409362793, "step": 1030 }, { "epoch": 0.6733610907012817, "grad_norm": 30.54103333361302, "learning_rate": 5.8307624775459194e-08, "logits/chosen": -0.9086852073669434, "logits/rejected": -0.7840572595596313, "logps/chosen": -1092.9234619140625, "logps/rejected": -1261.9825439453125, "loss": 0.3802, "rewards/accuracies": 0.90625, "rewards/chosen": -5.860642433166504, "rewards/margins": 1.9201451539993286, "rewards/rejected": -7.780787467956543, "step": 1031 }, { "epoch": 0.6740142052412442, "grad_norm": 22.99525416459896, "learning_rate": 5.810036063566206e-08, "logits/chosen": -0.9566446542739868, "logits/rejected": -0.9745736718177795, "logps/chosen": -1061.357421875, "logps/rejected": -1276.2802734375, "loss": 0.3843, "rewards/accuracies": 0.84375, "rewards/chosen": -5.4080634117126465, "rewards/margins": 1.6003923416137695, "rewards/rejected": -7.008456230163574, "step": 1032 }, { "epoch": 0.6746673197812066, "grad_norm": 16.646591987887586, "learning_rate": 5.78933145889829e-08, "logits/chosen": -0.6335456371307373, "logits/rejected": -0.603712260723114, "logps/chosen": -911.3923950195312, "logps/rejected": -1112.7017822265625, "loss": 0.3671, "rewards/accuracies": 0.84375, "rewards/chosen": -4.9419169425964355, "rewards/margins": 2.1146814823150635, "rewards/rejected": -7.056598663330078, "step": 1033 }, { "epoch": 0.6753204343211691, "grad_norm": 18.61320965234198, "learning_rate": 5.768648771312354e-08, "logits/chosen": -0.8663079738616943, "logits/rejected": -0.9046114683151245, "logps/chosen": -874.4501342773438, "logps/rejected": -1093.3466796875, "loss": 0.4778, "rewards/accuracies": 0.78125, "rewards/chosen": -4.530414581298828, "rewards/margins": 1.9164561033248901, "rewards/rejected": -6.44687032699585, "step": 1034 }, { "epoch": 0.6759735488611315, "grad_norm": 32.866564141810386, "learning_rate": 5.747988108464501e-08, "logits/chosen": -0.9264869689941406, "logits/rejected": -0.8600889444351196, "logps/chosen": -914.56494140625, "logps/rejected": -1075.855712890625, "loss": 0.4244, "rewards/accuracies": 0.875, "rewards/chosen": -4.675785541534424, "rewards/margins": 1.764554500579834, "rewards/rejected": -6.440340042114258, "step": 1035 }, { "epoch": 0.676626663401094, "grad_norm": 22.030929440915205, "learning_rate": 5.7273495778961934e-08, "logits/chosen": -0.811077356338501, "logits/rejected": -0.8401280641555786, "logps/chosen": -895.1986083984375, "logps/rejected": -1045.7843017578125, "loss": 0.4521, "rewards/accuracies": 0.90625, "rewards/chosen": -4.402617931365967, "rewards/margins": 1.4379081726074219, "rewards/rejected": -5.8405256271362305, "step": 1036 }, { "epoch": 0.6772797779410564, "grad_norm": 17.044116664966285, "learning_rate": 5.706733287033681e-08, "logits/chosen": -0.8876982927322388, "logits/rejected": -0.9169207215309143, "logps/chosen": -883.7059936523438, "logps/rejected": -1003.1925048828125, "loss": 0.3869, "rewards/accuracies": 0.78125, "rewards/chosen": -4.5824151039123535, "rewards/margins": 1.4617775678634644, "rewards/rejected": -6.044192790985107, "step": 1037 }, { "epoch": 0.6779328924810188, "grad_norm": 15.396487747451959, "learning_rate": 5.686139343187467e-08, "logits/chosen": -0.9496694803237915, "logits/rejected": -1.007110357284546, "logps/chosen": -821.8455810546875, "logps/rejected": -910.9765014648438, "loss": 0.3915, "rewards/accuracies": 0.71875, "rewards/chosen": -3.7206950187683105, "rewards/margins": 1.021874189376831, "rewards/rejected": -4.742569446563721, "step": 1038 }, { "epoch": 0.6785860070209813, "grad_norm": 15.439263523759438, "learning_rate": 5.6655678535517296e-08, "logits/chosen": -0.8559461236000061, "logits/rejected": -0.8094202876091003, "logps/chosen": -765.6228637695312, "logps/rejected": -977.0548095703125, "loss": 0.3976, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1760189533233643, "rewards/margins": 1.9604696035385132, "rewards/rejected": -5.136489391326904, "step": 1039 }, { "epoch": 0.6792391215609438, "grad_norm": 23.113871047926832, "learning_rate": 5.645018925203771e-08, "logits/chosen": -0.8940149545669556, "logits/rejected": -0.9034225940704346, "logps/chosen": -877.113037109375, "logps/rejected": -1016.8856201171875, "loss": 0.3944, "rewards/accuracies": 0.8125, "rewards/chosen": -4.261010646820068, "rewards/margins": 1.2613548040390015, "rewards/rejected": -5.522365093231201, "step": 1040 }, { "epoch": 0.6798922361009062, "grad_norm": 15.694033791253236, "learning_rate": 5.6244926651034554e-08, "logits/chosen": -0.9289844036102295, "logits/rejected": -0.8611258268356323, "logps/chosen": -899.3880004882812, "logps/rejected": -1034.294677734375, "loss": 0.4081, "rewards/accuracies": 0.75, "rewards/chosen": -4.299962997436523, "rewards/margins": 1.414807677268982, "rewards/rejected": -5.714770793914795, "step": 1041 }, { "epoch": 0.6805453506408686, "grad_norm": 20.104453338479633, "learning_rate": 5.603989180092661e-08, "logits/chosen": -1.038852334022522, "logits/rejected": -1.0274477005004883, "logps/chosen": -864.4334716796875, "logps/rejected": -1042.127197265625, "loss": 0.3334, "rewards/accuracies": 0.875, "rewards/chosen": -3.5643439292907715, "rewards/margins": 2.006700277328491, "rewards/rejected": -5.571044445037842, "step": 1042 }, { "epoch": 0.681198465180831, "grad_norm": 20.284466958221444, "learning_rate": 5.583508576894716e-08, "logits/chosen": -0.9074689149856567, "logits/rejected": -0.8942803144454956, "logps/chosen": -788.8450927734375, "logps/rejected": -977.1786499023438, "loss": 0.4116, "rewards/accuracies": 0.84375, "rewards/chosen": -3.9018185138702393, "rewards/margins": 1.7637240886688232, "rewards/rejected": -5.6655426025390625, "step": 1043 }, { "epoch": 0.6818515797207936, "grad_norm": 20.80758519782795, "learning_rate": 5.563050962113844e-08, "logits/chosen": -0.7185485363006592, "logits/rejected": -0.6636132001876831, "logps/chosen": -881.2183227539062, "logps/rejected": -1114.7947998046875, "loss": 0.3875, "rewards/accuracies": 0.875, "rewards/chosen": -4.508395195007324, "rewards/margins": 2.214874029159546, "rewards/rejected": -6.723268985748291, "step": 1044 }, { "epoch": 0.682504694260756, "grad_norm": 21.006280162797133, "learning_rate": 5.542616442234618e-08, "logits/chosen": -0.9581667184829712, "logits/rejected": -0.9366329312324524, "logps/chosen": -965.768798828125, "logps/rejected": -1094.48095703125, "loss": 0.3038, "rewards/accuracies": 1.0, "rewards/chosen": -4.318354606628418, "rewards/margins": 1.6102018356323242, "rewards/rejected": -5.928555965423584, "step": 1045 }, { "epoch": 0.6831578088007184, "grad_norm": 20.92621303297454, "learning_rate": 5.522205123621389e-08, "logits/chosen": -0.7880922555923462, "logits/rejected": -0.8342847228050232, "logps/chosen": -796.2772216796875, "logps/rejected": -954.2313232421875, "loss": 0.3693, "rewards/accuracies": 0.875, "rewards/chosen": -3.7319140434265137, "rewards/margins": 1.4701519012451172, "rewards/rejected": -5.202065467834473, "step": 1046 }, { "epoch": 0.6838109233406808, "grad_norm": 28.646083147828463, "learning_rate": 5.501817112517748e-08, "logits/chosen": -0.8063758611679077, "logits/rejected": -0.7389829158782959, "logps/chosen": -827.5449829101562, "logps/rejected": -965.8816528320312, "loss": 0.3728, "rewards/accuracies": 0.8125, "rewards/chosen": -4.156528472900391, "rewards/margins": 1.3732936382293701, "rewards/rejected": -5.529821872711182, "step": 1047 }, { "epoch": 0.6844640378806434, "grad_norm": 16.107664772286466, "learning_rate": 5.4814525150459735e-08, "logits/chosen": -0.798210859298706, "logits/rejected": -0.7040055394172668, "logps/chosen": -994.9788208007812, "logps/rejected": -1129.2100830078125, "loss": 0.4314, "rewards/accuracies": 0.8125, "rewards/chosen": -4.312658309936523, "rewards/margins": 1.7292759418487549, "rewards/rejected": -6.041934490203857, "step": 1048 }, { "epoch": 0.6851171524206058, "grad_norm": 19.906158775570255, "learning_rate": 5.461111437206456e-08, "logits/chosen": -0.6621243953704834, "logits/rejected": -0.7967365980148315, "logps/chosen": -920.3370971679688, "logps/rejected": -1080.441650390625, "loss": 0.42, "rewards/accuracies": 0.78125, "rewards/chosen": -4.662235260009766, "rewards/margins": 1.124133825302124, "rewards/rejected": -5.786369800567627, "step": 1049 }, { "epoch": 0.6857702669605682, "grad_norm": 14.893691672277162, "learning_rate": 5.4407939848771764e-08, "logits/chosen": -0.9783452749252319, "logits/rejected": -0.9631463885307312, "logps/chosen": -800.3297119140625, "logps/rejected": -957.534912109375, "loss": 0.3606, "rewards/accuracies": 0.78125, "rewards/chosen": -3.5368897914886475, "rewards/margins": 1.668915867805481, "rewards/rejected": -5.205805778503418, "step": 1050 }, { "epoch": 0.6864233815005306, "grad_norm": 25.85461733379247, "learning_rate": 5.4205002638131404e-08, "logits/chosen": -0.9667469263076782, "logits/rejected": -0.9314427971839905, "logps/chosen": -910.9541625976562, "logps/rejected": -1013.9326782226562, "loss": 0.4132, "rewards/accuracies": 0.75, "rewards/chosen": -4.142405986785889, "rewards/margins": 0.9281677007675171, "rewards/rejected": -5.070573329925537, "step": 1051 }, { "epoch": 0.6870764960404931, "grad_norm": 16.778904431002125, "learning_rate": 5.400230379645827e-08, "logits/chosen": -0.8126803636550903, "logits/rejected": -0.9782637357711792, "logps/chosen": -929.3917236328125, "logps/rejected": -1180.9573974609375, "loss": 0.351, "rewards/accuracies": 0.84375, "rewards/chosen": -4.6177825927734375, "rewards/margins": 1.6634180545806885, "rewards/rejected": -6.281200408935547, "step": 1052 }, { "epoch": 0.6877296105804556, "grad_norm": 20.764558520137363, "learning_rate": 5.379984437882642e-08, "logits/chosen": -0.7940772771835327, "logits/rejected": -0.7913058400154114, "logps/chosen": -897.7255249023438, "logps/rejected": -1095.1759033203125, "loss": 0.3669, "rewards/accuracies": 0.90625, "rewards/chosen": -4.532567977905273, "rewards/margins": 1.7281697988510132, "rewards/rejected": -6.260737419128418, "step": 1053 }, { "epoch": 0.688382725120418, "grad_norm": 20.538231684797744, "learning_rate": 5.3597625439063675e-08, "logits/chosen": -0.9358534812927246, "logits/rejected": -1.0889662504196167, "logps/chosen": -925.3179321289062, "logps/rejected": -1099.0260009765625, "loss": 0.4167, "rewards/accuracies": 0.78125, "rewards/chosen": -3.8400797843933105, "rewards/margins": 1.610809326171875, "rewards/rejected": -5.450888633728027, "step": 1054 }, { "epoch": 0.6890358396603804, "grad_norm": 31.707439136525775, "learning_rate": 5.339564802974614e-08, "logits/chosen": -0.8603663444519043, "logits/rejected": -0.658951461315155, "logps/chosen": -934.3653564453125, "logps/rejected": -1055.4537353515625, "loss": 0.4493, "rewards/accuracies": 0.75, "rewards/chosen": -4.719765663146973, "rewards/margins": 1.5893982648849487, "rewards/rejected": -6.309163570404053, "step": 1055 }, { "epoch": 0.6896889542003429, "grad_norm": 35.603302908406555, "learning_rate": 5.319391320219271e-08, "logits/chosen": -0.7260603308677673, "logits/rejected": -0.5930569767951965, "logps/chosen": -985.3331298828125, "logps/rejected": -1116.142822265625, "loss": 0.4828, "rewards/accuracies": 0.875, "rewards/chosen": -4.90203332901001, "rewards/margins": 1.783280372619629, "rewards/rejected": -6.685314178466797, "step": 1056 }, { "epoch": 0.6903420687403053, "grad_norm": 36.525210316554094, "learning_rate": 5.2992422006459584e-08, "logits/chosen": -1.0283126831054688, "logits/rejected": -0.9706616401672363, "logps/chosen": -881.817626953125, "logps/rejected": -996.3411865234375, "loss": 0.3508, "rewards/accuracies": 0.78125, "rewards/chosen": -4.343301296234131, "rewards/margins": 0.9503412246704102, "rewards/rejected": -5.293642997741699, "step": 1057 }, { "epoch": 0.6909951832802678, "grad_norm": 16.444240118695234, "learning_rate": 5.279117549133494e-08, "logits/chosen": -0.7937179207801819, "logits/rejected": -0.7530346512794495, "logps/chosen": -1088.676513671875, "logps/rejected": -1272.829833984375, "loss": 0.4141, "rewards/accuracies": 0.75, "rewards/chosen": -5.84459114074707, "rewards/margins": 1.847930669784546, "rewards/rejected": -7.692521572113037, "step": 1058 }, { "epoch": 0.6916482978202302, "grad_norm": 22.26048819934266, "learning_rate": 5.259017470433328e-08, "logits/chosen": -0.9292958378791809, "logits/rejected": -0.8815614581108093, "logps/chosen": -977.33447265625, "logps/rejected": -1077.522705078125, "loss": 0.3846, "rewards/accuracies": 0.65625, "rewards/chosen": -4.464293003082275, "rewards/margins": 1.2737520933151245, "rewards/rejected": -5.7380452156066895, "step": 1059 }, { "epoch": 0.6923014123601927, "grad_norm": 22.035446979342556, "learning_rate": 5.238942069168999e-08, "logits/chosen": -1.0402249097824097, "logits/rejected": -0.9985241889953613, "logps/chosen": -899.611328125, "logps/rejected": -1045.4075927734375, "loss": 0.417, "rewards/accuracies": 0.8125, "rewards/chosen": -4.677881240844727, "rewards/margins": 1.5670441389083862, "rewards/rejected": -6.2449259757995605, "step": 1060 }, { "epoch": 0.6929545269001551, "grad_norm": 16.41657358283337, "learning_rate": 5.2188914498356074e-08, "logits/chosen": -0.7909882068634033, "logits/rejected": -0.652152955532074, "logps/chosen": -1015.9618530273438, "logps/rejected": -1141.8668212890625, "loss": 0.3874, "rewards/accuracies": 0.78125, "rewards/chosen": -5.259662628173828, "rewards/margins": 1.6021357774734497, "rewards/rejected": -6.8617987632751465, "step": 1061 }, { "epoch": 0.6936076414401176, "grad_norm": 18.329459349173682, "learning_rate": 5.198865716799256e-08, "logits/chosen": -0.9934824109077454, "logits/rejected": -0.9579141139984131, "logps/chosen": -945.3819580078125, "logps/rejected": -1096.711181640625, "loss": 0.3489, "rewards/accuracies": 0.8125, "rewards/chosen": -4.630897045135498, "rewards/margins": 1.4307856559753418, "rewards/rejected": -6.06168270111084, "step": 1062 }, { "epoch": 0.69426075598008, "grad_norm": 18.277996368135458, "learning_rate": 5.178864974296511e-08, "logits/chosen": -0.9666036367416382, "logits/rejected": -0.9266764521598816, "logps/chosen": -993.549560546875, "logps/rejected": -1246.1953125, "loss": 0.3568, "rewards/accuracies": 0.90625, "rewards/chosen": -4.596602916717529, "rewards/margins": 2.0984785556793213, "rewards/rejected": -6.6950812339782715, "step": 1063 }, { "epoch": 0.6949138705200425, "grad_norm": 17.971296792300368, "learning_rate": 5.1588893264338616e-08, "logits/chosen": -0.8004229068756104, "logits/rejected": -0.85038822889328, "logps/chosen": -1064.92822265625, "logps/rejected": -1406.717529296875, "loss": 0.3459, "rewards/accuracies": 0.9375, "rewards/chosen": -5.635952949523926, "rewards/margins": 3.123544216156006, "rewards/rejected": -8.759496688842773, "step": 1064 }, { "epoch": 0.6955669850600049, "grad_norm": 26.220818179258686, "learning_rate": 5.138938877187173e-08, "logits/chosen": -0.8688936233520508, "logits/rejected": -0.9654818773269653, "logps/chosen": -949.654296875, "logps/rejected": -1131.2215576171875, "loss": 0.4364, "rewards/accuracies": 0.875, "rewards/chosen": -5.012843132019043, "rewards/margins": 1.3016622066497803, "rewards/rejected": -6.314505577087402, "step": 1065 }, { "epoch": 0.6962200995999673, "grad_norm": 19.263308241664543, "learning_rate": 5.119013730401152e-08, "logits/chosen": -0.9235569834709167, "logits/rejected": -0.8715468049049377, "logps/chosen": -957.46923828125, "logps/rejected": -1096.870849609375, "loss": 0.3926, "rewards/accuracies": 0.84375, "rewards/chosen": -5.0340352058410645, "rewards/margins": 1.6012662649154663, "rewards/rejected": -6.63530158996582, "step": 1066 }, { "epoch": 0.6968732141399298, "grad_norm": 35.0608624519057, "learning_rate": 5.099113989788799e-08, "logits/chosen": -0.7219104766845703, "logits/rejected": -0.7232096791267395, "logps/chosen": -897.578125, "logps/rejected": -1049.745361328125, "loss": 0.438, "rewards/accuracies": 0.75, "rewards/chosen": -4.6848464012146, "rewards/margins": 1.5889393091201782, "rewards/rejected": -6.273785591125488, "step": 1067 }, { "epoch": 0.6975263286798923, "grad_norm": 46.493238733591774, "learning_rate": 5.0792397589308754e-08, "logits/chosen": -0.9190981984138489, "logits/rejected": -0.9008411169052124, "logps/chosen": -1021.0134887695312, "logps/rejected": -1238.074462890625, "loss": 0.4594, "rewards/accuracies": 0.78125, "rewards/chosen": -5.529772758483887, "rewards/margins": 1.920540452003479, "rewards/rejected": -7.450313568115234, "step": 1068 }, { "epoch": 0.6981794432198547, "grad_norm": 34.874149040915995, "learning_rate": 5.0593911412753574e-08, "logits/chosen": -0.8482516407966614, "logits/rejected": -0.814669132232666, "logps/chosen": -878.93310546875, "logps/rejected": -1069.1351318359375, "loss": 0.4679, "rewards/accuracies": 0.78125, "rewards/chosen": -4.570670127868652, "rewards/margins": 1.5518887042999268, "rewards/rejected": -6.122559070587158, "step": 1069 }, { "epoch": 0.6988325577598171, "grad_norm": 25.3481107361764, "learning_rate": 5.0395682401369045e-08, "logits/chosen": -1.0013236999511719, "logits/rejected": -1.0127506256103516, "logps/chosen": -976.4231567382812, "logps/rejected": -1167.21728515625, "loss": 0.4275, "rewards/accuracies": 0.875, "rewards/chosen": -5.2549943923950195, "rewards/margins": 1.9067177772521973, "rewards/rejected": -7.161712169647217, "step": 1070 }, { "epoch": 0.6994856722997795, "grad_norm": 24.53339942422348, "learning_rate": 5.01977115869632e-08, "logits/chosen": -0.9320468306541443, "logits/rejected": -0.7867690920829773, "logps/chosen": -929.0973510742188, "logps/rejected": -1044.9697265625, "loss": 0.3859, "rewards/accuracies": 0.875, "rewards/chosen": -4.607649803161621, "rewards/margins": 1.5433708429336548, "rewards/rejected": -6.1510210037231445, "step": 1071 }, { "epoch": 0.7001387868397421, "grad_norm": 19.657165610425114, "learning_rate": 5.000000000000002e-08, "logits/chosen": -0.7341365814208984, "logits/rejected": -0.7753069996833801, "logps/chosen": -980.4578857421875, "logps/rejected": -1232.793212890625, "loss": 0.3473, "rewards/accuracies": 0.71875, "rewards/chosen": -5.089752674102783, "rewards/margins": 1.838250756263733, "rewards/rejected": -6.928003311157227, "step": 1072 }, { "epoch": 0.7007919013797045, "grad_norm": 16.536478826191836, "learning_rate": 4.980254866959428e-08, "logits/chosen": -0.9527970552444458, "logits/rejected": -0.7555267810821533, "logps/chosen": -1103.2843017578125, "logps/rejected": -1188.142822265625, "loss": 0.3595, "rewards/accuracies": 0.8125, "rewards/chosen": -5.621752738952637, "rewards/margins": 1.7319358587265015, "rewards/rejected": -7.353688716888428, "step": 1073 }, { "epoch": 0.7014450159196669, "grad_norm": 30.635582185007305, "learning_rate": 4.960535862350604e-08, "logits/chosen": -0.8805766105651855, "logits/rejected": -0.8499529957771301, "logps/chosen": -936.8541870117188, "logps/rejected": -1091.2286376953125, "loss": 0.2982, "rewards/accuracies": 0.90625, "rewards/chosen": -4.690476417541504, "rewards/margins": 1.805678367614746, "rewards/rejected": -6.49615478515625, "step": 1074 }, { "epoch": 0.7020981304596293, "grad_norm": 22.726614424437997, "learning_rate": 4.9408430888135366e-08, "logits/chosen": -0.9252241253852844, "logits/rejected": -0.9034774303436279, "logps/chosen": -1005.1307373046875, "logps/rejected": -1098.4501953125, "loss": 0.3193, "rewards/accuracies": 0.75, "rewards/chosen": -5.690752983093262, "rewards/margins": 1.0792409181594849, "rewards/rejected": -6.769993305206299, "step": 1075 }, { "epoch": 0.7027512449995919, "grad_norm": 16.997208594932292, "learning_rate": 4.921176648851695e-08, "logits/chosen": -0.9160603880882263, "logits/rejected": -0.9556408524513245, "logps/chosen": -1001.8563232421875, "logps/rejected": -1210.08349609375, "loss": 0.3174, "rewards/accuracies": 0.875, "rewards/chosen": -5.473414421081543, "rewards/margins": 1.7691338062286377, "rewards/rejected": -7.242548942565918, "step": 1076 }, { "epoch": 0.7034043595395543, "grad_norm": 30.06553357109662, "learning_rate": 4.9015366448314776e-08, "logits/chosen": -0.9377183318138123, "logits/rejected": -0.8571749329566956, "logps/chosen": -885.6331787109375, "logps/rejected": -1006.0621337890625, "loss": 0.3731, "rewards/accuracies": 0.9375, "rewards/chosen": -4.572412967681885, "rewards/margins": 1.4350861310958862, "rewards/rejected": -6.007499694824219, "step": 1077 }, { "epoch": 0.7040574740795167, "grad_norm": 32.30787877247046, "learning_rate": 4.8819231789816804e-08, "logits/chosen": -0.9564331769943237, "logits/rejected": -0.9491601586341858, "logps/chosen": -867.03369140625, "logps/rejected": -1031.533447265625, "loss": 0.4263, "rewards/accuracies": 0.78125, "rewards/chosen": -4.6438212394714355, "rewards/margins": 1.7188313007354736, "rewards/rejected": -6.36265230178833, "step": 1078 }, { "epoch": 0.7047105886194791, "grad_norm": 33.27109224503714, "learning_rate": 4.8623363533929665e-08, "logits/chosen": -0.7700978517532349, "logits/rejected": -0.7294084429740906, "logps/chosen": -965.6128540039062, "logps/rejected": -1207.4700927734375, "loss": 0.346, "rewards/accuracies": 0.75, "rewards/chosen": -5.409863471984863, "rewards/margins": 2.164041519165039, "rewards/rejected": -7.573904991149902, "step": 1079 }, { "epoch": 0.7053637031594416, "grad_norm": 16.688865139759347, "learning_rate": 4.8427762700173315e-08, "logits/chosen": -0.7151041030883789, "logits/rejected": -0.7404736280441284, "logps/chosen": -1033.2183837890625, "logps/rejected": -1254.7646484375, "loss": 0.3362, "rewards/accuracies": 0.78125, "rewards/chosen": -5.383447170257568, "rewards/margins": 1.9840342998504639, "rewards/rejected": -7.367480754852295, "step": 1080 }, { "epoch": 0.706016817699404, "grad_norm": 29.119132374839726, "learning_rate": 4.823243030667575e-08, "logits/chosen": -0.7962474226951599, "logits/rejected": -0.7569393515586853, "logps/chosen": -971.79345703125, "logps/rejected": -1106.267333984375, "loss": 0.4776, "rewards/accuracies": 0.6875, "rewards/chosen": -4.968940258026123, "rewards/margins": 1.46136474609375, "rewards/rejected": -6.430305004119873, "step": 1081 }, { "epoch": 0.7066699322393665, "grad_norm": 33.91675103172332, "learning_rate": 4.8037367370167734e-08, "logits/chosen": -0.9695914387702942, "logits/rejected": -0.8926000595092773, "logps/chosen": -945.17138671875, "logps/rejected": -1108.794677734375, "loss": 0.4413, "rewards/accuracies": 0.875, "rewards/chosen": -4.946935176849365, "rewards/margins": 1.6695194244384766, "rewards/rejected": -6.616455078125, "step": 1082 }, { "epoch": 0.7073230467793289, "grad_norm": 27.226190897262036, "learning_rate": 4.784257490597735e-08, "logits/chosen": -0.6901608109474182, "logits/rejected": -0.7227190732955933, "logps/chosen": -892.798583984375, "logps/rejected": -1206.8834228515625, "loss": 0.3787, "rewards/accuracies": 0.84375, "rewards/chosen": -4.823614120483398, "rewards/margins": 2.6140191555023193, "rewards/rejected": -7.4376325607299805, "step": 1083 }, { "epoch": 0.7079761613192914, "grad_norm": 21.147890499910353, "learning_rate": 4.7648053928024965e-08, "logits/chosen": -0.8689476251602173, "logits/rejected": -0.813915491104126, "logps/chosen": -930.1356201171875, "logps/rejected": -1091.7408447265625, "loss": 0.3937, "rewards/accuracies": 0.8125, "rewards/chosen": -4.987719535827637, "rewards/margins": 1.701695442199707, "rewards/rejected": -6.689414978027344, "step": 1084 }, { "epoch": 0.7086292758592538, "grad_norm": 29.385487598664348, "learning_rate": 4.745380544881779e-08, "logits/chosen": -0.8477723598480225, "logits/rejected": -0.7657088041305542, "logps/chosen": -959.1470947265625, "logps/rejected": -1071.5858154296875, "loss": 0.3979, "rewards/accuracies": 0.71875, "rewards/chosen": -5.0760931968688965, "rewards/margins": 1.1108086109161377, "rewards/rejected": -6.186902046203613, "step": 1085 }, { "epoch": 0.7092823903992163, "grad_norm": 41.50920009598862, "learning_rate": 4.725983047944461e-08, "logits/chosen": -0.8235193490982056, "logits/rejected": -0.8154042959213257, "logps/chosen": -1094.9686279296875, "logps/rejected": -1276.4725341796875, "loss": 0.5138, "rewards/accuracies": 0.875, "rewards/chosen": -5.995870590209961, "rewards/margins": 1.772527813911438, "rewards/rejected": -7.768398284912109, "step": 1086 }, { "epoch": 0.7099355049391787, "grad_norm": 22.16450576801752, "learning_rate": 4.7066130029570596e-08, "logits/chosen": -0.7971264123916626, "logits/rejected": -0.7773457765579224, "logps/chosen": -930.9798583984375, "logps/rejected": -1154.8284912109375, "loss": 0.353, "rewards/accuracies": 0.8125, "rewards/chosen": -5.19879674911499, "rewards/margins": 1.7664600610733032, "rewards/rejected": -6.965257167816162, "step": 1087 }, { "epoch": 0.7105886194791412, "grad_norm": 20.94228934785079, "learning_rate": 4.6872705107431995e-08, "logits/chosen": -0.9495987296104431, "logits/rejected": -1.042331576347351, "logps/chosen": -1047.2802734375, "logps/rejected": -1344.6365966796875, "loss": 0.3787, "rewards/accuracies": 0.8125, "rewards/chosen": -5.7100958824157715, "rewards/margins": 2.1285243034362793, "rewards/rejected": -7.838620185852051, "step": 1088 }, { "epoch": 0.7112417340191036, "grad_norm": 23.544647901426156, "learning_rate": 4.6679556719830895e-08, "logits/chosen": -0.8243415355682373, "logits/rejected": -0.9112780690193176, "logps/chosen": -1005.364501953125, "logps/rejected": -1222.390869140625, "loss": 0.3271, "rewards/accuracies": 0.875, "rewards/chosen": -4.869158744812012, "rewards/margins": 1.9440407752990723, "rewards/rejected": -6.813199996948242, "step": 1089 }, { "epoch": 0.711894848559066, "grad_norm": 28.57617302768549, "learning_rate": 4.648668587212997e-08, "logits/chosen": -0.873595118522644, "logits/rejected": -0.9013544917106628, "logps/chosen": -987.3074340820312, "logps/rejected": -1221.4285888671875, "loss": 0.364, "rewards/accuracies": 0.71875, "rewards/chosen": -5.284542083740234, "rewards/margins": 2.2790379524230957, "rewards/rejected": -7.56358003616333, "step": 1090 }, { "epoch": 0.7125479630990285, "grad_norm": 17.85569175076103, "learning_rate": 4.6294093568247297e-08, "logits/chosen": -0.7971256971359253, "logits/rejected": -0.6644490361213684, "logps/chosen": -1045.35986328125, "logps/rejected": -1205.6650390625, "loss": 0.3657, "rewards/accuracies": 0.875, "rewards/chosen": -5.020387172698975, "rewards/margins": 2.104153871536255, "rewards/rejected": -7.124541282653809, "step": 1091 }, { "epoch": 0.713201077638991, "grad_norm": 36.66717614889544, "learning_rate": 4.6101780810651057e-08, "logits/chosen": -0.9672690033912659, "logits/rejected": -0.8716130256652832, "logps/chosen": -986.6090087890625, "logps/rejected": -1103.75, "loss": 0.4264, "rewards/accuracies": 0.71875, "rewards/chosen": -5.203864097595215, "rewards/margins": 1.3343470096588135, "rewards/rejected": -6.538211345672607, "step": 1092 }, { "epoch": 0.7138541921789534, "grad_norm": 18.874466250073453, "learning_rate": 4.590974860035439e-08, "logits/chosen": -0.9630517959594727, "logits/rejected": -1.0401439666748047, "logps/chosen": -955.3115844726562, "logps/rejected": -1162.4873046875, "loss": 0.3307, "rewards/accuracies": 0.84375, "rewards/chosen": -4.960440635681152, "rewards/margins": 1.801946759223938, "rewards/rejected": -6.762386798858643, "step": 1093 }, { "epoch": 0.7145073067189158, "grad_norm": 17.887843223290304, "learning_rate": 4.571799793691013e-08, "logits/chosen": -0.7270591259002686, "logits/rejected": -0.7815565466880798, "logps/chosen": -855.4141235351562, "logps/rejected": -1117.685791015625, "loss": 0.404, "rewards/accuracies": 0.84375, "rewards/chosen": -4.313096523284912, "rewards/margins": 2.240790605545044, "rewards/rejected": -6.553886890411377, "step": 1094 }, { "epoch": 0.7151604212588782, "grad_norm": 42.00786174992929, "learning_rate": 4.5526529818405636e-08, "logits/chosen": -0.8590465784072876, "logits/rejected": -0.8318109512329102, "logps/chosen": -1035.328369140625, "logps/rejected": -1167.46728515625, "loss": 0.4439, "rewards/accuracies": 0.78125, "rewards/chosen": -5.72844934463501, "rewards/margins": 1.2250757217407227, "rewards/rejected": -6.953525543212891, "step": 1095 }, { "epoch": 0.7158135357988408, "grad_norm": 23.651213647631792, "learning_rate": 4.533534524145756e-08, "logits/chosen": -0.9591556787490845, "logits/rejected": -0.7288451194763184, "logps/chosen": -1023.7062377929688, "logps/rejected": -1212.59521484375, "loss": 0.3722, "rewards/accuracies": 0.90625, "rewards/chosen": -5.4393205642700195, "rewards/margins": 2.401437520980835, "rewards/rejected": -7.840758323669434, "step": 1096 }, { "epoch": 0.7164666503388032, "grad_norm": 21.052436894226904, "learning_rate": 4.514444520120669e-08, "logits/chosen": -0.7670482397079468, "logits/rejected": -0.7710490226745605, "logps/chosen": -998.1491088867188, "logps/rejected": -1230.4339599609375, "loss": 0.3542, "rewards/accuracies": 0.875, "rewards/chosen": -5.109296798706055, "rewards/margins": 2.5655345916748047, "rewards/rejected": -7.674831390380859, "step": 1097 }, { "epoch": 0.7171197648787656, "grad_norm": 19.91030514123549, "learning_rate": 4.495383069131281e-08, "logits/chosen": -0.7524601221084595, "logits/rejected": -0.7881897687911987, "logps/chosen": -927.6683349609375, "logps/rejected": -1151.753662109375, "loss": 0.3819, "rewards/accuracies": 0.84375, "rewards/chosen": -4.98646354675293, "rewards/margins": 2.0308237075805664, "rewards/rejected": -7.01728630065918, "step": 1098 }, { "epoch": 0.717772879418728, "grad_norm": 29.438295455629994, "learning_rate": 4.476350270394942e-08, "logits/chosen": -0.8855903148651123, "logits/rejected": -0.8176090121269226, "logps/chosen": -991.46923828125, "logps/rejected": -1106.505859375, "loss": 0.4597, "rewards/accuracies": 0.75, "rewards/chosen": -5.334303379058838, "rewards/margins": 1.3546168804168701, "rewards/rejected": -6.688920021057129, "step": 1099 }, { "epoch": 0.7184259939586906, "grad_norm": 21.8407247391087, "learning_rate": 4.457346222979864e-08, "logits/chosen": -1.0324180126190186, "logits/rejected": -1.028664231300354, "logps/chosen": -1082.05615234375, "logps/rejected": -1251.3363037109375, "loss": 0.3205, "rewards/accuracies": 0.875, "rewards/chosen": -5.213923454284668, "rewards/margins": 1.9834064245224, "rewards/rejected": -7.197330474853516, "step": 1100 }, { "epoch": 0.7184259939586906, "eval_logits/chosen": -0.6451797485351562, "eval_logits/rejected": -0.5846331715583801, "eval_logps/chosen": -992.7451782226562, "eval_logps/rejected": -1153.4371337890625, "eval_loss": 0.39495497941970825, "eval_rewards/accuracies": 0.8009999990463257, "eval_rewards/chosen": -5.188379764556885, "eval_rewards/margins": 1.6943055391311646, "eval_rewards/rejected": -6.882685661315918, "eval_runtime": 620.3885, "eval_samples_per_second": 6.448, "eval_steps_per_second": 0.403, "step": 1100 }, { "epoch": 0.719079108498653, "grad_norm": 32.440008459162485, "learning_rate": 4.4383710258046095e-08, "logits/chosen": -0.8906784057617188, "logits/rejected": -0.8440839052200317, "logps/chosen": -1033.5921630859375, "logps/rejected": -1192.6195068359375, "loss": 0.3947, "rewards/accuracies": 0.84375, "rewards/chosen": -5.000400066375732, "rewards/margins": 2.114108085632324, "rewards/rejected": -7.114508152008057, "step": 1101 }, { "epoch": 0.7197322230386154, "grad_norm": 30.116923265962466, "learning_rate": 4.419424777637565e-08, "logits/chosen": -0.9643840193748474, "logits/rejected": -0.8965681791305542, "logps/chosen": -887.838134765625, "logps/rejected": -980.8176879882812, "loss": 0.4032, "rewards/accuracies": 0.8125, "rewards/chosen": -4.65021276473999, "rewards/margins": 1.3411513566970825, "rewards/rejected": -5.991364002227783, "step": 1102 }, { "epoch": 0.7203853375785778, "grad_norm": 23.039154118625174, "learning_rate": 4.4005075770964396e-08, "logits/chosen": -1.019102931022644, "logits/rejected": -0.9605964422225952, "logps/chosen": -1037.569091796875, "logps/rejected": -1107.9766845703125, "loss": 0.4649, "rewards/accuracies": 0.75, "rewards/chosen": -5.0233683586120605, "rewards/margins": 1.0656507015228271, "rewards/rejected": -6.089019298553467, "step": 1103 }, { "epoch": 0.7210384521185403, "grad_norm": 52.674944572170844, "learning_rate": 4.3816195226477425e-08, "logits/chosen": -0.8158155679702759, "logits/rejected": -0.7879456877708435, "logps/chosen": -913.3277587890625, "logps/rejected": -1127.154296875, "loss": 0.3387, "rewards/accuracies": 0.875, "rewards/chosen": -4.6683149337768555, "rewards/margins": 2.0909457206726074, "rewards/rejected": -6.759261608123779, "step": 1104 }, { "epoch": 0.7216915666585028, "grad_norm": 31.95750334380501, "learning_rate": 4.362760712606277e-08, "logits/chosen": -0.7597179412841797, "logits/rejected": -0.7276663184165955, "logps/chosen": -917.2421875, "logps/rejected": -1164.9512939453125, "loss": 0.4093, "rewards/accuracies": 0.8125, "rewards/chosen": -4.858676433563232, "rewards/margins": 2.1077828407287598, "rewards/rejected": -6.96645975112915, "step": 1105 }, { "epoch": 0.7223446811984652, "grad_norm": 18.80716307800814, "learning_rate": 4.3439312451346154e-08, "logits/chosen": -0.777508556842804, "logits/rejected": -0.7970513105392456, "logps/chosen": -883.0430297851562, "logps/rejected": -1154.78076171875, "loss": 0.4108, "rewards/accuracies": 0.8125, "rewards/chosen": -4.606140613555908, "rewards/margins": 2.7335846424102783, "rewards/rejected": -7.339725017547607, "step": 1106 }, { "epoch": 0.7229977957384276, "grad_norm": 18.132490684953048, "learning_rate": 4.32513121824261e-08, "logits/chosen": -0.8307619094848633, "logits/rejected": -0.7737395763397217, "logps/chosen": -877.05126953125, "logps/rejected": -935.8836669921875, "loss": 0.4277, "rewards/accuracies": 0.78125, "rewards/chosen": -4.600595474243164, "rewards/margins": 0.8932433128356934, "rewards/rejected": -5.493838787078857, "step": 1107 }, { "epoch": 0.7236509102783901, "grad_norm": 20.319116079697935, "learning_rate": 4.306360729786866e-08, "logits/chosen": -0.8983240127563477, "logits/rejected": -0.7303913235664368, "logps/chosen": -999.090576171875, "logps/rejected": -1133.9527587890625, "loss": 0.4685, "rewards/accuracies": 0.75, "rewards/chosen": -5.099803924560547, "rewards/margins": 1.5410734415054321, "rewards/rejected": -6.640877723693848, "step": 1108 }, { "epoch": 0.7243040248183525, "grad_norm": 17.22827875356533, "learning_rate": 4.287619877470238e-08, "logits/chosen": -0.6570420861244202, "logits/rejected": -0.6857782006263733, "logps/chosen": -927.8821411132812, "logps/rejected": -1173.153564453125, "loss": 0.3749, "rewards/accuracies": 0.9375, "rewards/chosen": -4.6236114501953125, "rewards/margins": 2.370687484741211, "rewards/rejected": -6.994299411773682, "step": 1109 }, { "epoch": 0.724957139358315, "grad_norm": 24.744654710477878, "learning_rate": 4.268908758841317e-08, "logits/chosen": -0.769822359085083, "logits/rejected": -0.7670592069625854, "logps/chosen": -850.655517578125, "logps/rejected": -1075.8778076171875, "loss": 0.3146, "rewards/accuracies": 0.8125, "rewards/chosen": -4.2061848640441895, "rewards/margins": 1.7134172916412354, "rewards/rejected": -5.919601917266846, "step": 1110 }, { "epoch": 0.7256102538982774, "grad_norm": 24.463939391961844, "learning_rate": 4.250227471293935e-08, "logits/chosen": -0.7590472102165222, "logits/rejected": -0.6803931593894958, "logps/chosen": -939.5840454101562, "logps/rejected": -1133.0911865234375, "loss": 0.3605, "rewards/accuracies": 0.8125, "rewards/chosen": -5.415617942810059, "rewards/margins": 1.7700562477111816, "rewards/rejected": -7.185673713684082, "step": 1111 }, { "epoch": 0.7262633684382399, "grad_norm": 35.63302248021802, "learning_rate": 4.2315761120666394e-08, "logits/chosen": -0.8123034238815308, "logits/rejected": -0.6775568723678589, "logps/chosen": -924.438232421875, "logps/rejected": -1116.736328125, "loss": 0.4079, "rewards/accuracies": 0.90625, "rewards/chosen": -4.730605125427246, "rewards/margins": 2.201202869415283, "rewards/rejected": -6.931807994842529, "step": 1112 }, { "epoch": 0.7269164829782023, "grad_norm": 21.952209080683218, "learning_rate": 4.212954778242203e-08, "logits/chosen": -0.9220980405807495, "logits/rejected": -0.8390252590179443, "logps/chosen": -993.52197265625, "logps/rejected": -1138.182861328125, "loss": 0.4338, "rewards/accuracies": 0.90625, "rewards/chosen": -5.041605472564697, "rewards/margins": 1.724038004875183, "rewards/rejected": -6.76564359664917, "step": 1113 }, { "epoch": 0.7275695975181647, "grad_norm": 46.56626196193875, "learning_rate": 4.194363566747109e-08, "logits/chosen": -0.7909491062164307, "logits/rejected": -0.7935473322868347, "logps/chosen": -889.592041015625, "logps/rejected": -1148.26025390625, "loss": 0.4108, "rewards/accuracies": 0.84375, "rewards/chosen": -4.259602069854736, "rewards/margins": 2.401700496673584, "rewards/rejected": -6.6613030433654785, "step": 1114 }, { "epoch": 0.7282227120581272, "grad_norm": 23.199530145531195, "learning_rate": 4.175802574351052e-08, "logits/chosen": -1.1460967063903809, "logits/rejected": -1.1267650127410889, "logps/chosen": -1016.3873291015625, "logps/rejected": -1232.958984375, "loss": 0.318, "rewards/accuracies": 0.84375, "rewards/chosen": -5.17551326751709, "rewards/margins": 1.9491366147994995, "rewards/rejected": -7.124650001525879, "step": 1115 }, { "epoch": 0.7288758265980897, "grad_norm": 24.805300670034743, "learning_rate": 4.1572718976664366e-08, "logits/chosen": -0.7371432781219482, "logits/rejected": -0.7844556570053101, "logps/chosen": -948.8936767578125, "logps/rejected": -1193.291259765625, "loss": 0.3402, "rewards/accuracies": 0.9375, "rewards/chosen": -4.742420673370361, "rewards/margins": 2.136198043823242, "rewards/rejected": -6.8786187171936035, "step": 1116 }, { "epoch": 0.7295289411380521, "grad_norm": 36.62281859138192, "learning_rate": 4.1387716331478564e-08, "logits/chosen": -0.7828527688980103, "logits/rejected": -0.7460812330245972, "logps/chosen": -1003.2020874023438, "logps/rejected": -1216.4989013671875, "loss": 0.3146, "rewards/accuracies": 0.78125, "rewards/chosen": -5.179636001586914, "rewards/margins": 1.6944818496704102, "rewards/rejected": -6.874117851257324, "step": 1117 }, { "epoch": 0.7301820556780145, "grad_norm": 26.940456429092567, "learning_rate": 4.1203018770916185e-08, "logits/chosen": -0.7151032090187073, "logits/rejected": -0.7399024963378906, "logps/chosen": -955.585205078125, "logps/rejected": -1188.607421875, "loss": 0.4193, "rewards/accuracies": 0.78125, "rewards/chosen": -4.988138675689697, "rewards/margins": 1.6223087310791016, "rewards/rejected": -6.610446929931641, "step": 1118 }, { "epoch": 0.730835170217977, "grad_norm": 19.280842272039692, "learning_rate": 4.101862725635227e-08, "logits/chosen": -0.9849437475204468, "logits/rejected": -0.8866140842437744, "logps/chosen": -987.6910400390625, "logps/rejected": -1077.995361328125, "loss": 0.3846, "rewards/accuracies": 0.84375, "rewards/chosen": -4.67930793762207, "rewards/margins": 1.2189520597457886, "rewards/rejected": -5.898260116577148, "step": 1119 }, { "epoch": 0.7314882847579395, "grad_norm": 18.069875903287098, "learning_rate": 4.08345427475688e-08, "logits/chosen": -0.8553774356842041, "logits/rejected": -0.7431191802024841, "logps/chosen": -967.6986083984375, "logps/rejected": -1162.87060546875, "loss": 0.371, "rewards/accuracies": 0.78125, "rewards/chosen": -5.205309867858887, "rewards/margins": 2.321453332901001, "rewards/rejected": -7.52676248550415, "step": 1120 }, { "epoch": 0.7321413992979019, "grad_norm": 22.63376434941059, "learning_rate": 4.065076620274983e-08, "logits/chosen": -0.9161591529846191, "logits/rejected": -0.9195849299430847, "logps/chosen": -920.815673828125, "logps/rejected": -1047.88623046875, "loss": 0.4609, "rewards/accuracies": 0.78125, "rewards/chosen": -4.56045389175415, "rewards/margins": 1.149539589881897, "rewards/rejected": -5.709993839263916, "step": 1121 }, { "epoch": 0.7327945138378643, "grad_norm": 20.45817826800368, "learning_rate": 4.046729857847634e-08, "logits/chosen": -0.8821409344673157, "logits/rejected": -0.8005276918411255, "logps/chosen": -881.125244140625, "logps/rejected": -1102.503662109375, "loss": 0.4102, "rewards/accuracies": 0.9375, "rewards/chosen": -3.9783294200897217, "rewards/margins": 2.2585127353668213, "rewards/rejected": -6.236842155456543, "step": 1122 }, { "epoch": 0.7334476283778267, "grad_norm": 19.51675556765635, "learning_rate": 4.0284140829721404e-08, "logits/chosen": -0.7991642951965332, "logits/rejected": -0.8100806474685669, "logps/chosen": -942.4847412109375, "logps/rejected": -1282.675048828125, "loss": 0.3271, "rewards/accuracies": 0.9375, "rewards/chosen": -5.026712417602539, "rewards/margins": 2.775904417037964, "rewards/rejected": -7.802616596221924, "step": 1123 }, { "epoch": 0.7341007429177893, "grad_norm": 24.3902316105061, "learning_rate": 4.01012939098451e-08, "logits/chosen": -0.893683910369873, "logits/rejected": -0.7639338970184326, "logps/chosen": -966.1260986328125, "logps/rejected": -1116.3106689453125, "loss": 0.4255, "rewards/accuracies": 0.75, "rewards/chosen": -4.668510913848877, "rewards/margins": 1.699383020401001, "rewards/rejected": -6.367894172668457, "step": 1124 }, { "epoch": 0.7347538574577517, "grad_norm": 20.875773170479594, "learning_rate": 3.9918758770589644e-08, "logits/chosen": -1.0481836795806885, "logits/rejected": -0.9271065592765808, "logps/chosen": -958.1807861328125, "logps/rejected": -1059.3460693359375, "loss": 0.3817, "rewards/accuracies": 0.78125, "rewards/chosen": -5.318079948425293, "rewards/margins": 1.2409089803695679, "rewards/rejected": -6.55898904800415, "step": 1125 }, { "epoch": 0.7354069719977141, "grad_norm": 18.73461818358389, "learning_rate": 3.973653636207437e-08, "logits/chosen": -0.7000550031661987, "logits/rejected": -0.7210078239440918, "logps/chosen": -1033.514404296875, "logps/rejected": -1344.989501953125, "loss": 0.36, "rewards/accuracies": 0.875, "rewards/chosen": -5.706725597381592, "rewards/margins": 2.880382537841797, "rewards/rejected": -8.58710765838623, "step": 1126 }, { "epoch": 0.7360600865376765, "grad_norm": 58.93678887441067, "learning_rate": 3.9554627632790815e-08, "logits/chosen": -0.9635611772537231, "logits/rejected": -0.7603409886360168, "logps/chosen": -1095.10205078125, "logps/rejected": -1271.283203125, "loss": 0.4252, "rewards/accuracies": 0.84375, "rewards/chosen": -5.461233615875244, "rewards/margins": 2.415156602859497, "rewards/rejected": -7.87639045715332, "step": 1127 }, { "epoch": 0.736713201077639, "grad_norm": 26.894919410737224, "learning_rate": 3.937303352959777e-08, "logits/chosen": -0.8913260698318481, "logits/rejected": -0.773656964302063, "logps/chosen": -929.6309814453125, "logps/rejected": -1127.9503173828125, "loss": 0.322, "rewards/accuracies": 0.78125, "rewards/chosen": -4.980562210083008, "rewards/margins": 2.162158727645874, "rewards/rejected": -7.142721176147461, "step": 1128 }, { "epoch": 0.7373663156176015, "grad_norm": 23.622760206791476, "learning_rate": 3.919175499771634e-08, "logits/chosen": -0.9496240615844727, "logits/rejected": -0.8086094260215759, "logps/chosen": -1028.3040771484375, "logps/rejected": -1120.0272216796875, "loss": 0.4898, "rewards/accuracies": 0.78125, "rewards/chosen": -5.441816329956055, "rewards/margins": 1.3477586507797241, "rewards/rejected": -6.789575099945068, "step": 1129 }, { "epoch": 0.7380194301575639, "grad_norm": 16.57012151608491, "learning_rate": 3.901079298072509e-08, "logits/chosen": -0.8438161611557007, "logits/rejected": -0.7847708463668823, "logps/chosen": -813.0806884765625, "logps/rejected": -923.8001098632812, "loss": 0.3786, "rewards/accuracies": 0.71875, "rewards/chosen": -4.117031574249268, "rewards/margins": 1.3204278945922852, "rewards/rejected": -5.4374589920043945, "step": 1130 }, { "epoch": 0.7386725446975263, "grad_norm": 14.471906831289356, "learning_rate": 3.883014842055504e-08, "logits/chosen": -0.8245331645011902, "logits/rejected": -0.870025098323822, "logps/chosen": -939.5264892578125, "logps/rejected": -1070.2132568359375, "loss": 0.3324, "rewards/accuracies": 0.84375, "rewards/chosen": -4.8601274490356445, "rewards/margins": 1.342176079750061, "rewards/rejected": -6.202303886413574, "step": 1131 }, { "epoch": 0.7393256592374887, "grad_norm": 35.53448905203971, "learning_rate": 3.864982225748481e-08, "logits/chosen": -0.8064248561859131, "logits/rejected": -0.7723050117492676, "logps/chosen": -897.3822021484375, "logps/rejected": -931.1773681640625, "loss": 0.4193, "rewards/accuracies": 0.75, "rewards/chosen": -4.686911582946777, "rewards/margins": 0.5902104377746582, "rewards/rejected": -5.277121067047119, "step": 1132 }, { "epoch": 0.7399787737774512, "grad_norm": 21.217440902300723, "learning_rate": 3.8469815430135735e-08, "logits/chosen": -1.0799496173858643, "logits/rejected": -1.0132458209991455, "logps/chosen": -1016.651611328125, "logps/rejected": -1278.3968505859375, "loss": 0.3817, "rewards/accuracies": 0.90625, "rewards/chosen": -5.062169551849365, "rewards/margins": 2.540085792541504, "rewards/rejected": -7.602255344390869, "step": 1133 }, { "epoch": 0.7406318883174137, "grad_norm": 21.67079481182841, "learning_rate": 3.8290128875466945e-08, "logits/chosen": -1.034336805343628, "logits/rejected": -0.8814944624900818, "logps/chosen": -887.02001953125, "logps/rejected": -1028.741455078125, "loss": 0.3921, "rewards/accuracies": 0.84375, "rewards/chosen": -4.301558494567871, "rewards/margins": 1.7627674341201782, "rewards/rejected": -6.064326286315918, "step": 1134 }, { "epoch": 0.7412850028573761, "grad_norm": 21.900427406794464, "learning_rate": 3.811076352877054e-08, "logits/chosen": -0.8100993633270264, "logits/rejected": -0.810735821723938, "logps/chosen": -956.65869140625, "logps/rejected": -1134.55078125, "loss": 0.4055, "rewards/accuracies": 0.75, "rewards/chosen": -5.0694427490234375, "rewards/margins": 1.6674208641052246, "rewards/rejected": -6.736863613128662, "step": 1135 }, { "epoch": 0.7419381173973385, "grad_norm": 37.11345498421912, "learning_rate": 3.793172032366667e-08, "logits/chosen": -0.8167673945426941, "logits/rejected": -0.8620283007621765, "logps/chosen": -1008.05078125, "logps/rejected": -1179.90185546875, "loss": 0.358, "rewards/accuracies": 0.84375, "rewards/chosen": -5.111788749694824, "rewards/margins": 1.8442003726959229, "rewards/rejected": -6.955988883972168, "step": 1136 }, { "epoch": 0.742591231937301, "grad_norm": 16.238948093293164, "learning_rate": 3.7753000192098695e-08, "logits/chosen": -1.0395476818084717, "logits/rejected": -1.0163346529006958, "logps/chosen": -1038.7113037109375, "logps/rejected": -1218.2423095703125, "loss": 0.3834, "rewards/accuracies": 0.78125, "rewards/chosen": -5.375159740447998, "rewards/margins": 1.8684576749801636, "rewards/rejected": -7.243618011474609, "step": 1137 }, { "epoch": 0.7432443464772635, "grad_norm": 23.547812190116634, "learning_rate": 3.757460406432833e-08, "logits/chosen": -0.7605608701705933, "logits/rejected": -0.7161009311676025, "logps/chosen": -990.2849731445312, "logps/rejected": -1235.51025390625, "loss": 0.4008, "rewards/accuracies": 0.8125, "rewards/chosen": -5.188079833984375, "rewards/margins": 2.2514495849609375, "rewards/rejected": -7.439528942108154, "step": 1138 }, { "epoch": 0.7438974610172259, "grad_norm": 18.82174611804124, "learning_rate": 3.739653286893088e-08, "logits/chosen": -0.9870573878288269, "logits/rejected": -0.7753057479858398, "logps/chosen": -1092.725830078125, "logps/rejected": -1257.0374755859375, "loss": 0.3258, "rewards/accuracies": 0.875, "rewards/chosen": -5.7637128829956055, "rewards/margins": 2.28377103805542, "rewards/rejected": -8.047484397888184, "step": 1139 }, { "epoch": 0.7445505755571883, "grad_norm": 23.869228374915046, "learning_rate": 3.721878753279016e-08, "logits/chosen": -0.8627902865409851, "logits/rejected": -0.8585209250450134, "logps/chosen": -1046.7232666015625, "logps/rejected": -1276.690185546875, "loss": 0.4385, "rewards/accuracies": 0.75, "rewards/chosen": -5.6731462478637695, "rewards/margins": 1.737582802772522, "rewards/rejected": -7.410728931427002, "step": 1140 }, { "epoch": 0.7452036900971508, "grad_norm": 25.028791146036063, "learning_rate": 3.704136898109402e-08, "logits/chosen": -0.7637258768081665, "logits/rejected": -0.8027169704437256, "logps/chosen": -939.3970947265625, "logps/rejected": -1249.8870849609375, "loss": 0.2932, "rewards/accuracies": 0.96875, "rewards/chosen": -5.005740165710449, "rewards/margins": 2.5859646797180176, "rewards/rejected": -7.591704845428467, "step": 1141 }, { "epoch": 0.7458568046371132, "grad_norm": 23.699634900623874, "learning_rate": 3.686427813732929e-08, "logits/chosen": -0.8129778504371643, "logits/rejected": -0.790728747844696, "logps/chosen": -1058.7100830078125, "logps/rejected": -1323.9364013671875, "loss": 0.407, "rewards/accuracies": 0.78125, "rewards/chosen": -5.982147216796875, "rewards/margins": 1.7800394296646118, "rewards/rejected": -7.7621870040893555, "step": 1142 }, { "epoch": 0.7465099191770757, "grad_norm": 21.921561179095427, "learning_rate": 3.6687515923277015e-08, "logits/chosen": -0.815049409866333, "logits/rejected": -0.7605541348457336, "logps/chosen": -1098.3214111328125, "logps/rejected": -1316.96728515625, "loss": 0.4258, "rewards/accuracies": 0.84375, "rewards/chosen": -6.022390842437744, "rewards/margins": 2.642651081085205, "rewards/rejected": -8.66504192352295, "step": 1143 }, { "epoch": 0.7471630337170381, "grad_norm": 24.36327520442801, "learning_rate": 3.6511083259007725e-08, "logits/chosen": -0.6395676136016846, "logits/rejected": -0.6162480711936951, "logps/chosen": -950.2569580078125, "logps/rejected": -1111.405517578125, "loss": 0.3874, "rewards/accuracies": 0.875, "rewards/chosen": -5.066840648651123, "rewards/margins": 1.5750312805175781, "rewards/rejected": -6.641872406005859, "step": 1144 }, { "epoch": 0.7478161482570006, "grad_norm": 41.172318989826486, "learning_rate": 3.633498106287657e-08, "logits/chosen": -0.9550276398658752, "logits/rejected": -0.9688808917999268, "logps/chosen": -955.484130859375, "logps/rejected": -1154.53466796875, "loss": 0.4169, "rewards/accuracies": 0.75, "rewards/chosen": -4.7296648025512695, "rewards/margins": 1.6894323825836182, "rewards/rejected": -6.419096946716309, "step": 1145 }, { "epoch": 0.748469262796963, "grad_norm": 22.01106598608996, "learning_rate": 3.6159210251518566e-08, "logits/chosen": -0.9625080823898315, "logits/rejected": -0.9198300242424011, "logps/chosen": -958.3787231445312, "logps/rejected": -1220.9547119140625, "loss": 0.3372, "rewards/accuracies": 0.84375, "rewards/chosen": -5.220369338989258, "rewards/margins": 2.5000252723693848, "rewards/rejected": -7.720395088195801, "step": 1146 }, { "epoch": 0.7491223773369254, "grad_norm": 15.911924912335554, "learning_rate": 3.598377173984385e-08, "logits/chosen": -0.8641433715820312, "logits/rejected": -0.7814199328422546, "logps/chosen": -1110.24462890625, "logps/rejected": -1363.054931640625, "loss": 0.3433, "rewards/accuracies": 0.875, "rewards/chosen": -5.966456413269043, "rewards/margins": 2.4509754180908203, "rewards/rejected": -8.417431831359863, "step": 1147 }, { "epoch": 0.7497754918768879, "grad_norm": 19.175118613511536, "learning_rate": 3.5808666441032876e-08, "logits/chosen": -0.9386720657348633, "logits/rejected": -0.9401878118515015, "logps/chosen": -953.240478515625, "logps/rejected": -1146.1922607421875, "loss": 0.3883, "rewards/accuracies": 0.90625, "rewards/chosen": -5.1068878173828125, "rewards/margins": 1.779557704925537, "rewards/rejected": -6.886445045471191, "step": 1148 }, { "epoch": 0.7504286064168504, "grad_norm": 25.57354603222973, "learning_rate": 3.56338952665317e-08, "logits/chosen": -0.839618444442749, "logits/rejected": -0.8448423147201538, "logps/chosen": -1056.17724609375, "logps/rejected": -1308.0330810546875, "loss": 0.3417, "rewards/accuracies": 0.875, "rewards/chosen": -6.239077568054199, "rewards/margins": 2.1591358184814453, "rewards/rejected": -8.398212432861328, "step": 1149 }, { "epoch": 0.7510817209568128, "grad_norm": 27.81708132565712, "learning_rate": 3.545945912604722e-08, "logits/chosen": -0.9696516394615173, "logits/rejected": -0.7934356927871704, "logps/chosen": -958.2764892578125, "logps/rejected": -1086.7510986328125, "loss": 0.3957, "rewards/accuracies": 0.71875, "rewards/chosen": -4.882455348968506, "rewards/margins": 1.7731363773345947, "rewards/rejected": -6.65559196472168, "step": 1150 }, { "epoch": 0.7517348354967752, "grad_norm": 29.929763150529183, "learning_rate": 3.5285358927542386e-08, "logits/chosen": -0.9754225015640259, "logits/rejected": -0.9199377298355103, "logps/chosen": -1044.01708984375, "logps/rejected": -1216.123291015625, "loss": 0.4275, "rewards/accuracies": 0.875, "rewards/chosen": -5.64408540725708, "rewards/margins": 1.760486125946045, "rewards/rejected": -7.404570579528809, "step": 1151 }, { "epoch": 0.7523879500367376, "grad_norm": 18.403330467837083, "learning_rate": 3.511159557723157e-08, "logits/chosen": -0.7564758062362671, "logits/rejected": -0.7638527154922485, "logps/chosen": -967.927978515625, "logps/rejected": -1212.9923095703125, "loss": 0.3566, "rewards/accuracies": 0.9375, "rewards/chosen": -4.75152587890625, "rewards/margins": 2.356855630874634, "rewards/rejected": -7.108381271362305, "step": 1152 }, { "epoch": 0.7530410645767002, "grad_norm": 27.316348794289706, "learning_rate": 3.493816997957582e-08, "logits/chosen": -0.8866490125656128, "logits/rejected": -0.8477368950843811, "logps/chosen": -969.8577270507812, "logps/rejected": -1238.609130859375, "loss": 0.4151, "rewards/accuracies": 0.90625, "rewards/chosen": -5.076401710510254, "rewards/margins": 2.3330423831939697, "rewards/rejected": -7.409444808959961, "step": 1153 }, { "epoch": 0.7536941791166626, "grad_norm": 24.26926248538676, "learning_rate": 3.476508303727809e-08, "logits/chosen": -0.9898264408111572, "logits/rejected": -0.823529839515686, "logps/chosen": -897.8689575195312, "logps/rejected": -1061.2685546875, "loss": 0.4123, "rewards/accuracies": 0.875, "rewards/chosen": -4.637533664703369, "rewards/margins": 1.7195688486099243, "rewards/rejected": -6.357102870941162, "step": 1154 }, { "epoch": 0.754347293656625, "grad_norm": 23.535978515943466, "learning_rate": 3.459233565127865e-08, "logits/chosen": -0.9022988080978394, "logits/rejected": -0.7629703283309937, "logps/chosen": -1029.1558837890625, "logps/rejected": -1222.035400390625, "loss": 0.4044, "rewards/accuracies": 0.8125, "rewards/chosen": -5.495359420776367, "rewards/margins": 1.9249953031539917, "rewards/rejected": -7.420354843139648, "step": 1155 }, { "epoch": 0.7550004081965874, "grad_norm": 22.38619660861659, "learning_rate": 3.441992872075027e-08, "logits/chosen": -0.8099140524864197, "logits/rejected": -0.9074943661689758, "logps/chosen": -926.499755859375, "logps/rejected": -1042.8348388671875, "loss": 0.3997, "rewards/accuracies": 0.8125, "rewards/chosen": -4.323889255523682, "rewards/margins": 1.126346230506897, "rewards/rejected": -5.450235366821289, "step": 1156 }, { "epoch": 0.75565352273655, "grad_norm": 37.5031440678778, "learning_rate": 3.4247863143093646e-08, "logits/chosen": -0.7545847296714783, "logits/rejected": -0.9003801345825195, "logps/chosen": -1049.3184814453125, "logps/rejected": -1330.5516357421875, "loss": 0.4131, "rewards/accuracies": 0.8125, "rewards/chosen": -5.52249002456665, "rewards/margins": 2.340816020965576, "rewards/rejected": -7.863306045532227, "step": 1157 }, { "epoch": 0.7563066372765124, "grad_norm": 24.9093202389082, "learning_rate": 3.407613981393268e-08, "logits/chosen": -0.8100746870040894, "logits/rejected": -0.7035970091819763, "logps/chosen": -969.5421142578125, "logps/rejected": -1108.051513671875, "loss": 0.4021, "rewards/accuracies": 0.75, "rewards/chosen": -5.4513044357299805, "rewards/margins": 1.1451897621154785, "rewards/rejected": -6.596494197845459, "step": 1158 }, { "epoch": 0.7569597518164748, "grad_norm": 33.568513626016305, "learning_rate": 3.3904759627109826e-08, "logits/chosen": -0.7170710563659668, "logits/rejected": -0.7021714448928833, "logps/chosen": -981.71728515625, "logps/rejected": -1198.9769287109375, "loss": 0.3767, "rewards/accuracies": 0.84375, "rewards/chosen": -5.262272834777832, "rewards/margins": 1.670971393585205, "rewards/rejected": -6.933244705200195, "step": 1159 }, { "epoch": 0.7576128663564372, "grad_norm": 19.48507470464778, "learning_rate": 3.373372347468141e-08, "logits/chosen": -0.8081870079040527, "logits/rejected": -0.8140336871147156, "logps/chosen": -953.115478515625, "logps/rejected": -1056.728515625, "loss": 0.3799, "rewards/accuracies": 0.8125, "rewards/chosen": -4.855665683746338, "rewards/margins": 1.1462256908416748, "rewards/rejected": -6.001891136169434, "step": 1160 }, { "epoch": 0.7582659808963997, "grad_norm": 31.38673322123655, "learning_rate": 3.356303224691306e-08, "logits/chosen": -0.9541839361190796, "logits/rejected": -0.7014814019203186, "logps/chosen": -1052.727783203125, "logps/rejected": -1163.9364013671875, "loss": 0.4678, "rewards/accuracies": 0.78125, "rewards/chosen": -5.942466735839844, "rewards/margins": 1.7238149642944336, "rewards/rejected": -7.666281700134277, "step": 1161 }, { "epoch": 0.7589190954363622, "grad_norm": 69.4011664473962, "learning_rate": 3.3392686832274985e-08, "logits/chosen": -0.9889993667602539, "logits/rejected": -0.7375394105911255, "logps/chosen": -941.2330322265625, "logps/rejected": -1013.1570434570312, "loss": 0.4446, "rewards/accuracies": 0.78125, "rewards/chosen": -4.634012699127197, "rewards/margins": 1.3770856857299805, "rewards/rejected": -6.011098384857178, "step": 1162 }, { "epoch": 0.7595722099763246, "grad_norm": 27.965895018082794, "learning_rate": 3.3222688117437425e-08, "logits/chosen": -0.9555126428604126, "logits/rejected": -0.7966434955596924, "logps/chosen": -1023.3953247070312, "logps/rejected": -1168.2110595703125, "loss": 0.3217, "rewards/accuracies": 0.875, "rewards/chosen": -5.375278472900391, "rewards/margins": 1.807486653327942, "rewards/rejected": -7.182765483856201, "step": 1163 }, { "epoch": 0.760225324516287, "grad_norm": 20.30038550754587, "learning_rate": 3.305303698726597e-08, "logits/chosen": -0.7721012830734253, "logits/rejected": -0.7118543982505798, "logps/chosen": -928.61279296875, "logps/rejected": -1214.0880126953125, "loss": 0.3215, "rewards/accuracies": 0.8125, "rewards/chosen": -4.874123573303223, "rewards/margins": 2.648409366607666, "rewards/rejected": -7.5225324630737305, "step": 1164 }, { "epoch": 0.7608784390562495, "grad_norm": 33.58007082608257, "learning_rate": 3.2883734324817025e-08, "logits/chosen": -0.79903644323349, "logits/rejected": -0.8045666217803955, "logps/chosen": -1060.49267578125, "logps/rejected": -1248.7489013671875, "loss": 0.3897, "rewards/accuracies": 0.78125, "rewards/chosen": -5.418579578399658, "rewards/margins": 1.9177119731903076, "rewards/rejected": -7.336291313171387, "step": 1165 }, { "epoch": 0.7615315535962119, "grad_norm": 21.581131984398414, "learning_rate": 3.271478101133313e-08, "logits/chosen": -0.8603457808494568, "logits/rejected": -0.8567978739738464, "logps/chosen": -1065.7412109375, "logps/rejected": -1239.724853515625, "loss": 0.3794, "rewards/accuracies": 0.84375, "rewards/chosen": -5.512617588043213, "rewards/margins": 1.4748889207839966, "rewards/rejected": -6.987505912780762, "step": 1166 }, { "epoch": 0.7621846681361744, "grad_norm": 27.76584915305069, "learning_rate": 3.254617792623844e-08, "logits/chosen": -0.7932612895965576, "logits/rejected": -0.7557364106178284, "logps/chosen": -859.295166015625, "logps/rejected": -1044.875732421875, "loss": 0.3551, "rewards/accuracies": 0.84375, "rewards/chosen": -4.440032482147217, "rewards/margins": 1.8427796363830566, "rewards/rejected": -6.282812595367432, "step": 1167 }, { "epoch": 0.7628377826761368, "grad_norm": 19.357322013989005, "learning_rate": 3.237792594713413e-08, "logits/chosen": -0.785632848739624, "logits/rejected": -0.8138841390609741, "logps/chosen": -1043.2958984375, "logps/rejected": -1250.5078125, "loss": 0.423, "rewards/accuracies": 0.78125, "rewards/chosen": -5.2632856369018555, "rewards/margins": 1.5555360317230225, "rewards/rejected": -6.818821907043457, "step": 1168 }, { "epoch": 0.7634908972160993, "grad_norm": 22.411264863097227, "learning_rate": 3.2210025949793826e-08, "logits/chosen": -0.7790203094482422, "logits/rejected": -0.7721933722496033, "logps/chosen": -1029.548583984375, "logps/rejected": -1246.845947265625, "loss": 0.4476, "rewards/accuracies": 0.84375, "rewards/chosen": -5.5870537757873535, "rewards/margins": 1.5175191164016724, "rewards/rejected": -7.1045732498168945, "step": 1169 }, { "epoch": 0.7641440117560617, "grad_norm": 21.765699711965283, "learning_rate": 3.204247880815902e-08, "logits/chosen": -0.848351001739502, "logits/rejected": -0.8396331071853638, "logps/chosen": -964.7103271484375, "logps/rejected": -1139.3961181640625, "loss": 0.3456, "rewards/accuracies": 0.875, "rewards/chosen": -4.735137462615967, "rewards/margins": 1.6721020936965942, "rewards/rejected": -6.407238960266113, "step": 1170 }, { "epoch": 0.7647971262960241, "grad_norm": 16.671667536802943, "learning_rate": 3.1875285394334575e-08, "logits/chosen": -0.8602566123008728, "logits/rejected": -0.8754156231880188, "logps/chosen": -929.6405029296875, "logps/rejected": -1105.7825927734375, "loss": 0.435, "rewards/accuracies": 0.6875, "rewards/chosen": -4.829796314239502, "rewards/margins": 1.4688724279403687, "rewards/rejected": -6.298668384552002, "step": 1171 }, { "epoch": 0.7654502408359866, "grad_norm": 20.79622076583873, "learning_rate": 3.1708446578584124e-08, "logits/chosen": -0.8881270885467529, "logits/rejected": -0.8268469572067261, "logps/chosen": -956.0182495117188, "logps/rejected": -1192.7308349609375, "loss": 0.3944, "rewards/accuracies": 0.90625, "rewards/chosen": -4.764824390411377, "rewards/margins": 2.306044816970825, "rewards/rejected": -7.0708699226379395, "step": 1172 }, { "epoch": 0.7661033553759491, "grad_norm": 25.011870924818727, "learning_rate": 3.154196322932562e-08, "logits/chosen": -0.7613641023635864, "logits/rejected": -0.7869819402694702, "logps/chosen": -884.375732421875, "logps/rejected": -1046.363525390625, "loss": 0.3063, "rewards/accuracies": 0.8125, "rewards/chosen": -4.63422155380249, "rewards/margins": 1.5653067827224731, "rewards/rejected": -6.199528217315674, "step": 1173 }, { "epoch": 0.7667564699159115, "grad_norm": 20.090645894822764, "learning_rate": 3.137583621312665e-08, "logits/chosen": -0.6448233127593994, "logits/rejected": -0.6775322556495667, "logps/chosen": -937.7881469726562, "logps/rejected": -1196.1304931640625, "loss": 0.4101, "rewards/accuracies": 0.875, "rewards/chosen": -4.74168062210083, "rewards/margins": 2.3308541774749756, "rewards/rejected": -7.072534561157227, "step": 1174 }, { "epoch": 0.7674095844558739, "grad_norm": 33.3903606022228, "learning_rate": 3.121006639470019e-08, "logits/chosen": -0.8766074776649475, "logits/rejected": -0.8929482698440552, "logps/chosen": -908.8091430664062, "logps/rejected": -1128.59619140625, "loss": 0.4219, "rewards/accuracies": 0.8125, "rewards/chosen": -4.524719715118408, "rewards/margins": 1.9517160654067993, "rewards/rejected": -6.476435661315918, "step": 1175 }, { "epoch": 0.7680626989958363, "grad_norm": 22.339500155579042, "learning_rate": 3.104465463689985e-08, "logits/chosen": -0.6528885364532471, "logits/rejected": -0.6316456198692322, "logps/chosen": -1036.5087890625, "logps/rejected": -1337.04833984375, "loss": 0.4085, "rewards/accuracies": 0.875, "rewards/chosen": -4.987725257873535, "rewards/margins": 2.9133403301239014, "rewards/rejected": -7.901065826416016, "step": 1176 }, { "epoch": 0.7687158135357989, "grad_norm": 22.90690018103172, "learning_rate": 3.087960180071553e-08, "logits/chosen": -0.8717073202133179, "logits/rejected": -0.9405293464660645, "logps/chosen": -862.404052734375, "logps/rejected": -1120.0740966796875, "loss": 0.229, "rewards/accuracies": 0.9375, "rewards/chosen": -4.589112758636475, "rewards/margins": 2.3341736793518066, "rewards/rejected": -6.923286437988281, "step": 1177 }, { "epoch": 0.7693689280757613, "grad_norm": 26.598328462908714, "learning_rate": 3.07149087452689e-08, "logits/chosen": -0.8764140009880066, "logits/rejected": -0.8363279104232788, "logps/chosen": -982.534912109375, "logps/rejected": -1125.4754638671875, "loss": 0.406, "rewards/accuracies": 0.8125, "rewards/chosen": -5.287962913513184, "rewards/margins": 1.5670433044433594, "rewards/rejected": -6.855005741119385, "step": 1178 }, { "epoch": 0.7700220426157237, "grad_norm": 22.893298745929318, "learning_rate": 3.055057632780891e-08, "logits/chosen": -0.8786053657531738, "logits/rejected": -0.808796763420105, "logps/chosen": -1015.7930297851562, "logps/rejected": -1083.675048828125, "loss": 0.4078, "rewards/accuracies": 0.78125, "rewards/chosen": -5.676352024078369, "rewards/margins": 0.9371025562286377, "rewards/rejected": -6.613454341888428, "step": 1179 }, { "epoch": 0.7706751571556861, "grad_norm": 22.818915591743878, "learning_rate": 3.038660540370735e-08, "logits/chosen": -0.8500253558158875, "logits/rejected": -0.8378827571868896, "logps/chosen": -977.3529052734375, "logps/rejected": -1116.5126953125, "loss": 0.4193, "rewards/accuracies": 0.6875, "rewards/chosen": -4.952162265777588, "rewards/margins": 1.1345552206039429, "rewards/rejected": -6.08671760559082, "step": 1180 }, { "epoch": 0.7713282716956487, "grad_norm": 29.865635190204213, "learning_rate": 3.022299682645436e-08, "logits/chosen": -0.8462579250335693, "logits/rejected": -0.8399958610534668, "logps/chosen": -958.1721801757812, "logps/rejected": -1113.3017578125, "loss": 0.3925, "rewards/accuracies": 0.78125, "rewards/chosen": -4.745461463928223, "rewards/margins": 1.4994910955429077, "rewards/rejected": -6.24495267868042, "step": 1181 }, { "epoch": 0.7719813862356111, "grad_norm": 27.458852230158666, "learning_rate": 3.005975144765407e-08, "logits/chosen": -0.7723562717437744, "logits/rejected": -0.7684867978096008, "logps/chosen": -927.39599609375, "logps/rejected": -1054.7724609375, "loss": 0.3763, "rewards/accuracies": 0.8125, "rewards/chosen": -5.110241889953613, "rewards/margins": 1.5953783988952637, "rewards/rejected": -6.705620765686035, "step": 1182 }, { "epoch": 0.7726345007755735, "grad_norm": 31.359566574451833, "learning_rate": 2.989687011702007e-08, "logits/chosen": -0.9768270254135132, "logits/rejected": -0.9259161949157715, "logps/chosen": -990.6243896484375, "logps/rejected": -1188.6944580078125, "loss": 0.3644, "rewards/accuracies": 0.8125, "rewards/chosen": -4.3823652267456055, "rewards/margins": 1.8317203521728516, "rewards/rejected": -6.214085578918457, "step": 1183 }, { "epoch": 0.7732876153155359, "grad_norm": 18.539542439721654, "learning_rate": 2.9734353682371082e-08, "logits/chosen": -0.9522316455841064, "logits/rejected": -1.0198249816894531, "logps/chosen": -896.4822998046875, "logps/rejected": -1116.46142578125, "loss": 0.3925, "rewards/accuracies": 0.78125, "rewards/chosen": -4.922615051269531, "rewards/margins": 1.5149760246276855, "rewards/rejected": -6.437590599060059, "step": 1184 }, { "epoch": 0.7739407298554984, "grad_norm": 37.635532934840626, "learning_rate": 2.9572202989626404e-08, "logits/chosen": -0.6697461009025574, "logits/rejected": -0.5747227668762207, "logps/chosen": -1109.9716796875, "logps/rejected": -1424.2490234375, "loss": 0.4296, "rewards/accuracies": 0.90625, "rewards/chosen": -6.394173622131348, "rewards/margins": 3.1784846782684326, "rewards/rejected": -9.57265853881836, "step": 1185 }, { "epoch": 0.7745938443954609, "grad_norm": 23.27278254145601, "learning_rate": 2.941041888280168e-08, "logits/chosen": -0.844850480556488, "logits/rejected": -0.8204216361045837, "logps/chosen": -928.3829956054688, "logps/rejected": -1041.739990234375, "loss": 0.4327, "rewards/accuracies": 0.78125, "rewards/chosen": -5.049342155456543, "rewards/margins": 1.1250241994857788, "rewards/rejected": -6.174365997314453, "step": 1186 }, { "epoch": 0.7752469589354233, "grad_norm": 30.546092848157986, "learning_rate": 2.9249002204004415e-08, "logits/chosen": -1.0821895599365234, "logits/rejected": -1.0456998348236084, "logps/chosen": -901.41015625, "logps/rejected": -1062.07861328125, "loss": 0.3324, "rewards/accuracies": 0.875, "rewards/chosen": -4.730764865875244, "rewards/margins": 1.766805648803711, "rewards/rejected": -6.497569561004639, "step": 1187 }, { "epoch": 0.7759000734753857, "grad_norm": 14.916867634591082, "learning_rate": 2.9087953793429586e-08, "logits/chosen": -0.9418157935142517, "logits/rejected": -0.8717476725578308, "logps/chosen": -932.181640625, "logps/rejected": -1018.2946166992188, "loss": 0.4279, "rewards/accuracies": 0.75, "rewards/chosen": -4.699738502502441, "rewards/margins": 0.9546055793762207, "rewards/rejected": -5.65434455871582, "step": 1188 }, { "epoch": 0.7765531880153482, "grad_norm": 18.42805800044058, "learning_rate": 2.8927274489355292e-08, "logits/chosen": -0.831034243106842, "logits/rejected": -0.7608861327171326, "logps/chosen": -1031.4248046875, "logps/rejected": -1184.942626953125, "loss": 0.3381, "rewards/accuracies": 0.90625, "rewards/chosen": -4.825045108795166, "rewards/margins": 1.7045456171035767, "rewards/rejected": -6.529591083526611, "step": 1189 }, { "epoch": 0.7772063025553106, "grad_norm": 21.658547747559748, "learning_rate": 2.8766965128138387e-08, "logits/chosen": -0.9315862059593201, "logits/rejected": -0.9007086753845215, "logps/chosen": -997.5110473632812, "logps/rejected": -1119.478515625, "loss": 0.3427, "rewards/accuracies": 0.71875, "rewards/chosen": -5.008423805236816, "rewards/margins": 1.204540491104126, "rewards/rejected": -6.212964057922363, "step": 1190 }, { "epoch": 0.7778594170952731, "grad_norm": 24.267138455955347, "learning_rate": 2.860702654421011e-08, "logits/chosen": -0.9535077214241028, "logits/rejected": -0.8994888067245483, "logps/chosen": -1031.9945068359375, "logps/rejected": -1150.012451171875, "loss": 0.3756, "rewards/accuracies": 0.9375, "rewards/chosen": -5.6364946365356445, "rewards/margins": 1.5493192672729492, "rewards/rejected": -7.18581485748291, "step": 1191 }, { "epoch": 0.7785125316352355, "grad_norm": 18.451314859777252, "learning_rate": 2.8447459570071776e-08, "logits/chosen": -0.817573070526123, "logits/rejected": -0.7613990902900696, "logps/chosen": -936.7318115234375, "logps/rejected": -1135.806884765625, "loss": 0.3049, "rewards/accuracies": 0.875, "rewards/chosen": -5.091334342956543, "rewards/margins": 1.77097749710083, "rewards/rejected": -6.862312316894531, "step": 1192 }, { "epoch": 0.779165646175198, "grad_norm": 33.458133028762745, "learning_rate": 2.8288265036290405e-08, "logits/chosen": -0.9172095060348511, "logits/rejected": -0.7128479480743408, "logps/chosen": -1004.5012817382812, "logps/rejected": -1170.0657958984375, "loss": 0.4333, "rewards/accuracies": 0.90625, "rewards/chosen": -4.607486724853516, "rewards/margins": 2.5573153495788574, "rewards/rejected": -7.164802074432373, "step": 1193 }, { "epoch": 0.7798187607151604, "grad_norm": 19.970285171428458, "learning_rate": 2.8129443771494432e-08, "logits/chosen": -0.6862125396728516, "logits/rejected": -0.7096244692802429, "logps/chosen": -802.7031860351562, "logps/rejected": -1065.51220703125, "loss": 0.3785, "rewards/accuracies": 0.875, "rewards/chosen": -4.070821285247803, "rewards/margins": 2.3168816566467285, "rewards/rejected": -6.387702941894531, "step": 1194 }, { "epoch": 0.7804718752551228, "grad_norm": 18.73895337273363, "learning_rate": 2.7970996602369368e-08, "logits/chosen": -0.6776973009109497, "logits/rejected": -0.7221867442131042, "logps/chosen": -872.154541015625, "logps/rejected": -1053.0361328125, "loss": 0.3947, "rewards/accuracies": 0.84375, "rewards/chosen": -4.469272613525391, "rewards/margins": 1.3249047994613647, "rewards/rejected": -5.794177055358887, "step": 1195 }, { "epoch": 0.7811249897950853, "grad_norm": 19.272456109855355, "learning_rate": 2.7812924353653512e-08, "logits/chosen": -0.8801841139793396, "logits/rejected": -0.8910186290740967, "logps/chosen": -958.02294921875, "logps/rejected": -1119.1151123046875, "loss": 0.3909, "rewards/accuracies": 0.875, "rewards/chosen": -4.870482444763184, "rewards/margins": 1.3198819160461426, "rewards/rejected": -6.190364360809326, "step": 1196 }, { "epoch": 0.7817781043350478, "grad_norm": 21.38229731402552, "learning_rate": 2.765522784813363e-08, "logits/chosen": -0.7160096168518066, "logits/rejected": -0.7802441120147705, "logps/chosen": -1024.5040283203125, "logps/rejected": -1252.777099609375, "loss": 0.4229, "rewards/accuracies": 0.84375, "rewards/chosen": -4.846982002258301, "rewards/margins": 1.7771443128585815, "rewards/rejected": -6.624126434326172, "step": 1197 }, { "epoch": 0.7824312188750102, "grad_norm": 22.362299049982393, "learning_rate": 2.749790790664074e-08, "logits/chosen": -0.9027339816093445, "logits/rejected": -0.8822283744812012, "logps/chosen": -1182.553955078125, "logps/rejected": -1378.359619140625, "loss": 0.3888, "rewards/accuracies": 0.71875, "rewards/chosen": -7.502844333648682, "rewards/margins": 1.8371129035949707, "rewards/rejected": -9.339956283569336, "step": 1198 }, { "epoch": 0.7830843334149726, "grad_norm": 20.50908302528186, "learning_rate": 2.734096534804574e-08, "logits/chosen": -0.9140005707740784, "logits/rejected": -1.006885290145874, "logps/chosen": -858.1275634765625, "logps/rejected": -1033.56787109375, "loss": 0.3819, "rewards/accuracies": 0.90625, "rewards/chosen": -4.204371452331543, "rewards/margins": 1.4959553480148315, "rewards/rejected": -5.700326442718506, "step": 1199 }, { "epoch": 0.783737447954935, "grad_norm": 18.563171380526605, "learning_rate": 2.7184400989255264e-08, "logits/chosen": -0.8342958688735962, "logits/rejected": -0.8678746223449707, "logps/chosen": -949.291748046875, "logps/rejected": -1202.0423583984375, "loss": 0.3612, "rewards/accuracies": 0.90625, "rewards/chosen": -4.565825939178467, "rewards/margins": 2.313875436782837, "rewards/rejected": -6.879701614379883, "step": 1200 }, { "epoch": 0.783737447954935, "eval_logits/chosen": -0.6637032628059387, "eval_logits/rejected": -0.6046258807182312, "eval_logps/chosen": -978.1701049804688, "eval_logps/rejected": -1136.9619140625, "eval_loss": 0.39009326696395874, "eval_rewards/accuracies": 0.8040000200271606, "eval_rewards/chosen": -5.042629241943359, "eval_rewards/margins": 1.6753028631210327, "eval_rewards/rejected": -6.717932224273682, "eval_runtime": 619.0985, "eval_samples_per_second": 6.461, "eval_steps_per_second": 0.404, "step": 1200 }, { "epoch": 0.7843905624948976, "grad_norm": 22.82284104896695, "learning_rate": 2.702821564520732e-08, "logits/chosen": -1.0284345149993896, "logits/rejected": -0.9083595871925354, "logps/chosen": -1072.3922119140625, "logps/rejected": -1132.9649658203125, "loss": 0.4203, "rewards/accuracies": 0.84375, "rewards/chosen": -5.476429462432861, "rewards/margins": 1.4508377313613892, "rewards/rejected": -6.927268028259277, "step": 1201 }, { "epoch": 0.78504367703486, "grad_norm": 46.73465092422572, "learning_rate": 2.6872410128867095e-08, "logits/chosen": -0.9627257585525513, "logits/rejected": -0.9185448288917542, "logps/chosen": -934.824951171875, "logps/rejected": -1114.9776611328125, "loss": 0.4346, "rewards/accuracies": 0.78125, "rewards/chosen": -4.904428482055664, "rewards/margins": 1.572401762008667, "rewards/rejected": -6.47683048248291, "step": 1202 }, { "epoch": 0.7856967915748224, "grad_norm": 38.65799318097108, "learning_rate": 2.6716985251222745e-08, "logits/chosen": -0.8291105031967163, "logits/rejected": -0.7675511837005615, "logps/chosen": -879.0901489257812, "logps/rejected": -1029.8115234375, "loss": 0.366, "rewards/accuracies": 0.9375, "rewards/chosen": -4.441387176513672, "rewards/margins": 1.5486085414886475, "rewards/rejected": -5.989995002746582, "step": 1203 }, { "epoch": 0.7863499061147848, "grad_norm": 28.376786035425976, "learning_rate": 2.656194182128114e-08, "logits/chosen": -1.124685287475586, "logits/rejected": -1.0612528324127197, "logps/chosen": -1037.1787109375, "logps/rejected": -1229.228759765625, "loss": 0.3473, "rewards/accuracies": 0.90625, "rewards/chosen": -5.469577312469482, "rewards/margins": 2.1631574630737305, "rewards/rejected": -7.632734298706055, "step": 1204 }, { "epoch": 0.7870030206547474, "grad_norm": 31.767539638419322, "learning_rate": 2.640728064606368e-08, "logits/chosen": -0.8752514123916626, "logits/rejected": -0.7526658773422241, "logps/chosen": -991.3655395507812, "logps/rejected": -1172.45556640625, "loss": 0.3662, "rewards/accuracies": 0.84375, "rewards/chosen": -5.0387468338012695, "rewards/margins": 2.1917738914489746, "rewards/rejected": -7.230520725250244, "step": 1205 }, { "epoch": 0.7876561351947098, "grad_norm": 21.79342338361952, "learning_rate": 2.6253002530602042e-08, "logits/chosen": -1.0036921501159668, "logits/rejected": -0.8435570001602173, "logps/chosen": -1018.02783203125, "logps/rejected": -1182.3656005859375, "loss": 0.4083, "rewards/accuracies": 0.8125, "rewards/chosen": -5.629342555999756, "rewards/margins": 2.0361695289611816, "rewards/rejected": -7.6655120849609375, "step": 1206 }, { "epoch": 0.7883092497346722, "grad_norm": 17.625956058750614, "learning_rate": 2.60991082779341e-08, "logits/chosen": -0.9824644327163696, "logits/rejected": -0.9369449615478516, "logps/chosen": -898.9086303710938, "logps/rejected": -1089.82568359375, "loss": 0.3814, "rewards/accuracies": 0.9375, "rewards/chosen": -4.3817315101623535, "rewards/margins": 2.228430986404419, "rewards/rejected": -6.610162734985352, "step": 1207 }, { "epoch": 0.7889623642746346, "grad_norm": 17.052280652022436, "learning_rate": 2.594559868909956e-08, "logits/chosen": -0.8383811712265015, "logits/rejected": -0.7021499276161194, "logps/chosen": -1054.260009765625, "logps/rejected": -1233.23486328125, "loss": 0.3939, "rewards/accuracies": 0.8125, "rewards/chosen": -5.855528831481934, "rewards/margins": 2.2140450477600098, "rewards/rejected": -8.069574356079102, "step": 1208 }, { "epoch": 0.7896154788145971, "grad_norm": 17.82494133782833, "learning_rate": 2.579247456313598e-08, "logits/chosen": -0.6913808584213257, "logits/rejected": -0.7808651924133301, "logps/chosen": -820.8729248046875, "logps/rejected": -995.578857421875, "loss": 0.327, "rewards/accuracies": 0.875, "rewards/chosen": -3.9144654273986816, "rewards/margins": 1.5802310705184937, "rewards/rejected": -5.494696617126465, "step": 1209 }, { "epoch": 0.7902685933545596, "grad_norm": 19.827994296098602, "learning_rate": 2.563973669707452e-08, "logits/chosen": -0.9767338633537292, "logits/rejected": -0.9783136248588562, "logps/chosen": -1063.784423828125, "logps/rejected": -1288.805419921875, "loss": 0.3702, "rewards/accuracies": 0.8125, "rewards/chosen": -5.4177045822143555, "rewards/margins": 2.281778335571289, "rewards/rejected": -7.6994829177856445, "step": 1210 }, { "epoch": 0.790921707894522, "grad_norm": 30.180018885086678, "learning_rate": 2.548738588593582e-08, "logits/chosen": -0.84820955991745, "logits/rejected": -0.7857142090797424, "logps/chosen": -1108.758544921875, "logps/rejected": -1281.418701171875, "loss": 0.3701, "rewards/accuracies": 0.875, "rewards/chosen": -5.03013801574707, "rewards/margins": 2.472670793533325, "rewards/rejected": -7.502809524536133, "step": 1211 }, { "epoch": 0.7915748224344844, "grad_norm": 24.355575672449053, "learning_rate": 2.5335422922725824e-08, "logits/chosen": -0.8779463768005371, "logits/rejected": -0.8178101181983948, "logps/chosen": -960.484619140625, "logps/rejected": -1187.8353271484375, "loss": 0.414, "rewards/accuracies": 0.9375, "rewards/chosen": -4.938921928405762, "rewards/margins": 2.410421848297119, "rewards/rejected": -7.349343299865723, "step": 1212 }, { "epoch": 0.7922279369744469, "grad_norm": 21.158534500164034, "learning_rate": 2.518384859843168e-08, "logits/chosen": -0.8605954051017761, "logits/rejected": -0.8223261833190918, "logps/chosen": -1020.8521118164062, "logps/rejected": -1173.8663330078125, "loss": 0.3428, "rewards/accuracies": 0.75, "rewards/chosen": -5.409945011138916, "rewards/margins": 1.6165356636047363, "rewards/rejected": -7.026480197906494, "step": 1213 }, { "epoch": 0.7928810515144094, "grad_norm": 19.484177348159484, "learning_rate": 2.5032663702017633e-08, "logits/chosen": -0.8267495036125183, "logits/rejected": -0.8446471095085144, "logps/chosen": -952.152587890625, "logps/rejected": -1204.9439697265625, "loss": 0.3381, "rewards/accuracies": 0.9375, "rewards/chosen": -4.965042591094971, "rewards/margins": 1.8775386810302734, "rewards/rejected": -6.842580795288086, "step": 1214 }, { "epoch": 0.7935341660543718, "grad_norm": 16.4682799053427, "learning_rate": 2.4881869020420888e-08, "logits/chosen": -1.0425740480422974, "logits/rejected": -0.9511318206787109, "logps/chosen": -985.417724609375, "logps/rejected": -1143.3631591796875, "loss": 0.357, "rewards/accuracies": 0.90625, "rewards/chosen": -5.3016676902771, "rewards/margins": 1.9380346536636353, "rewards/rejected": -7.2397027015686035, "step": 1215 }, { "epoch": 0.7941872805943342, "grad_norm": 36.46933285274797, "learning_rate": 2.4731465338547552e-08, "logits/chosen": -1.024279236793518, "logits/rejected": -0.9062608480453491, "logps/chosen": -936.8526000976562, "logps/rejected": -1150.493408203125, "loss": 0.4068, "rewards/accuracies": 0.90625, "rewards/chosen": -4.91167688369751, "rewards/margins": 2.1005947589874268, "rewards/rejected": -7.012270927429199, "step": 1216 }, { "epoch": 0.7948403951342967, "grad_norm": 25.529220177411503, "learning_rate": 2.4581453439268506e-08, "logits/chosen": -1.0040630102157593, "logits/rejected": -0.8005591630935669, "logps/chosen": -1072.67626953125, "logps/rejected": -1209.8746337890625, "loss": 0.425, "rewards/accuracies": 0.71875, "rewards/chosen": -5.791203022003174, "rewards/margins": 1.998671054840088, "rewards/rejected": -7.789875030517578, "step": 1217 }, { "epoch": 0.7954935096742591, "grad_norm": 38.9750256048071, "learning_rate": 2.443183410341535e-08, "logits/chosen": -0.8872929811477661, "logits/rejected": -0.8971114754676819, "logps/chosen": -1029.74462890625, "logps/rejected": -1296.3499755859375, "loss": 0.362, "rewards/accuracies": 0.875, "rewards/chosen": -5.137325286865234, "rewards/margins": 2.4624693393707275, "rewards/rejected": -7.599794864654541, "step": 1218 }, { "epoch": 0.7961466242142216, "grad_norm": 23.621966508721446, "learning_rate": 2.4282608109776404e-08, "logits/chosen": -0.7239641547203064, "logits/rejected": -0.8325695395469666, "logps/chosen": -879.1217041015625, "logps/rejected": -1209.5390625, "loss": 0.3791, "rewards/accuracies": 0.9375, "rewards/chosen": -4.391090393066406, "rewards/margins": 2.7524499893188477, "rewards/rejected": -7.143540382385254, "step": 1219 }, { "epoch": 0.796799738754184, "grad_norm": 24.192463988892303, "learning_rate": 2.413377623509245e-08, "logits/chosen": -0.8919906616210938, "logits/rejected": -0.7484989166259766, "logps/chosen": -952.1494140625, "logps/rejected": -1090.5037841796875, "loss": 0.363, "rewards/accuracies": 0.78125, "rewards/chosen": -5.1700439453125, "rewards/margins": 1.6430315971374512, "rewards/rejected": -6.813075542449951, "step": 1220 }, { "epoch": 0.7974528532941465, "grad_norm": 24.082097860730222, "learning_rate": 2.3985339254052962e-08, "logits/chosen": -1.0477828979492188, "logits/rejected": -0.9007452726364136, "logps/chosen": -1051.1192626953125, "logps/rejected": -1316.3472900390625, "loss": 0.3398, "rewards/accuracies": 0.90625, "rewards/chosen": -5.486971855163574, "rewards/margins": 3.2541961669921875, "rewards/rejected": -8.741168022155762, "step": 1221 }, { "epoch": 0.7981059678341089, "grad_norm": 18.91024586575857, "learning_rate": 2.383729793929189e-08, "logits/chosen": -0.9369001388549805, "logits/rejected": -0.8223516345024109, "logps/chosen": -1001.8585205078125, "logps/rejected": -1124.386474609375, "loss": 0.3213, "rewards/accuracies": 0.78125, "rewards/chosen": -4.99350118637085, "rewards/margins": 1.4244037866592407, "rewards/rejected": -6.417904853820801, "step": 1222 }, { "epoch": 0.7987590823740713, "grad_norm": 42.05950974053666, "learning_rate": 2.3689653061383685e-08, "logits/chosen": -0.8296122550964355, "logits/rejected": -0.8270793557167053, "logps/chosen": -1003.6871948242188, "logps/rejected": -1198.9710693359375, "loss": 0.3992, "rewards/accuracies": 0.84375, "rewards/chosen": -5.332522392272949, "rewards/margins": 1.6734325885772705, "rewards/rejected": -7.005955219268799, "step": 1223 }, { "epoch": 0.7994121969140338, "grad_norm": 45.58794217717948, "learning_rate": 2.3542405388839315e-08, "logits/chosen": -0.9384451508522034, "logits/rejected": -0.7390685081481934, "logps/chosen": -1071.325927734375, "logps/rejected": -1221.4930419921875, "loss": 0.3833, "rewards/accuracies": 0.84375, "rewards/chosen": -6.6582231521606445, "rewards/margins": 1.8130872249603271, "rewards/rejected": -8.471309661865234, "step": 1224 }, { "epoch": 0.8000653114539963, "grad_norm": 46.362112329902644, "learning_rate": 2.339555568810221e-08, "logits/chosen": -0.9025658965110779, "logits/rejected": -0.8725322484970093, "logps/chosen": -1064.623291015625, "logps/rejected": -1244.8243408203125, "loss": 0.4366, "rewards/accuracies": 0.71875, "rewards/chosen": -5.746537685394287, "rewards/margins": 1.7106165885925293, "rewards/rejected": -7.457154750823975, "step": 1225 }, { "epoch": 0.8007184259939587, "grad_norm": 27.267571141968297, "learning_rate": 2.3249104723544333e-08, "logits/chosen": -0.747216522693634, "logits/rejected": -0.7036592364311218, "logps/chosen": -1053.0712890625, "logps/rejected": -1246.5084228515625, "loss": 0.43, "rewards/accuracies": 0.78125, "rewards/chosen": -6.20672082901001, "rewards/margins": 1.843930959701538, "rewards/rejected": -8.050651550292969, "step": 1226 }, { "epoch": 0.8013715405339211, "grad_norm": 31.244738281197762, "learning_rate": 2.3103053257462145e-08, "logits/chosen": -0.8720026612281799, "logits/rejected": -0.776594877243042, "logps/chosen": -916.8078002929688, "logps/rejected": -1142.56982421875, "loss": 0.3608, "rewards/accuracies": 0.875, "rewards/chosen": -5.195310115814209, "rewards/margins": 2.356201171875, "rewards/rejected": -7.551511287689209, "step": 1227 }, { "epoch": 0.8020246550738835, "grad_norm": 15.654836904900627, "learning_rate": 2.2957402050072717e-08, "logits/chosen": -0.98552006483078, "logits/rejected": -0.9261568784713745, "logps/chosen": -1043.234130859375, "logps/rejected": -1214.549560546875, "loss": 0.3054, "rewards/accuracies": 0.90625, "rewards/chosen": -5.632101058959961, "rewards/margins": 1.8705977201461792, "rewards/rejected": -7.5026984214782715, "step": 1228 }, { "epoch": 0.8026777696138461, "grad_norm": 19.962514820454384, "learning_rate": 2.2812151859509645e-08, "logits/chosen": -0.9196304082870483, "logits/rejected": -0.8677047491073608, "logps/chosen": -1022.9666748046875, "logps/rejected": -1254.571044921875, "loss": 0.3091, "rewards/accuracies": 0.875, "rewards/chosen": -5.6341094970703125, "rewards/margins": 2.5342183113098145, "rewards/rejected": -8.168327331542969, "step": 1229 }, { "epoch": 0.8033308841538085, "grad_norm": 19.192635217134253, "learning_rate": 2.2667303441819242e-08, "logits/chosen": -0.940382719039917, "logits/rejected": -0.9116930961608887, "logps/chosen": -996.216064453125, "logps/rejected": -1162.93115234375, "loss": 0.3674, "rewards/accuracies": 0.8125, "rewards/chosen": -5.108640193939209, "rewards/margins": 1.6084949970245361, "rewards/rejected": -6.717134952545166, "step": 1230 }, { "epoch": 0.8039839986937709, "grad_norm": 23.74258752114483, "learning_rate": 2.252285755095652e-08, "logits/chosen": -0.9278345704078674, "logits/rejected": -0.9251123070716858, "logps/chosen": -1053.6322021484375, "logps/rejected": -1384.0828857421875, "loss": 0.2828, "rewards/accuracies": 0.875, "rewards/chosen": -5.672571182250977, "rewards/margins": 2.669499635696411, "rewards/rejected": -8.342071533203125, "step": 1231 }, { "epoch": 0.8046371132337333, "grad_norm": 18.678877823425772, "learning_rate": 2.2378814938781265e-08, "logits/chosen": -0.8256291151046753, "logits/rejected": -0.6893395185470581, "logps/chosen": -1019.7078857421875, "logps/rejected": -1199.7503662109375, "loss": 0.382, "rewards/accuracies": 0.84375, "rewards/chosen": -5.491720199584961, "rewards/margins": 1.8675512075424194, "rewards/rejected": -7.359271049499512, "step": 1232 }, { "epoch": 0.8052902277736959, "grad_norm": 44.8394412752877, "learning_rate": 2.22351763550542e-08, "logits/chosen": -0.8614879846572876, "logits/rejected": -0.8459488749504089, "logps/chosen": -1098.8720703125, "logps/rejected": -1225.9066162109375, "loss": 0.4307, "rewards/accuracies": 0.75, "rewards/chosen": -6.287536144256592, "rewards/margins": 1.3981188535690308, "rewards/rejected": -7.685655117034912, "step": 1233 }, { "epoch": 0.8059433423136583, "grad_norm": 23.062890847523345, "learning_rate": 2.2091942547432952e-08, "logits/chosen": -0.819417417049408, "logits/rejected": -0.8785775303840637, "logps/chosen": -936.6260986328125, "logps/rejected": -1199.01220703125, "loss": 0.395, "rewards/accuracies": 0.90625, "rewards/chosen": -4.857845306396484, "rewards/margins": 1.9972339868545532, "rewards/rejected": -6.855079174041748, "step": 1234 }, { "epoch": 0.8065964568536207, "grad_norm": 25.693523378160574, "learning_rate": 2.1949114261468304e-08, "logits/chosen": -0.7582164406776428, "logits/rejected": -0.7685361504554749, "logps/chosen": -1009.5326538085938, "logps/rejected": -1153.093017578125, "loss": 0.3625, "rewards/accuracies": 0.875, "rewards/chosen": -5.58023738861084, "rewards/margins": 1.4440110921859741, "rewards/rejected": -7.0242486000061035, "step": 1235 }, { "epoch": 0.8072495713935831, "grad_norm": 25.197486248693675, "learning_rate": 2.18066922406002e-08, "logits/chosen": -0.8278719186782837, "logits/rejected": -0.8821883797645569, "logps/chosen": -939.5359497070312, "logps/rejected": -1243.8126220703125, "loss": 0.4497, "rewards/accuracies": 0.84375, "rewards/chosen": -4.773218631744385, "rewards/margins": 2.74705171585083, "rewards/rejected": -7.520270824432373, "step": 1236 }, { "epoch": 0.8079026859335456, "grad_norm": 22.66095713824487, "learning_rate": 2.1664677226153938e-08, "logits/chosen": -0.8346998691558838, "logits/rejected": -0.7691102623939514, "logps/chosen": -1045.7852783203125, "logps/rejected": -1190.573486328125, "loss": 0.3849, "rewards/accuracies": 0.8125, "rewards/chosen": -6.094000816345215, "rewards/margins": 1.7288265228271484, "rewards/rejected": -7.822828769683838, "step": 1237 }, { "epoch": 0.8085558004735081, "grad_norm": 25.500361485349092, "learning_rate": 2.1523069957336303e-08, "logits/chosen": -0.8235265612602234, "logits/rejected": -0.89778071641922, "logps/chosen": -1058.410888671875, "logps/rejected": -1341.494873046875, "loss": 0.3224, "rewards/accuracies": 0.9375, "rewards/chosen": -6.059966564178467, "rewards/margins": 2.3945508003234863, "rewards/rejected": -8.454517364501953, "step": 1238 }, { "epoch": 0.8092089150134705, "grad_norm": 37.356694926090526, "learning_rate": 2.1381871171231692e-08, "logits/chosen": -0.8128297328948975, "logits/rejected": -0.7122081518173218, "logps/chosen": -932.2877197265625, "logps/rejected": -1099.0528564453125, "loss": 0.3584, "rewards/accuracies": 0.875, "rewards/chosen": -5.248828887939453, "rewards/margins": 1.8526395559310913, "rewards/rejected": -7.101468563079834, "step": 1239 }, { "epoch": 0.8098620295534329, "grad_norm": 22.32379852460851, "learning_rate": 2.1241081602798317e-08, "logits/chosen": -0.884192943572998, "logits/rejected": -0.9082574248313904, "logps/chosen": -1082.5439453125, "logps/rejected": -1388.14501953125, "loss": 0.3629, "rewards/accuracies": 0.84375, "rewards/chosen": -6.576789855957031, "rewards/margins": 2.5638484954833984, "rewards/rejected": -9.140637397766113, "step": 1240 }, { "epoch": 0.8105151440933954, "grad_norm": 24.36231097267006, "learning_rate": 2.1100701984864354e-08, "logits/chosen": -0.924676775932312, "logits/rejected": -0.9948889017105103, "logps/chosen": -1087.4019775390625, "logps/rejected": -1230.5699462890625, "loss": 0.3895, "rewards/accuracies": 0.8125, "rewards/chosen": -5.527956962585449, "rewards/margins": 1.3158284425735474, "rewards/rejected": -6.843785285949707, "step": 1241 }, { "epoch": 0.8111682586333578, "grad_norm": 29.98400675361614, "learning_rate": 2.096073304812408e-08, "logits/chosen": -0.8991495966911316, "logits/rejected": -0.9502657651901245, "logps/chosen": -1102.3123779296875, "logps/rejected": -1379.7208251953125, "loss": 0.3557, "rewards/accuracies": 0.90625, "rewards/chosen": -6.3212361335754395, "rewards/margins": 2.6031382083892822, "rewards/rejected": -8.924375534057617, "step": 1242 }, { "epoch": 0.8118213731733203, "grad_norm": 19.8045035109399, "learning_rate": 2.0821175521134203e-08, "logits/chosen": -0.8717656135559082, "logits/rejected": -0.8530954122543335, "logps/chosen": -978.0860595703125, "logps/rejected": -1278.1768798828125, "loss": 0.3568, "rewards/accuracies": 0.84375, "rewards/chosen": -4.698294162750244, "rewards/margins": 2.899409294128418, "rewards/rejected": -7.59770393371582, "step": 1243 }, { "epoch": 0.8124744877132827, "grad_norm": 23.529826779582308, "learning_rate": 2.0682030130309937e-08, "logits/chosen": -1.0024141073226929, "logits/rejected": -0.8768361806869507, "logps/chosen": -1081.8538818359375, "logps/rejected": -1280.859375, "loss": 0.3559, "rewards/accuracies": 1.0, "rewards/chosen": -5.240067481994629, "rewards/margins": 2.442638397216797, "rewards/rejected": -7.682706356048584, "step": 1244 }, { "epoch": 0.8131276022532452, "grad_norm": 36.84318502121178, "learning_rate": 2.0543297599921305e-08, "logits/chosen": -0.8412925004959106, "logits/rejected": -0.7551301121711731, "logps/chosen": -1126.511474609375, "logps/rejected": -1374.8885498046875, "loss": 0.4481, "rewards/accuracies": 0.78125, "rewards/chosen": -6.242119312286377, "rewards/margins": 2.2767138481140137, "rewards/rejected": -8.51883316040039, "step": 1245 }, { "epoch": 0.8137807167932076, "grad_norm": 27.498227486362723, "learning_rate": 2.0404978652089323e-08, "logits/chosen": -0.6808075904846191, "logits/rejected": -0.7349047660827637, "logps/chosen": -877.9880981445312, "logps/rejected": -1123.840087890625, "loss": 0.3956, "rewards/accuracies": 0.90625, "rewards/chosen": -4.855025291442871, "rewards/margins": 1.9713177680969238, "rewards/rejected": -6.826342582702637, "step": 1246 }, { "epoch": 0.81443383133317, "grad_norm": 31.33636263030433, "learning_rate": 2.0267074006782235e-08, "logits/chosen": -0.7306153178215027, "logits/rejected": -0.7039578557014465, "logps/chosen": -1020.278076171875, "logps/rejected": -1221.2576904296875, "loss": 0.4415, "rewards/accuracies": 0.84375, "rewards/chosen": -5.24103307723999, "rewards/margins": 1.8187503814697266, "rewards/rejected": -7.059783458709717, "step": 1247 }, { "epoch": 0.8150869458731325, "grad_norm": 23.27240519426899, "learning_rate": 2.0129584381811826e-08, "logits/chosen": -0.7635297775268555, "logits/rejected": -0.6661262512207031, "logps/chosen": -1066.372314453125, "logps/rejected": -1230.029052734375, "loss": 0.3597, "rewards/accuracies": 0.75, "rewards/chosen": -6.463615894317627, "rewards/margins": 1.785343885421753, "rewards/rejected": -8.2489595413208, "step": 1248 }, { "epoch": 0.815740060413095, "grad_norm": 29.215895350871378, "learning_rate": 1.9992510492829618e-08, "logits/chosen": -0.8741496205329895, "logits/rejected": -0.8110368251800537, "logps/chosen": -913.3331909179688, "logps/rejected": -1101.623046875, "loss": 0.3678, "rewards/accuracies": 0.8125, "rewards/chosen": -4.769600868225098, "rewards/margins": 1.9510536193847656, "rewards/rejected": -6.720654487609863, "step": 1249 }, { "epoch": 0.8163931749530574, "grad_norm": 37.00306210512274, "learning_rate": 1.9855853053323178e-08, "logits/chosen": -0.9076151847839355, "logits/rejected": -0.9450336694717407, "logps/chosen": -978.0199584960938, "logps/rejected": -1196.162109375, "loss": 0.3918, "rewards/accuracies": 0.84375, "rewards/chosen": -5.30790376663208, "rewards/margins": 2.2430849075317383, "rewards/rejected": -7.550989151000977, "step": 1250 }, { "epoch": 0.8170462894930198, "grad_norm": 26.71800819631346, "learning_rate": 1.9719612774612404e-08, "logits/chosen": -0.9339046478271484, "logits/rejected": -0.9458125829696655, "logps/chosen": -1008.9201049804688, "logps/rejected": -1295.292724609375, "loss": 0.4199, "rewards/accuracies": 0.90625, "rewards/chosen": -5.112905025482178, "rewards/margins": 2.7176475524902344, "rewards/rejected": -7.830551624298096, "step": 1251 }, { "epoch": 0.8176994040329822, "grad_norm": 22.65444787969263, "learning_rate": 1.958379036584582e-08, "logits/chosen": -0.7930296063423157, "logits/rejected": -0.7819668054580688, "logps/chosen": -965.6203002929688, "logps/rejected": -1127.9246826171875, "loss": 0.3648, "rewards/accuracies": 0.75, "rewards/chosen": -5.067156791687012, "rewards/margins": 1.8418265581130981, "rewards/rejected": -6.908982753753662, "step": 1252 }, { "epoch": 0.8183525185729448, "grad_norm": 35.94238776510094, "learning_rate": 1.9448386533996897e-08, "logits/chosen": -0.8811150789260864, "logits/rejected": -0.8617693185806274, "logps/chosen": -905.3536987304688, "logps/rejected": -1050.594482421875, "loss": 0.4079, "rewards/accuracies": 0.875, "rewards/chosen": -4.579432010650635, "rewards/margins": 1.3525540828704834, "rewards/rejected": -5.931985855102539, "step": 1253 }, { "epoch": 0.8190056331129072, "grad_norm": 20.114155683568644, "learning_rate": 1.9313401983860278e-08, "logits/chosen": -0.7634121775627136, "logits/rejected": -0.7326263189315796, "logps/chosen": -1029.6595458984375, "logps/rejected": -1226.5960693359375, "loss": 0.3304, "rewards/accuracies": 0.875, "rewards/chosen": -4.899470806121826, "rewards/margins": 1.9020302295684814, "rewards/rejected": -6.801499843597412, "step": 1254 }, { "epoch": 0.8196587476528696, "grad_norm": 29.389608351454772, "learning_rate": 1.9178837418048287e-08, "logits/chosen": -1.035788655281067, "logits/rejected": -1.034264326095581, "logps/chosen": -939.3095703125, "logps/rejected": -1057.8206787109375, "loss": 0.4131, "rewards/accuracies": 0.84375, "rewards/chosen": -5.060790538787842, "rewards/margins": 1.1942658424377441, "rewards/rejected": -6.255056381225586, "step": 1255 }, { "epoch": 0.820311862192832, "grad_norm": 23.633542412432398, "learning_rate": 1.9044693536987146e-08, "logits/chosen": -0.865630567073822, "logits/rejected": -0.8657881617546082, "logps/chosen": -1064.190185546875, "logps/rejected": -1238.50390625, "loss": 0.3135, "rewards/accuracies": 0.875, "rewards/chosen": -5.71279764175415, "rewards/margins": 1.6474566459655762, "rewards/rejected": -7.36025333404541, "step": 1256 }, { "epoch": 0.8209649767327946, "grad_norm": 31.576320870780314, "learning_rate": 1.8910971038913316e-08, "logits/chosen": -0.8827898502349854, "logits/rejected": -0.9015779495239258, "logps/chosen": -941.9280395507812, "logps/rejected": -1156.314208984375, "loss": 0.4099, "rewards/accuracies": 0.9375, "rewards/chosen": -4.529621124267578, "rewards/margins": 2.061371088027954, "rewards/rejected": -6.590992450714111, "step": 1257 }, { "epoch": 0.821618091272757, "grad_norm": 26.25520258324422, "learning_rate": 1.877767061986997e-08, "logits/chosen": -0.879159688949585, "logits/rejected": -0.8486706018447876, "logps/chosen": -1051.3345947265625, "logps/rejected": -1229.3165283203125, "loss": 0.2901, "rewards/accuracies": 0.78125, "rewards/chosen": -5.476308822631836, "rewards/margins": 1.9429969787597656, "rewards/rejected": -7.41930627822876, "step": 1258 }, { "epoch": 0.8222712058127194, "grad_norm": 27.762333821264512, "learning_rate": 1.864479297370325e-08, "logits/chosen": -0.8901294469833374, "logits/rejected": -0.9367501735687256, "logps/chosen": -1139.62890625, "logps/rejected": -1377.3262939453125, "loss": 0.3116, "rewards/accuracies": 0.90625, "rewards/chosen": -5.882551670074463, "rewards/margins": 2.453892230987549, "rewards/rejected": -8.336442947387695, "step": 1259 }, { "epoch": 0.8229243203526818, "grad_norm": 19.94747891616991, "learning_rate": 1.8512338792058745e-08, "logits/chosen": -0.89691162109375, "logits/rejected": -0.9170194864273071, "logps/chosen": -1008.8082275390625, "logps/rejected": -1234.414306640625, "loss": 0.3774, "rewards/accuracies": 0.75, "rewards/chosen": -5.763920783996582, "rewards/margins": 2.05275559425354, "rewards/rejected": -7.816676616668701, "step": 1260 }, { "epoch": 0.8235774348926443, "grad_norm": 28.264623159069007, "learning_rate": 1.838030876437784e-08, "logits/chosen": -0.8986479640007019, "logits/rejected": -0.8794419765472412, "logps/chosen": -1030.9847412109375, "logps/rejected": -1238.065673828125, "loss": 0.4215, "rewards/accuracies": 0.875, "rewards/chosen": -5.154675006866455, "rewards/margins": 1.8639512062072754, "rewards/rejected": -7.018627166748047, "step": 1261 }, { "epoch": 0.8242305494326068, "grad_norm": 18.41175881448945, "learning_rate": 1.8248703577894132e-08, "logits/chosen": -0.7217578887939453, "logits/rejected": -0.7507769465446472, "logps/chosen": -950.8618774414062, "logps/rejected": -1251.9049072265625, "loss": 0.3286, "rewards/accuracies": 0.9375, "rewards/chosen": -4.839773178100586, "rewards/margins": 2.6825602054595947, "rewards/rejected": -7.522334098815918, "step": 1262 }, { "epoch": 0.8248836639725692, "grad_norm": 21.04106550876138, "learning_rate": 1.8117523917629895e-08, "logits/chosen": -0.8280400037765503, "logits/rejected": -0.803841233253479, "logps/chosen": -980.7047119140625, "logps/rejected": -1142.6981201171875, "loss": 0.4046, "rewards/accuracies": 0.71875, "rewards/chosen": -5.227568626403809, "rewards/margins": 1.4354455471038818, "rewards/rejected": -6.6630144119262695, "step": 1263 }, { "epoch": 0.8255367785125316, "grad_norm": 16.262147010001502, "learning_rate": 1.798677046639244e-08, "logits/chosen": -0.761043131351471, "logits/rejected": -0.6971719861030579, "logps/chosen": -947.5308837890625, "logps/rejected": -1009.4982299804688, "loss": 0.3669, "rewards/accuracies": 0.6875, "rewards/chosen": -4.8415422439575195, "rewards/margins": 0.8895235657691956, "rewards/rejected": -5.731066703796387, "step": 1264 }, { "epoch": 0.8261898930524941, "grad_norm": 19.058057525896476, "learning_rate": 1.7856443904770657e-08, "logits/chosen": -1.1435941457748413, "logits/rejected": -1.0320429801940918, "logps/chosen": -1002.7920532226562, "logps/rejected": -1178.949462890625, "loss": 0.3415, "rewards/accuracies": 0.90625, "rewards/chosen": -4.919374942779541, "rewards/margins": 2.0565154552459717, "rewards/rejected": -6.975890636444092, "step": 1265 }, { "epoch": 0.8268430075924565, "grad_norm": 23.23707352681105, "learning_rate": 1.772654491113138e-08, "logits/chosen": -0.9951072335243225, "logits/rejected": -0.8988428711891174, "logps/chosen": -1025.6783447265625, "logps/rejected": -1171.551513671875, "loss": 0.3785, "rewards/accuracies": 0.75, "rewards/chosen": -5.015107154846191, "rewards/margins": 1.7816100120544434, "rewards/rejected": -6.796716690063477, "step": 1266 }, { "epoch": 0.827496122132419, "grad_norm": 25.86540317412067, "learning_rate": 1.75970741616159e-08, "logits/chosen": -0.7014292478561401, "logits/rejected": -0.7118043899536133, "logps/chosen": -897.89794921875, "logps/rejected": -1087.0958251953125, "loss": 0.4355, "rewards/accuracies": 0.875, "rewards/chosen": -4.325657844543457, "rewards/margins": 1.5333755016326904, "rewards/rejected": -5.859033584594727, "step": 1267 }, { "epoch": 0.8281492366723814, "grad_norm": 18.171682491599366, "learning_rate": 1.746803233013645e-08, "logits/chosen": -0.7887166738510132, "logits/rejected": -0.8805134892463684, "logps/chosen": -869.1028442382812, "logps/rejected": -1045.51416015625, "loss": 0.4541, "rewards/accuracies": 0.8125, "rewards/chosen": -4.430434226989746, "rewards/margins": 1.542751669883728, "rewards/rejected": -5.973185062408447, "step": 1268 }, { "epoch": 0.8288023512123439, "grad_norm": 19.860357482891896, "learning_rate": 1.733942008837269e-08, "logits/chosen": -0.8767520785331726, "logits/rejected": -0.6930796504020691, "logps/chosen": -1000.611572265625, "logps/rejected": -1124.2705078125, "loss": 0.3817, "rewards/accuracies": 0.8125, "rewards/chosen": -4.829126358032227, "rewards/margins": 1.4662240743637085, "rewards/rejected": -6.295350551605225, "step": 1269 }, { "epoch": 0.8294554657523063, "grad_norm": 15.737729321214841, "learning_rate": 1.721123810576821e-08, "logits/chosen": -0.9429450035095215, "logits/rejected": -0.8901304006576538, "logps/chosen": -980.0512084960938, "logps/rejected": -1083.7476806640625, "loss": 0.3119, "rewards/accuracies": 0.8125, "rewards/chosen": -4.375136852264404, "rewards/margins": 1.5506840944290161, "rewards/rejected": -5.925821304321289, "step": 1270 }, { "epoch": 0.8301085802922687, "grad_norm": 18.42427316829238, "learning_rate": 1.7083487049527033e-08, "logits/chosen": -0.977911114692688, "logits/rejected": -0.9723238348960876, "logps/chosen": -864.9402465820312, "logps/rejected": -978.0227661132812, "loss": 0.3579, "rewards/accuracies": 0.84375, "rewards/chosen": -4.461297512054443, "rewards/margins": 1.4185537099838257, "rewards/rejected": -5.8798508644104, "step": 1271 }, { "epoch": 0.8307616948322312, "grad_norm": 28.41186955631412, "learning_rate": 1.695616758461017e-08, "logits/chosen": -0.8489782214164734, "logits/rejected": -0.7123806476593018, "logps/chosen": -989.315185546875, "logps/rejected": -1167.1005859375, "loss": 0.4038, "rewards/accuracies": 0.875, "rewards/chosen": -5.38248348236084, "rewards/margins": 2.059591293334961, "rewards/rejected": -7.442075729370117, "step": 1272 }, { "epoch": 0.8314148093721937, "grad_norm": 17.295026440713308, "learning_rate": 1.6829280373732123e-08, "logits/chosen": -0.8142927885055542, "logits/rejected": -0.7259283661842346, "logps/chosen": -954.6568603515625, "logps/rejected": -1167.76416015625, "loss": 0.3945, "rewards/accuracies": 0.875, "rewards/chosen": -5.306705474853516, "rewards/margins": 2.2607154846191406, "rewards/rejected": -7.56742000579834, "step": 1273 }, { "epoch": 0.8320679239121561, "grad_norm": 19.266167582706977, "learning_rate": 1.670282607735748e-08, "logits/chosen": -0.7883981466293335, "logits/rejected": -0.5904198884963989, "logps/chosen": -1010.0977783203125, "logps/rejected": -1187.25634765625, "loss": 0.3784, "rewards/accuracies": 0.84375, "rewards/chosen": -5.103325843811035, "rewards/margins": 2.1519057750701904, "rewards/rejected": -7.255231857299805, "step": 1274 }, { "epoch": 0.8327210384521185, "grad_norm": 19.640653551050946, "learning_rate": 1.657680535369744e-08, "logits/chosen": -0.8553881645202637, "logits/rejected": -0.783621609210968, "logps/chosen": -958.4749755859375, "logps/rejected": -1111.0194091796875, "loss": 0.3705, "rewards/accuracies": 0.84375, "rewards/chosen": -4.92903995513916, "rewards/margins": 1.7265620231628418, "rewards/rejected": -6.6556010246276855, "step": 1275 }, { "epoch": 0.833374152992081, "grad_norm": 40.42956174656325, "learning_rate": 1.645121885870637e-08, "logits/chosen": -0.8692556619644165, "logits/rejected": -0.8151346445083618, "logps/chosen": -1030.67724609375, "logps/rejected": -1192.6016845703125, "loss": 0.4011, "rewards/accuracies": 0.75, "rewards/chosen": -5.321305751800537, "rewards/margins": 1.7291826009750366, "rewards/rejected": -7.050488471984863, "step": 1276 }, { "epoch": 0.8340272675320435, "grad_norm": 27.494527127687896, "learning_rate": 1.6326067246078455e-08, "logits/chosen": -0.7602800130844116, "logits/rejected": -0.7602342367172241, "logps/chosen": -919.4981079101562, "logps/rejected": -1069.651611328125, "loss": 0.3332, "rewards/accuracies": 0.90625, "rewards/chosen": -4.5918426513671875, "rewards/margins": 1.435691475868225, "rewards/rejected": -6.027534008026123, "step": 1277 }, { "epoch": 0.8346803820720059, "grad_norm": 17.408825623095783, "learning_rate": 1.620135116724427e-08, "logits/chosen": -0.8369264602661133, "logits/rejected": -0.7914663553237915, "logps/chosen": -1004.7100219726562, "logps/rejected": -1143.3367919921875, "loss": 0.3466, "rewards/accuracies": 0.84375, "rewards/chosen": -5.365845680236816, "rewards/margins": 1.667271375656128, "rewards/rejected": -7.033117294311523, "step": 1278 }, { "epoch": 0.8353334966119683, "grad_norm": 18.46052855986182, "learning_rate": 1.607707127136734e-08, "logits/chosen": -0.9659842848777771, "logits/rejected": -0.9460926055908203, "logps/chosen": -1054.6634521484375, "logps/rejected": -1332.1539306640625, "loss": 0.3493, "rewards/accuracies": 0.90625, "rewards/chosen": -5.598286151885986, "rewards/margins": 2.5486865043640137, "rewards/rejected": -8.14697265625, "step": 1279 }, { "epoch": 0.8359866111519307, "grad_norm": 52.90133689155389, "learning_rate": 1.595322820534084e-08, "logits/chosen": -1.1675491333007812, "logits/rejected": -0.9613858461380005, "logps/chosen": -981.5598754882812, "logps/rejected": -1093.9095458984375, "loss": 0.4236, "rewards/accuracies": 0.8125, "rewards/chosen": -4.7773823738098145, "rewards/margins": 1.6107470989227295, "rewards/rejected": -6.388129234313965, "step": 1280 }, { "epoch": 0.8366397256918933, "grad_norm": 25.872062130935507, "learning_rate": 1.582982261378416e-08, "logits/chosen": -0.718327522277832, "logits/rejected": -0.7760236859321594, "logps/chosen": -1003.913330078125, "logps/rejected": -1260.5673828125, "loss": 0.3669, "rewards/accuracies": 0.84375, "rewards/chosen": -5.025420188903809, "rewards/margins": 2.33792781829834, "rewards/rejected": -7.36334753036499, "step": 1281 }, { "epoch": 0.8372928402318557, "grad_norm": 47.99050518823117, "learning_rate": 1.5706855139039598e-08, "logits/chosen": -0.6850143671035767, "logits/rejected": -0.7747923135757446, "logps/chosen": -880.8267211914062, "logps/rejected": -1138.4072265625, "loss": 0.3798, "rewards/accuracies": 0.875, "rewards/chosen": -4.495758533477783, "rewards/margins": 1.8549747467041016, "rewards/rejected": -6.350733280181885, "step": 1282 }, { "epoch": 0.8379459547718181, "grad_norm": 19.640501117831906, "learning_rate": 1.5584326421168982e-08, "logits/chosen": -0.9612534642219543, "logits/rejected": -0.9289946556091309, "logps/chosen": -1052.7708740234375, "logps/rejected": -1209.37353515625, "loss": 0.3761, "rewards/accuracies": 0.78125, "rewards/chosen": -5.764304161071777, "rewards/margins": 1.2603614330291748, "rewards/rejected": -7.024665832519531, "step": 1283 }, { "epoch": 0.8385990693117805, "grad_norm": 17.854998086406365, "learning_rate": 1.546223709795036e-08, "logits/chosen": -0.9502691626548767, "logits/rejected": -0.8256719708442688, "logps/chosen": -876.907470703125, "logps/rejected": -1027.9510498046875, "loss": 0.3932, "rewards/accuracies": 0.84375, "rewards/chosen": -4.2758636474609375, "rewards/margins": 1.5407800674438477, "rewards/rejected": -5.816644191741943, "step": 1284 }, { "epoch": 0.839252183851743, "grad_norm": 18.55849706082879, "learning_rate": 1.534058780487466e-08, "logits/chosen": -0.9169440269470215, "logits/rejected": -0.9256635904312134, "logps/chosen": -974.2261352539062, "logps/rejected": -1282.3883056640625, "loss": 0.3313, "rewards/accuracies": 0.90625, "rewards/chosen": -4.323151111602783, "rewards/margins": 2.5996651649475098, "rewards/rejected": -6.922817230224609, "step": 1285 }, { "epoch": 0.8399052983917055, "grad_norm": 17.71886089469172, "learning_rate": 1.5219379175142422e-08, "logits/chosen": -0.8969942927360535, "logits/rejected": -0.8221906423568726, "logps/chosen": -997.0826416015625, "logps/rejected": -1204.2249755859375, "loss": 0.3596, "rewards/accuracies": 0.9375, "rewards/chosen": -5.254983901977539, "rewards/margins": 2.350426197052002, "rewards/rejected": -7.605410575866699, "step": 1286 }, { "epoch": 0.8405584129316679, "grad_norm": 18.76637143478923, "learning_rate": 1.509861183966048e-08, "logits/chosen": -1.0163681507110596, "logits/rejected": -0.9551371932029724, "logps/chosen": -1005.5806884765625, "logps/rejected": -1197.47119140625, "loss": 0.3389, "rewards/accuracies": 0.90625, "rewards/chosen": -4.851705074310303, "rewards/margins": 2.037259340286255, "rewards/rejected": -6.8889641761779785, "step": 1287 }, { "epoch": 0.8412115274716303, "grad_norm": 26.75304121034945, "learning_rate": 1.49782864270386e-08, "logits/chosen": -0.8632940649986267, "logits/rejected": -0.8258289694786072, "logps/chosen": -895.1854858398438, "logps/rejected": -1046.0596923828125, "loss": 0.3176, "rewards/accuracies": 0.84375, "rewards/chosen": -4.798386573791504, "rewards/margins": 1.3754218816757202, "rewards/rejected": -6.1738080978393555, "step": 1288 }, { "epoch": 0.8418646420115928, "grad_norm": 24.281872431787555, "learning_rate": 1.4858403563586364e-08, "logits/chosen": -0.7966564893722534, "logits/rejected": -0.781533420085907, "logps/chosen": -933.5511474609375, "logps/rejected": -1309.903564453125, "loss": 0.3223, "rewards/accuracies": 0.90625, "rewards/chosen": -4.543218612670898, "rewards/margins": 2.614068031311035, "rewards/rejected": -7.157286167144775, "step": 1289 }, { "epoch": 0.8425177565515553, "grad_norm": 18.318840074907698, "learning_rate": 1.4738963873309796e-08, "logits/chosen": -1.0536984205245972, "logits/rejected": -0.9841817021369934, "logps/chosen": -1026.2513427734375, "logps/rejected": -1251.1527099609375, "loss": 0.4029, "rewards/accuracies": 0.90625, "rewards/chosen": -5.173938751220703, "rewards/margins": 2.2958824634552, "rewards/rejected": -7.469821453094482, "step": 1290 }, { "epoch": 0.8431708710915177, "grad_norm": 19.079583722272528, "learning_rate": 1.4619967977908154e-08, "logits/chosen": -0.920739471912384, "logits/rejected": -0.8924881815910339, "logps/chosen": -936.5213623046875, "logps/rejected": -1040.7528076171875, "loss": 0.3601, "rewards/accuracies": 0.8125, "rewards/chosen": -4.9877400398254395, "rewards/margins": 1.2131364345550537, "rewards/rejected": -6.200876235961914, "step": 1291 }, { "epoch": 0.8438239856314801, "grad_norm": 21.171536102950306, "learning_rate": 1.450141649677067e-08, "logits/chosen": -0.7656310796737671, "logits/rejected": -0.6925913095474243, "logps/chosen": -938.579833984375, "logps/rejected": -1122.44873046875, "loss": 0.3767, "rewards/accuracies": 0.8125, "rewards/chosen": -5.016373157501221, "rewards/margins": 1.5836703777313232, "rewards/rejected": -6.600043296813965, "step": 1292 }, { "epoch": 0.8444771001714426, "grad_norm": 26.59407222785762, "learning_rate": 1.4383310046973362e-08, "logits/chosen": -0.9900301694869995, "logits/rejected": -0.9108420610427856, "logps/chosen": -944.6455078125, "logps/rejected": -1131.0509033203125, "loss": 0.3737, "rewards/accuracies": 0.84375, "rewards/chosen": -4.449494361877441, "rewards/margins": 2.0268986225128174, "rewards/rejected": -6.476393222808838, "step": 1293 }, { "epoch": 0.845130214711405, "grad_norm": 21.349267385371785, "learning_rate": 1.426564924327578e-08, "logits/chosen": -0.7166858911514282, "logits/rejected": -0.7389101982116699, "logps/chosen": -960.0745849609375, "logps/rejected": -1161.590576171875, "loss": 0.3609, "rewards/accuracies": 0.84375, "rewards/chosen": -4.704523086547852, "rewards/margins": 1.972301721572876, "rewards/rejected": -6.676824569702148, "step": 1294 }, { "epoch": 0.8457833292513675, "grad_norm": 17.76499362390843, "learning_rate": 1.414843469811785e-08, "logits/chosen": -0.8625224232673645, "logits/rejected": -0.8022060394287109, "logps/chosen": -844.1168212890625, "logps/rejected": -1009.7998657226562, "loss": 0.297, "rewards/accuracies": 0.875, "rewards/chosen": -4.143881320953369, "rewards/margins": 1.9904934167861938, "rewards/rejected": -6.134374618530273, "step": 1295 }, { "epoch": 0.8464364437913299, "grad_norm": 27.604014088902886, "learning_rate": 1.4031667021616644e-08, "logits/chosen": -0.8415222764015198, "logits/rejected": -0.833803653717041, "logps/chosen": -940.93994140625, "logps/rejected": -1100.514892578125, "loss": 0.3987, "rewards/accuracies": 0.75, "rewards/chosen": -4.876412391662598, "rewards/margins": 1.3780218362808228, "rewards/rejected": -6.254434108734131, "step": 1296 }, { "epoch": 0.8470895583312924, "grad_norm": 30.61975390516564, "learning_rate": 1.3915346821563234e-08, "logits/chosen": -0.9406237602233887, "logits/rejected": -0.9669230580329895, "logps/chosen": -1041.82763671875, "logps/rejected": -1264.9764404296875, "loss": 0.4221, "rewards/accuracies": 0.84375, "rewards/chosen": -5.639151096343994, "rewards/margins": 2.355329990386963, "rewards/rejected": -7.994481563568115, "step": 1297 }, { "epoch": 0.8477426728712548, "grad_norm": 47.86314159060928, "learning_rate": 1.3799474703419511e-08, "logits/chosen": -0.7521982192993164, "logits/rejected": -0.7642690539360046, "logps/chosen": -1003.8787231445312, "logps/rejected": -1157.30029296875, "loss": 0.3954, "rewards/accuracies": 0.84375, "rewards/chosen": -5.433172225952148, "rewards/margins": 1.7837271690368652, "rewards/rejected": -7.216899394989014, "step": 1298 }, { "epoch": 0.8483957874112172, "grad_norm": 20.5044227740988, "learning_rate": 1.3684051270315056e-08, "logits/chosen": -0.8989154100418091, "logits/rejected": -0.892413854598999, "logps/chosen": -1036.949462890625, "logps/rejected": -1166.9058837890625, "loss": 0.3666, "rewards/accuracies": 0.8125, "rewards/chosen": -5.681972026824951, "rewards/margins": 1.3601738214492798, "rewards/rejected": -7.0421462059021, "step": 1299 }, { "epoch": 0.8490489019511797, "grad_norm": 20.765349915051292, "learning_rate": 1.356907712304397e-08, "logits/chosen": -0.8609282374382019, "logits/rejected": -0.8219651579856873, "logps/chosen": -944.8560180664062, "logps/rejected": -1174.7581787109375, "loss": 0.3058, "rewards/accuracies": 0.90625, "rewards/chosen": -5.124340057373047, "rewards/margins": 2.451206922531128, "rewards/rejected": -7.575546741485596, "step": 1300 }, { "epoch": 0.8490489019511797, "eval_logits/chosen": -0.668989896774292, "eval_logits/rejected": -0.6087003350257874, "eval_logps/chosen": -986.1474609375, "eval_logps/rejected": -1149.446533203125, "eval_loss": 0.3876606225967407, "eval_rewards/accuracies": 0.8040000200271606, "eval_rewards/chosen": -5.122402191162109, "eval_rewards/margins": 1.7203764915466309, "eval_rewards/rejected": -6.84277868270874, "eval_runtime": 615.7139, "eval_samples_per_second": 6.497, "eval_steps_per_second": 0.406, "step": 1300 }, { "epoch": 0.8497020164911422, "grad_norm": 23.618304831288146, "learning_rate": 1.3454552860061775e-08, "logits/chosen": -0.9938647150993347, "logits/rejected": -0.9096130728721619, "logps/chosen": -1019.2464599609375, "logps/rejected": -1136.9942626953125, "loss": 0.3821, "rewards/accuracies": 0.75, "rewards/chosen": -5.018831253051758, "rewards/margins": 1.5127394199371338, "rewards/rejected": -6.5315704345703125, "step": 1301 }, { "epoch": 0.8503551310311046, "grad_norm": 26.652680433445912, "learning_rate": 1.3340479077482269e-08, "logits/chosen": -0.6800895929336548, "logits/rejected": -0.5056519508361816, "logps/chosen": -1047.76904296875, "logps/rejected": -1239.179931640625, "loss": 0.3573, "rewards/accuracies": 0.84375, "rewards/chosen": -5.758296489715576, "rewards/margins": 2.2488903999328613, "rewards/rejected": -8.007186889648438, "step": 1302 }, { "epoch": 0.851008245571067, "grad_norm": 17.976290131053773, "learning_rate": 1.3226856369074469e-08, "logits/chosen": -0.9246777892112732, "logits/rejected": -0.8199312090873718, "logps/chosen": -956.479736328125, "logps/rejected": -1215.670166015625, "loss": 0.4105, "rewards/accuracies": 0.75, "rewards/chosen": -4.915981769561768, "rewards/margins": 2.1597423553466797, "rewards/rejected": -7.0757246017456055, "step": 1303 }, { "epoch": 0.8516613601110294, "grad_norm": 18.773093682434446, "learning_rate": 1.3113685326259449e-08, "logits/chosen": -0.9623532891273499, "logits/rejected": -0.9360837340354919, "logps/chosen": -991.7564086914062, "logps/rejected": -1178.57958984375, "loss": 0.3686, "rewards/accuracies": 0.84375, "rewards/chosen": -5.276458740234375, "rewards/margins": 1.8608438968658447, "rewards/rejected": -7.137301921844482, "step": 1304 }, { "epoch": 0.852314474650992, "grad_norm": 19.590640282587913, "learning_rate": 1.3000966538107338e-08, "logits/chosen": -0.9980742931365967, "logits/rejected": -0.8249813914299011, "logps/chosen": -965.131591796875, "logps/rejected": -1122.595703125, "loss": 0.3543, "rewards/accuracies": 0.84375, "rewards/chosen": -5.103146553039551, "rewards/margins": 2.172144889831543, "rewards/rejected": -7.2752909660339355, "step": 1305 }, { "epoch": 0.8529675891909544, "grad_norm": 23.660495897995975, "learning_rate": 1.2888700591334224e-08, "logits/chosen": -0.8977683782577515, "logits/rejected": -0.9450709819793701, "logps/chosen": -920.9202270507812, "logps/rejected": -1138.51513671875, "loss": 0.3775, "rewards/accuracies": 0.84375, "rewards/chosen": -5.005034446716309, "rewards/margins": 1.9082834720611572, "rewards/rejected": -6.913317680358887, "step": 1306 }, { "epoch": 0.8536207037309168, "grad_norm": 25.403831749047146, "learning_rate": 1.2776888070299074e-08, "logits/chosen": -0.692973256111145, "logits/rejected": -0.6692020297050476, "logps/chosen": -872.77392578125, "logps/rejected": -1088.57421875, "loss": 0.4124, "rewards/accuracies": 0.75, "rewards/chosen": -4.830261707305908, "rewards/margins": 2.1663036346435547, "rewards/rejected": -6.996564865112305, "step": 1307 }, { "epoch": 0.8542738182708792, "grad_norm": 36.11944114089828, "learning_rate": 1.2665529557000721e-08, "logits/chosen": -0.8878995776176453, "logits/rejected": -0.8831501007080078, "logps/chosen": -1004.2806396484375, "logps/rejected": -1308.138916015625, "loss": 0.3876, "rewards/accuracies": 0.9375, "rewards/chosen": -5.11844539642334, "rewards/margins": 2.552079916000366, "rewards/rejected": -7.670524597167969, "step": 1308 }, { "epoch": 0.8549269328108418, "grad_norm": 19.374431409557353, "learning_rate": 1.2554625631074845e-08, "logits/chosen": -0.9898017048835754, "logits/rejected": -0.8030941486358643, "logps/chosen": -979.740234375, "logps/rejected": -1153.956298828125, "loss": 0.4099, "rewards/accuracies": 0.875, "rewards/chosen": -5.388953685760498, "rewards/margins": 2.137970447540283, "rewards/rejected": -7.526924133300781, "step": 1309 }, { "epoch": 0.8555800473508042, "grad_norm": 19.549188171242683, "learning_rate": 1.2444176869790923e-08, "logits/chosen": -0.8904908895492554, "logits/rejected": -0.8610888719558716, "logps/chosen": -1027.5037841796875, "logps/rejected": -1165.888427734375, "loss": 0.3336, "rewards/accuracies": 0.84375, "rewards/chosen": -5.47878885269165, "rewards/margins": 1.3745296001434326, "rewards/rejected": -6.853318214416504, "step": 1310 }, { "epoch": 0.8562331618907666, "grad_norm": 32.18918359672568, "learning_rate": 1.2334183848049218e-08, "logits/chosen": -0.8858809471130371, "logits/rejected": -0.9060701727867126, "logps/chosen": -936.8377685546875, "logps/rejected": -1113.317626953125, "loss": 0.3508, "rewards/accuracies": 0.875, "rewards/chosen": -4.633793830871582, "rewards/margins": 1.7703123092651367, "rewards/rejected": -6.404106140136719, "step": 1311 }, { "epoch": 0.856886276430729, "grad_norm": 23.153513339145228, "learning_rate": 1.2224647138377852e-08, "logits/chosen": -0.879304826259613, "logits/rejected": -0.860927164554596, "logps/chosen": -909.355712890625, "logps/rejected": -1061.8111572265625, "loss": 0.4269, "rewards/accuracies": 0.8125, "rewards/chosen": -4.555379867553711, "rewards/margins": 1.5526858568191528, "rewards/rejected": -6.108066082000732, "step": 1312 }, { "epoch": 0.8575393909706915, "grad_norm": 19.303528656660927, "learning_rate": 1.2115567310929764e-08, "logits/chosen": -1.0215394496917725, "logits/rejected": -0.8691954612731934, "logps/chosen": -959.844970703125, "logps/rejected": -1066.708251953125, "loss": 0.346, "rewards/accuracies": 0.84375, "rewards/chosen": -4.928271770477295, "rewards/margins": 1.406358242034912, "rewards/rejected": -6.334630012512207, "step": 1313 }, { "epoch": 0.858192505510654, "grad_norm": 35.2373703782269, "learning_rate": 1.2006944933479757e-08, "logits/chosen": -0.93189936876297, "logits/rejected": -0.7885631918907166, "logps/chosen": -965.0560913085938, "logps/rejected": -1143.0811767578125, "loss": 0.3388, "rewards/accuracies": 0.875, "rewards/chosen": -4.93606424331665, "rewards/margins": 1.9708058834075928, "rewards/rejected": -6.906870365142822, "step": 1314 }, { "epoch": 0.8588456200506164, "grad_norm": 16.96897023927652, "learning_rate": 1.1898780571421552e-08, "logits/chosen": -0.7246195077896118, "logits/rejected": -0.5464783906936646, "logps/chosen": -1039.4256591796875, "logps/rejected": -1129.1058349609375, "loss": 0.3656, "rewards/accuracies": 0.75, "rewards/chosen": -5.497856616973877, "rewards/margins": 1.3049273490905762, "rewards/rejected": -6.802783966064453, "step": 1315 }, { "epoch": 0.8594987345905788, "grad_norm": 19.679916600270765, "learning_rate": 1.1791074787764843e-08, "logits/chosen": -1.0116046667099, "logits/rejected": -0.7844774723052979, "logps/chosen": -1002.8685913085938, "logps/rejected": -1154.26904296875, "loss": 0.3629, "rewards/accuracies": 0.9375, "rewards/chosen": -5.192471504211426, "rewards/margins": 2.362677812576294, "rewards/rejected": -7.555149555206299, "step": 1316 }, { "epoch": 0.8601518491305413, "grad_norm": 25.296782369673497, "learning_rate": 1.1683828143132357e-08, "logits/chosen": -0.830581545829773, "logits/rejected": -0.9192355275154114, "logps/chosen": -1129.3555908203125, "logps/rejected": -1313.74658203125, "loss": 0.4019, "rewards/accuracies": 0.875, "rewards/chosen": -6.07790994644165, "rewards/margins": 2.141233205795288, "rewards/rejected": -8.219143867492676, "step": 1317 }, { "epoch": 0.8608049636705037, "grad_norm": 29.317004046508643, "learning_rate": 1.1577041195756954e-08, "logits/chosen": -0.9937784671783447, "logits/rejected": -0.9926940202713013, "logps/chosen": -1031.589111328125, "logps/rejected": -1172.4385986328125, "loss": 0.3605, "rewards/accuracies": 0.78125, "rewards/chosen": -5.357040882110596, "rewards/margins": 1.5132580995559692, "rewards/rejected": -6.870299816131592, "step": 1318 }, { "epoch": 0.8614580782104662, "grad_norm": 16.67396538130868, "learning_rate": 1.1470714501478684e-08, "logits/chosen": -0.9257397651672363, "logits/rejected": -0.8267370462417603, "logps/chosen": -980.4049072265625, "logps/rejected": -1161.0723876953125, "loss": 0.3556, "rewards/accuracies": 0.90625, "rewards/chosen": -5.501986503601074, "rewards/margins": 1.8758596181869507, "rewards/rejected": -7.3778462409973145, "step": 1319 }, { "epoch": 0.8621111927504286, "grad_norm": 18.03638683569069, "learning_rate": 1.136484861374195e-08, "logits/chosen": -0.7538772225379944, "logits/rejected": -0.7899537086486816, "logps/chosen": -1040.8414306640625, "logps/rejected": -1305.1431884765625, "loss": 0.395, "rewards/accuracies": 0.8125, "rewards/chosen": -5.39848518371582, "rewards/margins": 2.092883586883545, "rewards/rejected": -7.491368770599365, "step": 1320 }, { "epoch": 0.862764307290391, "grad_norm": 25.436755545695352, "learning_rate": 1.1259444083592584e-08, "logits/chosen": -0.9210352897644043, "logits/rejected": -0.848932147026062, "logps/chosen": -1121.5576171875, "logps/rejected": -1202.206787109375, "loss": 0.3479, "rewards/accuracies": 0.78125, "rewards/chosen": -5.757541179656982, "rewards/margins": 1.2138102054595947, "rewards/rejected": -6.971351146697998, "step": 1321 }, { "epoch": 0.8634174218303535, "grad_norm": 17.62570364052275, "learning_rate": 1.115450145967497e-08, "logits/chosen": -0.8840048313140869, "logits/rejected": -0.7672009468078613, "logps/chosen": -902.815673828125, "logps/rejected": -1068.86279296875, "loss": 0.3754, "rewards/accuracies": 0.8125, "rewards/chosen": -4.672972202301025, "rewards/margins": 1.7041089534759521, "rewards/rejected": -6.377080917358398, "step": 1322 }, { "epoch": 0.864070536370316, "grad_norm": 20.05848285615795, "learning_rate": 1.1050021288229238e-08, "logits/chosen": -0.883508563041687, "logits/rejected": -0.9728587865829468, "logps/chosen": -922.3805541992188, "logps/rejected": -1065.12158203125, "loss": 0.3801, "rewards/accuracies": 0.78125, "rewards/chosen": -4.737578392028809, "rewards/margins": 1.3487595319747925, "rewards/rejected": -6.086338043212891, "step": 1323 }, { "epoch": 0.8647236509102784, "grad_norm": 17.24172271225209, "learning_rate": 1.094600411308838e-08, "logits/chosen": -0.9118102192878723, "logits/rejected": -0.8039119839668274, "logps/chosen": -939.2774047851562, "logps/rejected": -1240.79638671875, "loss": 0.366, "rewards/accuracies": 0.9375, "rewards/chosen": -4.814789295196533, "rewards/margins": 2.4467813968658447, "rewards/rejected": -7.261570930480957, "step": 1324 }, { "epoch": 0.8653767654502408, "grad_norm": 47.60072526382748, "learning_rate": 1.0842450475675447e-08, "logits/chosen": -0.8272143602371216, "logits/rejected": -0.7395732402801514, "logps/chosen": -982.828125, "logps/rejected": -1292.427001953125, "loss": 0.3915, "rewards/accuracies": 0.8125, "rewards/chosen": -5.002832889556885, "rewards/margins": 2.7571911811828613, "rewards/rejected": -7.760024070739746, "step": 1325 }, { "epoch": 0.8660298799902033, "grad_norm": 37.03434033578114, "learning_rate": 1.0739360915000684e-08, "logits/chosen": -0.6964623332023621, "logits/rejected": -0.765415370464325, "logps/chosen": -1048.430419921875, "logps/rejected": -1307.3927001953125, "loss": 0.3597, "rewards/accuracies": 0.875, "rewards/chosen": -5.442670822143555, "rewards/margins": 2.116424560546875, "rewards/rejected": -7.55909538269043, "step": 1326 }, { "epoch": 0.8666829945301657, "grad_norm": 24.87566087436779, "learning_rate": 1.0636735967658783e-08, "logits/chosen": -0.7556520104408264, "logits/rejected": -0.7704268097877502, "logps/chosen": -928.6072998046875, "logps/rejected": -1152.6773681640625, "loss": 0.4114, "rewards/accuracies": 0.84375, "rewards/chosen": -5.012763500213623, "rewards/margins": 2.03975510597229, "rewards/rejected": -7.052518844604492, "step": 1327 }, { "epoch": 0.8673361090701281, "grad_norm": 28.9166809841385, "learning_rate": 1.053457616782606e-08, "logits/chosen": -0.8666751384735107, "logits/rejected": -0.79831463098526, "logps/chosen": -938.53564453125, "logps/rejected": -1244.32177734375, "loss": 0.3871, "rewards/accuracies": 0.8125, "rewards/chosen": -4.967408180236816, "rewards/margins": 2.6878042221069336, "rewards/rejected": -7.65521240234375, "step": 1328 }, { "epoch": 0.8679892236100906, "grad_norm": 18.0419834090539, "learning_rate": 1.0432882047257662e-08, "logits/chosen": -0.7430148720741272, "logits/rejected": -0.725567102432251, "logps/chosen": -994.3425903320312, "logps/rejected": -1212.511962890625, "loss": 0.3318, "rewards/accuracies": 0.9375, "rewards/chosen": -5.153656482696533, "rewards/margins": 2.336616039276123, "rewards/rejected": -7.49027156829834, "step": 1329 }, { "epoch": 0.8686423381500531, "grad_norm": 26.724255517519683, "learning_rate": 1.0331654135284828e-08, "logits/chosen": -0.9742575883865356, "logits/rejected": -0.9050071239471436, "logps/chosen": -986.375732421875, "logps/rejected": -1160.0706787109375, "loss": 0.4229, "rewards/accuracies": 0.78125, "rewards/chosen": -5.533772945404053, "rewards/margins": 1.7066081762313843, "rewards/rejected": -7.240381717681885, "step": 1330 }, { "epoch": 0.8692954526900155, "grad_norm": 41.620788625763744, "learning_rate": 1.0230892958812121e-08, "logits/chosen": -0.7415061593055725, "logits/rejected": -0.7691062092781067, "logps/chosen": -1047.3677978515625, "logps/rejected": -1379.028076171875, "loss": 0.3797, "rewards/accuracies": 0.84375, "rewards/chosen": -5.393304824829102, "rewards/margins": 2.6148576736450195, "rewards/rejected": -8.008163452148438, "step": 1331 }, { "epoch": 0.8699485672299779, "grad_norm": 22.984334643739626, "learning_rate": 1.0130599042314692e-08, "logits/chosen": -0.9216421842575073, "logits/rejected": -0.8291821479797363, "logps/chosen": -998.3714599609375, "logps/rejected": -1113.957763671875, "loss": 0.3409, "rewards/accuracies": 0.71875, "rewards/chosen": -5.584649562835693, "rewards/margins": 1.312453269958496, "rewards/rejected": -6.8971028327941895, "step": 1332 }, { "epoch": 0.8706016817699403, "grad_norm": 24.900796453353124, "learning_rate": 1.0030772907835483e-08, "logits/chosen": -0.908165454864502, "logits/rejected": -0.8499115705490112, "logps/chosen": -1064.900634765625, "logps/rejected": -1277.716064453125, "loss": 0.3541, "rewards/accuracies": 0.75, "rewards/chosen": -5.384472370147705, "rewards/margins": 1.7705702781677246, "rewards/rejected": -7.155043125152588, "step": 1333 }, { "epoch": 0.8712547963099029, "grad_norm": 27.084076031409385, "learning_rate": 9.931415074982652e-09, "logits/chosen": -0.7743809819221497, "logits/rejected": -0.8256383538246155, "logps/chosen": -994.769287109375, "logps/rejected": -1325.2421875, "loss": 0.3249, "rewards/accuracies": 0.90625, "rewards/chosen": -5.3938307762146, "rewards/margins": 2.0958592891693115, "rewards/rejected": -7.489689826965332, "step": 1334 }, { "epoch": 0.8719079108498653, "grad_norm": 19.780940598640473, "learning_rate": 9.83252606092675e-09, "logits/chosen": -0.6853234767913818, "logits/rejected": -0.7641116976737976, "logps/chosen": -982.231689453125, "logps/rejected": -1285.2666015625, "loss": 0.322, "rewards/accuracies": 0.8125, "rewards/chosen": -5.414259910583496, "rewards/margins": 2.343747138977051, "rewards/rejected": -7.758007049560547, "step": 1335 }, { "epoch": 0.8725610253898277, "grad_norm": 21.680307984905237, "learning_rate": 9.734106380398022e-09, "logits/chosen": -0.7005580067634583, "logits/rejected": -0.6642061471939087, "logps/chosen": -933.6368408203125, "logps/rejected": -1211.7689208984375, "loss": 0.3904, "rewards/accuracies": 0.9375, "rewards/chosen": -4.866269111633301, "rewards/margins": 2.560360908508301, "rewards/rejected": -7.426629543304443, "step": 1336 }, { "epoch": 0.8732141399297901, "grad_norm": 31.102753169843872, "learning_rate": 9.636156545683883e-09, "logits/chosen": -1.025209665298462, "logits/rejected": -0.9281519651412964, "logps/chosen": -1016.0449829101562, "logps/rejected": -1132.5885009765625, "loss": 0.4202, "rewards/accuracies": 0.8125, "rewards/chosen": -5.55272912979126, "rewards/margins": 1.6035462617874146, "rewards/rejected": -7.156274795532227, "step": 1337 }, { "epoch": 0.8738672544697527, "grad_norm": 17.364332861748135, "learning_rate": 9.538677066626022e-09, "logits/chosen": -0.7068428993225098, "logits/rejected": -0.7502703666687012, "logps/chosen": -974.12939453125, "logps/rejected": -1213.2840576171875, "loss": 0.2782, "rewards/accuracies": 0.875, "rewards/chosen": -4.726061820983887, "rewards/margins": 2.2374606132507324, "rewards/rejected": -6.963522434234619, "step": 1338 }, { "epoch": 0.8745203690097151, "grad_norm": 17.620559884621457, "learning_rate": 9.441668450617923e-09, "logits/chosen": -0.9793184995651245, "logits/rejected": -0.9558190703392029, "logps/chosen": -965.9219970703125, "logps/rejected": -1112.312744140625, "loss": 0.3764, "rewards/accuracies": 0.71875, "rewards/chosen": -4.9571428298950195, "rewards/margins": 1.8213887214660645, "rewards/rejected": -6.778531551361084, "step": 1339 }, { "epoch": 0.8751734835496775, "grad_norm": 32.17387765620781, "learning_rate": 9.345131202602164e-09, "logits/chosen": -1.1392197608947754, "logits/rejected": -0.9780033826828003, "logps/chosen": -975.8745727539062, "logps/rejected": -1259.0980224609375, "loss": 0.3324, "rewards/accuracies": 0.875, "rewards/chosen": -4.840333461761475, "rewards/margins": 3.1771368980407715, "rewards/rejected": -8.017470359802246, "step": 1340 }, { "epoch": 0.8758265980896399, "grad_norm": 21.131034779855497, "learning_rate": 9.249065825067758e-09, "logits/chosen": -0.8789793848991394, "logits/rejected": -0.8509078621864319, "logps/chosen": -923.0077514648438, "logps/rejected": -1089.787353515625, "loss": 0.3321, "rewards/accuracies": 0.84375, "rewards/chosen": -4.57816219329834, "rewards/margins": 1.576073408126831, "rewards/rejected": -6.15423583984375, "step": 1341 }, { "epoch": 0.8764797126296024, "grad_norm": 29.509029464687337, "learning_rate": 9.153472818047625e-09, "logits/chosen": -0.8012582659721375, "logits/rejected": -0.8273895382881165, "logps/chosen": -1030.0450439453125, "logps/rejected": -1222.031494140625, "loss": 0.495, "rewards/accuracies": 0.71875, "rewards/chosen": -5.729059219360352, "rewards/margins": 1.3719710111618042, "rewards/rejected": -7.101030349731445, "step": 1342 }, { "epoch": 0.8771328271695649, "grad_norm": 18.846614157308878, "learning_rate": 9.058352679115877e-09, "logits/chosen": -1.0124015808105469, "logits/rejected": -1.0128324031829834, "logps/chosen": -1058.6331787109375, "logps/rejected": -1228.419189453125, "loss": 0.2927, "rewards/accuracies": 0.8125, "rewards/chosen": -5.294768333435059, "rewards/margins": 1.7898911237716675, "rewards/rejected": -7.084660053253174, "step": 1343 }, { "epoch": 0.8777859417095273, "grad_norm": 24.392505614425158, "learning_rate": 8.963705903385343e-09, "logits/chosen": -0.7756754159927368, "logits/rejected": -0.7199742197990417, "logps/chosen": -916.379150390625, "logps/rejected": -1156.7076416015625, "loss": 0.4262, "rewards/accuracies": 0.90625, "rewards/chosen": -4.617804050445557, "rewards/margins": 2.4423699378967285, "rewards/rejected": -7.060173034667969, "step": 1344 }, { "epoch": 0.8784390562494897, "grad_norm": 26.689071599825724, "learning_rate": 8.869532983504857e-09, "logits/chosen": -0.6491687297821045, "logits/rejected": -0.6166170835494995, "logps/chosen": -994.8203125, "logps/rejected": -1133.805908203125, "loss": 0.3631, "rewards/accuracies": 0.78125, "rewards/chosen": -5.721258163452148, "rewards/margins": 1.3589121103286743, "rewards/rejected": -7.080170154571533, "step": 1345 }, { "epoch": 0.8790921707894522, "grad_norm": 20.776260248385395, "learning_rate": 8.775834409656858e-09, "logits/chosen": -0.8777629137039185, "logits/rejected": -0.7253664135932922, "logps/chosen": -1020.5516357421875, "logps/rejected": -1153.7655029296875, "loss": 0.3426, "rewards/accuracies": 0.71875, "rewards/chosen": -5.246578216552734, "rewards/margins": 1.892094612121582, "rewards/rejected": -7.138672828674316, "step": 1346 }, { "epoch": 0.8797452853294147, "grad_norm": 17.81070478069546, "learning_rate": 8.68261066955468e-09, "logits/chosen": -0.9101294279098511, "logits/rejected": -0.9133301973342896, "logps/chosen": -1000.2833862304688, "logps/rejected": -1239.5137939453125, "loss": 0.3019, "rewards/accuracies": 0.9375, "rewards/chosen": -5.050441741943359, "rewards/margins": 2.5203206539154053, "rewards/rejected": -7.570762634277344, "step": 1347 }, { "epoch": 0.8803983998693771, "grad_norm": 30.303361478460022, "learning_rate": 8.589862248440139e-09, "logits/chosen": -0.9815719723701477, "logits/rejected": -0.9545872211456299, "logps/chosen": -950.197021484375, "logps/rejected": -1092.115966796875, "loss": 0.4662, "rewards/accuracies": 0.78125, "rewards/chosen": -4.594487190246582, "rewards/margins": 1.228310465812683, "rewards/rejected": -5.8227972984313965, "step": 1348 }, { "epoch": 0.8810515144093395, "grad_norm": 33.14583667447738, "learning_rate": 8.497589629080925e-09, "logits/chosen": -0.825454592704773, "logits/rejected": -0.792005717754364, "logps/chosen": -964.4857177734375, "logps/rejected": -1159.8331298828125, "loss": 0.3974, "rewards/accuracies": 0.875, "rewards/chosen": -4.539005756378174, "rewards/margins": 2.0518078804016113, "rewards/rejected": -6.590812683105469, "step": 1349 }, { "epoch": 0.881704628949302, "grad_norm": 31.91327189244124, "learning_rate": 8.405793291768126e-09, "logits/chosen": -0.7423689365386963, "logits/rejected": -0.7918304800987244, "logps/chosen": -929.157470703125, "logps/rejected": -1141.266845703125, "loss": 0.3452, "rewards/accuracies": 0.875, "rewards/chosen": -5.143529891967773, "rewards/margins": 1.6667006015777588, "rewards/rejected": -6.810230255126953, "step": 1350 }, { "epoch": 0.8823577434892644, "grad_norm": 14.811345823375834, "learning_rate": 8.314473714313718e-09, "logits/chosen": -0.89141845703125, "logits/rejected": -0.8798700571060181, "logps/chosen": -1057.8790283203125, "logps/rejected": -1344.7864990234375, "loss": 0.2692, "rewards/accuracies": 0.875, "rewards/chosen": -5.591994285583496, "rewards/margins": 2.79272198677063, "rewards/rejected": -8.384716033935547, "step": 1351 }, { "epoch": 0.8830108580292269, "grad_norm": 22.26665699973362, "learning_rate": 8.223631372048068e-09, "logits/chosen": -0.8326403498649597, "logits/rejected": -0.8692911863327026, "logps/chosen": -990.7032470703125, "logps/rejected": -1251.9947509765625, "loss": 0.3025, "rewards/accuracies": 0.78125, "rewards/chosen": -4.864250659942627, "rewards/margins": 2.1364824771881104, "rewards/rejected": -7.000733375549316, "step": 1352 }, { "epoch": 0.8836639725691893, "grad_norm": 31.552116884148052, "learning_rate": 8.13326673781748e-09, "logits/chosen": -0.8896104097366333, "logits/rejected": -0.7196336388587952, "logps/chosen": -971.0677490234375, "logps/rejected": -1149.3621826171875, "loss": 0.4401, "rewards/accuracies": 0.78125, "rewards/chosen": -4.905688285827637, "rewards/margins": 2.0950686931610107, "rewards/rejected": -7.000757694244385, "step": 1353 }, { "epoch": 0.8843170871091518, "grad_norm": 17.181228988826543, "learning_rate": 8.043380281981738e-09, "logits/chosen": -0.9177496433258057, "logits/rejected": -0.7978098392486572, "logps/chosen": -1084.5869140625, "logps/rejected": -1308.2545166015625, "loss": 0.3041, "rewards/accuracies": 0.84375, "rewards/chosen": -5.60262393951416, "rewards/margins": 2.6088032722473145, "rewards/rejected": -8.211426734924316, "step": 1354 }, { "epoch": 0.8849702016491142, "grad_norm": 19.23410183751255, "learning_rate": 7.953972472411651e-09, "logits/chosen": -0.9430975914001465, "logits/rejected": -0.9466529488563538, "logps/chosen": -983.9346313476562, "logps/rejected": -1212.6507568359375, "loss": 0.328, "rewards/accuracies": 0.84375, "rewards/chosen": -5.656593322753906, "rewards/margins": 1.8082714080810547, "rewards/rejected": -7.464864730834961, "step": 1355 }, { "epoch": 0.8856233161890766, "grad_norm": 21.918588042524284, "learning_rate": 7.865043774486546e-09, "logits/chosen": -0.853823184967041, "logits/rejected": -0.7541569471359253, "logps/chosen": -981.8515014648438, "logps/rejected": -1045.3902587890625, "loss": 0.3729, "rewards/accuracies": 0.71875, "rewards/chosen": -5.2487664222717285, "rewards/margins": 1.0395838022232056, "rewards/rejected": -6.288350582122803, "step": 1356 }, { "epoch": 0.886276430729039, "grad_norm": 38.12880855883742, "learning_rate": 7.776594651091994e-09, "logits/chosen": -1.0198040008544922, "logits/rejected": -0.9592865705490112, "logps/chosen": -1032.7816162109375, "logps/rejected": -1200.517578125, "loss": 0.3652, "rewards/accuracies": 0.875, "rewards/chosen": -5.793912410736084, "rewards/margins": 1.8678948879241943, "rewards/rejected": -7.661807060241699, "step": 1357 }, { "epoch": 0.8869295452690016, "grad_norm": 34.33999032738849, "learning_rate": 7.688625562617256e-09, "logits/chosen": -0.9091510772705078, "logits/rejected": -0.8259568214416504, "logps/chosen": -914.729248046875, "logps/rejected": -1010.8964233398438, "loss": 0.4656, "rewards/accuracies": 0.75, "rewards/chosen": -4.5010151863098145, "rewards/margins": 1.2648601531982422, "rewards/rejected": -5.765875816345215, "step": 1358 }, { "epoch": 0.887582659808964, "grad_norm": 33.81711789536401, "learning_rate": 7.601136966953003e-09, "logits/chosen": -0.9259005188941956, "logits/rejected": -0.8752458691596985, "logps/chosen": -964.3880004882812, "logps/rejected": -1239.1573486328125, "loss": 0.4209, "rewards/accuracies": 0.84375, "rewards/chosen": -5.050426006317139, "rewards/margins": 2.616529703140259, "rewards/rejected": -7.666955947875977, "step": 1359 }, { "epoch": 0.8882357743489264, "grad_norm": 24.065628566506295, "learning_rate": 7.514129319488837e-09, "logits/chosen": -0.8353631496429443, "logits/rejected": -0.7178384065628052, "logps/chosen": -931.4916381835938, "logps/rejected": -1108.391357421875, "loss": 0.3406, "rewards/accuracies": 0.84375, "rewards/chosen": -4.812725067138672, "rewards/margins": 1.8928982019424438, "rewards/rejected": -6.705623626708984, "step": 1360 }, { "epoch": 0.8888888888888888, "grad_norm": 17.622810894307673, "learning_rate": 7.427603073110966e-09, "logits/chosen": -0.8384911417961121, "logits/rejected": -0.7364537119865417, "logps/chosen": -1067.7464599609375, "logps/rejected": -1383.726318359375, "loss": 0.2551, "rewards/accuracies": 0.96875, "rewards/chosen": -5.4259748458862305, "rewards/margins": 3.5197863578796387, "rewards/rejected": -8.945761680603027, "step": 1361 }, { "epoch": 0.8895420034288514, "grad_norm": 24.007523464336668, "learning_rate": 7.341558678199866e-09, "logits/chosen": -0.8948401212692261, "logits/rejected": -0.7564103603363037, "logps/chosen": -1035.5931396484375, "logps/rejected": -1149.049560546875, "loss": 0.3504, "rewards/accuracies": 0.84375, "rewards/chosen": -5.795162200927734, "rewards/margins": 1.4507133960723877, "rewards/rejected": -7.245875358581543, "step": 1362 }, { "epoch": 0.8901951179688138, "grad_norm": 31.397527914915205, "learning_rate": 7.2559965826278765e-09, "logits/chosen": -0.9367034435272217, "logits/rejected": -0.8936917185783386, "logps/chosen": -965.4476318359375, "logps/rejected": -1156.2060546875, "loss": 0.3125, "rewards/accuracies": 0.90625, "rewards/chosen": -5.646866798400879, "rewards/margins": 1.8237485885620117, "rewards/rejected": -7.470614910125732, "step": 1363 }, { "epoch": 0.8908482325087762, "grad_norm": 32.492551038820444, "learning_rate": 7.170917231756957e-09, "logits/chosen": -0.7623252868652344, "logits/rejected": -0.699958086013794, "logps/chosen": -1050.454345703125, "logps/rejected": -1148.5802001953125, "loss": 0.4612, "rewards/accuracies": 0.8125, "rewards/chosen": -5.705660343170166, "rewards/margins": 1.5220611095428467, "rewards/rejected": -7.227721214294434, "step": 1364 }, { "epoch": 0.8915013470487386, "grad_norm": 22.366295992357657, "learning_rate": 7.0863210684362514e-09, "logits/chosen": -0.750536322593689, "logits/rejected": -0.6350942254066467, "logps/chosen": -1094.57421875, "logps/rejected": -1305.535400390625, "loss": 0.3831, "rewards/accuracies": 0.71875, "rewards/chosen": -6.483131408691406, "rewards/margins": 2.0676817893981934, "rewards/rejected": -8.550813674926758, "step": 1365 }, { "epoch": 0.8921544615887012, "grad_norm": 20.289457072065797, "learning_rate": 7.002208532999931e-09, "logits/chosen": -0.9724587202072144, "logits/rejected": -0.9823508262634277, "logps/chosen": -1013.5618896484375, "logps/rejected": -1186.172119140625, "loss": 0.355, "rewards/accuracies": 0.84375, "rewards/chosen": -5.142480850219727, "rewards/margins": 1.8304882049560547, "rewards/rejected": -6.972969055175781, "step": 1366 }, { "epoch": 0.8928075761286636, "grad_norm": 51.43639219938111, "learning_rate": 6.918580063264723e-09, "logits/chosen": -0.948373556137085, "logits/rejected": -0.855690598487854, "logps/chosen": -1088.060302734375, "logps/rejected": -1273.875732421875, "loss": 0.4293, "rewards/accuracies": 0.84375, "rewards/chosen": -5.645302772521973, "rewards/margins": 2.207998514175415, "rewards/rejected": -7.853301048278809, "step": 1367 }, { "epoch": 0.893460690668626, "grad_norm": 23.319891219088692, "learning_rate": 6.835436094527802e-09, "logits/chosen": -0.8042424917221069, "logits/rejected": -0.7829912304878235, "logps/chosen": -1005.8231201171875, "logps/rejected": -1181.2496337890625, "loss": 0.3815, "rewards/accuracies": 1.0, "rewards/chosen": -5.598033428192139, "rewards/margins": 1.9655909538269043, "rewards/rejected": -7.563624858856201, "step": 1368 }, { "epoch": 0.8941138052085884, "grad_norm": 21.36102394772282, "learning_rate": 6.75277705956443e-09, "logits/chosen": -0.6407432556152344, "logits/rejected": -0.764612078666687, "logps/chosen": -1037.61474609375, "logps/rejected": -1353.3521728515625, "loss": 0.3952, "rewards/accuracies": 0.96875, "rewards/chosen": -5.1136603355407715, "rewards/margins": 2.811863899230957, "rewards/rejected": -7.925524711608887, "step": 1369 }, { "epoch": 0.8947669197485509, "grad_norm": 28.704910489557932, "learning_rate": 6.670603388625729e-09, "logits/chosen": -0.9759268760681152, "logits/rejected": -1.0759167671203613, "logps/chosen": -1123.990966796875, "logps/rejected": -1394.31591796875, "loss": 0.4414, "rewards/accuracies": 0.84375, "rewards/chosen": -5.98316764831543, "rewards/margins": 1.978651523590088, "rewards/rejected": -7.961818695068359, "step": 1370 }, { "epoch": 0.8954200342885134, "grad_norm": 37.37125749087838, "learning_rate": 6.588915509436422e-09, "logits/chosen": -0.7405807375907898, "logits/rejected": -0.6690051555633545, "logps/chosen": -972.5208129882812, "logps/rejected": -1203.4818115234375, "loss": 0.3805, "rewards/accuracies": 0.84375, "rewards/chosen": -5.685582160949707, "rewards/margins": 1.8616018295288086, "rewards/rejected": -7.547183990478516, "step": 1371 }, { "epoch": 0.8960731488284758, "grad_norm": 20.307666787607005, "learning_rate": 6.507713847192642e-09, "logits/chosen": -0.8488802909851074, "logits/rejected": -0.8470434546470642, "logps/chosen": -920.7410278320312, "logps/rejected": -1137.112060546875, "loss": 0.2597, "rewards/accuracies": 0.9375, "rewards/chosen": -5.060013771057129, "rewards/margins": 2.1794750690460205, "rewards/rejected": -7.239489555358887, "step": 1372 }, { "epoch": 0.8967262633684382, "grad_norm": 43.88372556931514, "learning_rate": 6.4269988245596705e-09, "logits/chosen": -1.0249723196029663, "logits/rejected": -1.007939338684082, "logps/chosen": -1000.8651123046875, "logps/rejected": -1186.3309326171875, "loss": 0.3355, "rewards/accuracies": 0.84375, "rewards/chosen": -5.083978176116943, "rewards/margins": 1.7574626207351685, "rewards/rejected": -6.8414411544799805, "step": 1373 }, { "epoch": 0.8973793779084007, "grad_norm": 24.61215223955591, "learning_rate": 6.3467708616698255e-09, "logits/chosen": -1.024997591972351, "logits/rejected": -0.9208765625953674, "logps/chosen": -1024.623779296875, "logps/rejected": -1188.1405029296875, "loss": 0.3648, "rewards/accuracies": 0.8125, "rewards/chosen": -4.875184535980225, "rewards/margins": 1.9895105361938477, "rewards/rejected": -6.864694595336914, "step": 1374 }, { "epoch": 0.8980324924483631, "grad_norm": 23.059001195470955, "learning_rate": 6.267030376120152e-09, "logits/chosen": -0.9334471821784973, "logits/rejected": -0.8829833269119263, "logps/chosen": -958.0858154296875, "logps/rejected": -1172.60546875, "loss": 0.3587, "rewards/accuracies": 0.875, "rewards/chosen": -4.882957458496094, "rewards/margins": 1.8839558362960815, "rewards/rejected": -6.766912937164307, "step": 1375 }, { "epoch": 0.8986856069883256, "grad_norm": 31.968479083377638, "learning_rate": 6.187777782970338e-09, "logits/chosen": -1.0105090141296387, "logits/rejected": -0.9062252640724182, "logps/chosen": -1128.7545166015625, "logps/rejected": -1256.02490234375, "loss": 0.4209, "rewards/accuracies": 0.9375, "rewards/chosen": -6.249847888946533, "rewards/margins": 1.6132521629333496, "rewards/rejected": -7.863100528717041, "step": 1376 }, { "epoch": 0.899338721528288, "grad_norm": 20.840264483151362, "learning_rate": 6.109013494740522e-09, "logits/chosen": -0.8968515992164612, "logits/rejected": -0.8042609691619873, "logps/chosen": -956.0653686523438, "logps/rejected": -1098.1114501953125, "loss": 0.3802, "rewards/accuracies": 0.84375, "rewards/chosen": -5.395586013793945, "rewards/margins": 1.7047362327575684, "rewards/rejected": -7.1003217697143555, "step": 1377 }, { "epoch": 0.8999918360682505, "grad_norm": 22.924320591500365, "learning_rate": 6.030737921409168e-09, "logits/chosen": -0.9722306132316589, "logits/rejected": -0.8865904808044434, "logps/chosen": -942.35791015625, "logps/rejected": -1074.9583740234375, "loss": 0.3486, "rewards/accuracies": 0.8125, "rewards/chosen": -5.057037353515625, "rewards/margins": 1.712819218635559, "rewards/rejected": -6.769855976104736, "step": 1378 }, { "epoch": 0.9006449506082129, "grad_norm": 25.553546509337146, "learning_rate": 5.952951470410872e-09, "logits/chosen": -0.848723292350769, "logits/rejected": -0.8915231227874756, "logps/chosen": -1017.4171142578125, "logps/rejected": -1173.70166015625, "loss": 0.3365, "rewards/accuracies": 0.8125, "rewards/chosen": -5.365781307220459, "rewards/margins": 1.5210890769958496, "rewards/rejected": -6.886870861053467, "step": 1379 }, { "epoch": 0.9012980651481753, "grad_norm": 25.539039082715245, "learning_rate": 5.875654546634334e-09, "logits/chosen": -1.0224645137786865, "logits/rejected": -0.9910560846328735, "logps/chosen": -890.1279907226562, "logps/rejected": -1026.0372314453125, "loss": 0.4851, "rewards/accuracies": 0.78125, "rewards/chosen": -4.696094989776611, "rewards/margins": 1.3432387113571167, "rewards/rejected": -6.039333820343018, "step": 1380 }, { "epoch": 0.9019511796881378, "grad_norm": 31.968150161581548, "learning_rate": 5.798847552420183e-09, "logits/chosen": -0.7949383854866028, "logits/rejected": -0.8956403732299805, "logps/chosen": -938.8556518554688, "logps/rejected": -1220.24658203125, "loss": 0.35, "rewards/accuracies": 0.96875, "rewards/chosen": -5.045302391052246, "rewards/margins": 2.233097553253174, "rewards/rejected": -7.27839994430542, "step": 1381 }, { "epoch": 0.9026042942281003, "grad_norm": 24.45518586693916, "learning_rate": 5.722530887558874e-09, "logits/chosen": -0.8497747182846069, "logits/rejected": -0.7980440258979797, "logps/chosen": -941.43310546875, "logps/rejected": -1097.9815673828125, "loss": 0.3532, "rewards/accuracies": 0.90625, "rewards/chosen": -5.3820109367370605, "rewards/margins": 1.8423418998718262, "rewards/rejected": -7.224352836608887, "step": 1382 }, { "epoch": 0.9032574087680627, "grad_norm": 22.862968219213002, "learning_rate": 5.646704949288683e-09, "logits/chosen": -0.9405361413955688, "logits/rejected": -0.8489739298820496, "logps/chosen": -985.428955078125, "logps/rejected": -1097.078857421875, "loss": 0.4181, "rewards/accuracies": 0.78125, "rewards/chosen": -5.368486404418945, "rewards/margins": 1.47150456905365, "rewards/rejected": -6.839991569519043, "step": 1383 }, { "epoch": 0.9039105233080251, "grad_norm": 15.920740655238594, "learning_rate": 5.571370132293552e-09, "logits/chosen": -0.9731810092926025, "logits/rejected": -0.9640232920646667, "logps/chosen": -991.5859375, "logps/rejected": -1153.059326171875, "loss": 0.3359, "rewards/accuracies": 0.90625, "rewards/chosen": -5.1485514640808105, "rewards/margins": 1.600454330444336, "rewards/rejected": -6.749006271362305, "step": 1384 }, { "epoch": 0.9045636378479875, "grad_norm": 40.67819573629027, "learning_rate": 5.496526828701075e-09, "logits/chosen": -0.9371462464332581, "logits/rejected": -0.9148290753364563, "logps/chosen": -1122.163818359375, "logps/rejected": -1289.2926025390625, "loss": 0.4809, "rewards/accuracies": 0.84375, "rewards/chosen": -6.275271892547607, "rewards/margins": 1.8808785676956177, "rewards/rejected": -8.156150817871094, "step": 1385 }, { "epoch": 0.9052167523879501, "grad_norm": 25.740903240918104, "learning_rate": 5.4221754280804774e-09, "logits/chosen": -0.9231817722320557, "logits/rejected": -0.8849901556968689, "logps/chosen": -1015.8035888671875, "logps/rejected": -1216.7371826171875, "loss": 0.3014, "rewards/accuracies": 0.96875, "rewards/chosen": -5.52568244934082, "rewards/margins": 1.806875467300415, "rewards/rejected": -7.3325581550598145, "step": 1386 }, { "epoch": 0.9058698669279125, "grad_norm": 32.25704450398711, "learning_rate": 5.348316317440549e-09, "logits/chosen": -0.8346178531646729, "logits/rejected": -0.8496906757354736, "logps/chosen": -1041.9498291015625, "logps/rejected": -1215.4976806640625, "loss": 0.4455, "rewards/accuracies": 0.6875, "rewards/chosen": -5.537223815917969, "rewards/margins": 1.5639135837554932, "rewards/rejected": -7.101138114929199, "step": 1387 }, { "epoch": 0.9065229814678749, "grad_norm": 22.898299911399935, "learning_rate": 5.274949881227641e-09, "logits/chosen": -0.8780845403671265, "logits/rejected": -0.8350746631622314, "logps/chosen": -926.8447875976562, "logps/rejected": -1088.592529296875, "loss": 0.3918, "rewards/accuracies": 0.875, "rewards/chosen": -4.732271194458008, "rewards/margins": 1.4746408462524414, "rewards/rejected": -6.206912040710449, "step": 1388 }, { "epoch": 0.9071760960078373, "grad_norm": 25.38924080412644, "learning_rate": 5.202076501323694e-09, "logits/chosen": -0.9332727789878845, "logits/rejected": -0.876400887966156, "logps/chosen": -1018.9508056640625, "logps/rejected": -1312.7470703125, "loss": 0.4043, "rewards/accuracies": 0.84375, "rewards/chosen": -5.826478004455566, "rewards/margins": 2.748919725418091, "rewards/rejected": -8.575397491455078, "step": 1389 }, { "epoch": 0.9078292105477999, "grad_norm": 28.770762632476412, "learning_rate": 5.129696557044172e-09, "logits/chosen": -0.9943676590919495, "logits/rejected": -0.8872275948524475, "logps/chosen": -1113.85205078125, "logps/rejected": -1228.1134033203125, "loss": 0.4223, "rewards/accuracies": 0.84375, "rewards/chosen": -6.0906147956848145, "rewards/margins": 1.2218999862670898, "rewards/rejected": -7.312514781951904, "step": 1390 }, { "epoch": 0.9084823250877623, "grad_norm": 23.730110125114983, "learning_rate": 5.057810425136189e-09, "logits/chosen": -1.0045166015625, "logits/rejected": -0.9036121964454651, "logps/chosen": -982.4205322265625, "logps/rejected": -1079.776123046875, "loss": 0.3588, "rewards/accuracies": 0.6875, "rewards/chosen": -5.33574104309082, "rewards/margins": 1.3380316495895386, "rewards/rejected": -6.673772811889648, "step": 1391 }, { "epoch": 0.9091354396277247, "grad_norm": 27.63473440877699, "learning_rate": 4.98641847977651e-09, "logits/chosen": -0.7540165781974792, "logits/rejected": -0.7030526399612427, "logps/chosen": -1042.4677734375, "logps/rejected": -1347.1019287109375, "loss": 0.3761, "rewards/accuracies": 0.8125, "rewards/chosen": -5.64056396484375, "rewards/margins": 2.689081907272339, "rewards/rejected": -8.329645156860352, "step": 1392 }, { "epoch": 0.9097885541676871, "grad_norm": 20.262425063605342, "learning_rate": 4.915521092569552e-09, "logits/chosen": -0.8860914707183838, "logits/rejected": -0.943866491317749, "logps/chosen": -1029.2822265625, "logps/rejected": -1265.9954833984375, "loss": 0.3605, "rewards/accuracies": 0.9375, "rewards/chosen": -6.087381839752197, "rewards/margins": 2.124335527420044, "rewards/rejected": -8.21171760559082, "step": 1393 }, { "epoch": 0.9104416687076496, "grad_norm": 25.627440100114217, "learning_rate": 4.845118632545531e-09, "logits/chosen": -0.9025149345397949, "logits/rejected": -0.9109005928039551, "logps/chosen": -976.8741455078125, "logps/rejected": -1393.4610595703125, "loss": 0.352, "rewards/accuracies": 0.90625, "rewards/chosen": -5.313702583312988, "rewards/margins": 3.813965082168579, "rewards/rejected": -9.127667427062988, "step": 1394 }, { "epoch": 0.9110947832476121, "grad_norm": 22.8158973952252, "learning_rate": 4.775211466158469e-09, "logits/chosen": -0.9520535469055176, "logits/rejected": -0.926558792591095, "logps/chosen": -991.7243041992188, "logps/rejected": -1153.231201171875, "loss": 0.3798, "rewards/accuracies": 0.90625, "rewards/chosen": -5.2474493980407715, "rewards/margins": 2.104310989379883, "rewards/rejected": -7.351759910583496, "step": 1395 }, { "epoch": 0.9117478977875745, "grad_norm": 31.595434065769368, "learning_rate": 4.705799957284351e-09, "logits/chosen": -0.9866628050804138, "logits/rejected": -0.9612003564834595, "logps/chosen": -898.5245361328125, "logps/rejected": -1169.009521484375, "loss": 0.399, "rewards/accuracies": 0.9375, "rewards/chosen": -4.514897346496582, "rewards/margins": 2.6222004890441895, "rewards/rejected": -7.1370978355407715, "step": 1396 }, { "epoch": 0.9124010123275369, "grad_norm": 26.555330502651298, "learning_rate": 4.636884467219171e-09, "logits/chosen": -1.0796997547149658, "logits/rejected": -0.954846203327179, "logps/chosen": -1147.51611328125, "logps/rejected": -1267.271240234375, "loss": 0.4135, "rewards/accuracies": 0.875, "rewards/chosen": -5.350531578063965, "rewards/margins": 1.6353040933609009, "rewards/rejected": -6.985836505889893, "step": 1397 }, { "epoch": 0.9130541268674994, "grad_norm": 21.158609964601236, "learning_rate": 4.568465354677087e-09, "logits/chosen": -0.9235592484474182, "logits/rejected": -0.8905725479125977, "logps/chosen": -957.9666137695312, "logps/rejected": -1054.5474853515625, "loss": 0.3576, "rewards/accuracies": 0.8125, "rewards/chosen": -5.137127876281738, "rewards/margins": 1.1683361530303955, "rewards/rejected": -6.305464267730713, "step": 1398 }, { "epoch": 0.9137072414074618, "grad_norm": 18.97231329613263, "learning_rate": 4.500542975788579e-09, "logits/chosen": -0.8841540217399597, "logits/rejected": -0.9338924288749695, "logps/chosen": -904.0048828125, "logps/rejected": -1080.072021484375, "loss": 0.2991, "rewards/accuracies": 0.875, "rewards/chosen": -4.445211410522461, "rewards/margins": 1.6414602994918823, "rewards/rejected": -6.086671352386475, "step": 1399 }, { "epoch": 0.9143603559474243, "grad_norm": 19.43570834743748, "learning_rate": 4.433117684098508e-09, "logits/chosen": -0.9700881242752075, "logits/rejected": -0.8894848823547363, "logps/chosen": -919.09375, "logps/rejected": -1196.0753173828125, "loss": 0.3467, "rewards/accuracies": 0.875, "rewards/chosen": -5.059053897857666, "rewards/margins": 2.5139126777648926, "rewards/rejected": -7.572966575622559, "step": 1400 }, { "epoch": 0.9143603559474243, "eval_logits/chosen": -0.6672475337982178, "eval_logits/rejected": -0.6070554256439209, "eval_logps/chosen": -997.260986328125, "eval_logps/rejected": -1163.262939453125, "eval_loss": 0.38706615567207336, "eval_rewards/accuracies": 0.8090000152587891, "eval_rewards/chosen": -5.233537673950195, "eval_rewards/margins": 1.7474075555801392, "eval_rewards/rejected": -6.980944633483887, "eval_runtime": 616.0403, "eval_samples_per_second": 6.493, "eval_steps_per_second": 0.406, "step": 1400 }, { "epoch": 0.9150134704873867, "grad_norm": 21.623147531365383, "learning_rate": 4.366189830564393e-09, "logits/chosen": -0.8763200640678406, "logits/rejected": -0.7712318897247314, "logps/chosen": -994.0523071289062, "logps/rejected": -1056.1307373046875, "loss": 0.4043, "rewards/accuracies": 0.71875, "rewards/chosen": -5.576797008514404, "rewards/margins": 0.9099751710891724, "rewards/rejected": -6.486772060394287, "step": 1401 }, { "epoch": 0.9156665850273492, "grad_norm": 23.79888391402557, "learning_rate": 4.299759763554456e-09, "logits/chosen": -0.9876704216003418, "logits/rejected": -0.9826672077178955, "logps/chosen": -1038.6239013671875, "logps/rejected": -1214.2568359375, "loss": 0.379, "rewards/accuracies": 0.84375, "rewards/chosen": -5.297689437866211, "rewards/margins": 1.9423954486846924, "rewards/rejected": -7.240084171295166, "step": 1402 }, { "epoch": 0.9163196995673116, "grad_norm": 21.82743599285643, "learning_rate": 4.233827828845915e-09, "logits/chosen": -0.7085953950881958, "logits/rejected": -0.8077543377876282, "logps/chosen": -873.767578125, "logps/rejected": -1173.175048828125, "loss": 0.3597, "rewards/accuracies": 0.9375, "rewards/chosen": -4.473335266113281, "rewards/margins": 2.4106528759002686, "rewards/rejected": -6.8839874267578125, "step": 1403 }, { "epoch": 0.916972814107274, "grad_norm": 21.412720186872857, "learning_rate": 4.1683943696231515e-09, "logits/chosen": -0.8199781775474548, "logits/rejected": -0.8375627994537354, "logps/chosen": -1003.9866333007812, "logps/rejected": -1262.1685791015625, "loss": 0.3588, "rewards/accuracies": 0.84375, "rewards/chosen": -5.443426132202148, "rewards/margins": 1.931605339050293, "rewards/rejected": -7.375031471252441, "step": 1404 }, { "epoch": 0.9176259286472365, "grad_norm": 35.589071157888114, "learning_rate": 4.103459726475889e-09, "logits/chosen": -0.6660647988319397, "logits/rejected": -0.8292348384857178, "logps/chosen": -933.234375, "logps/rejected": -1223.6353759765625, "loss": 0.4142, "rewards/accuracies": 0.71875, "rewards/chosen": -5.21882438659668, "rewards/margins": 1.956418752670288, "rewards/rejected": -7.175242900848389, "step": 1405 }, { "epoch": 0.918279043187199, "grad_norm": 52.59743146250588, "learning_rate": 4.03902423739747e-09, "logits/chosen": -1.0262573957443237, "logits/rejected": -1.0548584461212158, "logps/chosen": -885.991455078125, "logps/rejected": -1079.2177734375, "loss": 0.3407, "rewards/accuracies": 0.84375, "rewards/chosen": -4.75430154800415, "rewards/margins": 1.5191203355789185, "rewards/rejected": -6.2734222412109375, "step": 1406 }, { "epoch": 0.9189321577271614, "grad_norm": 20.398243030951686, "learning_rate": 3.975088237783064e-09, "logits/chosen": -0.9631592035293579, "logits/rejected": -0.8363844156265259, "logps/chosen": -1049.6431884765625, "logps/rejected": -1225.7779541015625, "loss": 0.3676, "rewards/accuracies": 0.84375, "rewards/chosen": -5.047906398773193, "rewards/margins": 2.2165191173553467, "rewards/rejected": -7.264426231384277, "step": 1407 }, { "epoch": 0.9195852722671238, "grad_norm": 28.609017006751863, "learning_rate": 3.911652060427928e-09, "logits/chosen": -0.8288122415542603, "logits/rejected": -0.7530689239501953, "logps/chosen": -998.0337524414062, "logps/rejected": -1154.56884765625, "loss": 0.4154, "rewards/accuracies": 0.9375, "rewards/chosen": -4.994250774383545, "rewards/margins": 1.7526618242263794, "rewards/rejected": -6.746912956237793, "step": 1408 }, { "epoch": 0.9202383868070863, "grad_norm": 19.446007209573214, "learning_rate": 3.848716035525678e-09, "logits/chosen": -0.6308072805404663, "logits/rejected": -0.697937548160553, "logps/chosen": -899.0272827148438, "logps/rejected": -1132.28857421875, "loss": 0.4446, "rewards/accuracies": 0.8125, "rewards/chosen": -4.762282371520996, "rewards/margins": 1.923186182975769, "rewards/rejected": -6.6854681968688965, "step": 1409 }, { "epoch": 0.9208915013470488, "grad_norm": 33.9684949616552, "learning_rate": 3.786280490666604e-09, "logits/chosen": -0.9285703897476196, "logits/rejected": -0.9438356161117554, "logps/chosen": -1024.6214599609375, "logps/rejected": -1281.627197265625, "loss": 0.2981, "rewards/accuracies": 0.90625, "rewards/chosen": -5.685998916625977, "rewards/margins": 2.824934244155884, "rewards/rejected": -8.510933876037598, "step": 1410 }, { "epoch": 0.9215446158870112, "grad_norm": 17.31313517775045, "learning_rate": 3.7243457508358778e-09, "logits/chosen": -0.6968719959259033, "logits/rejected": -0.6861792206764221, "logps/chosen": -897.244384765625, "logps/rejected": -1154.441162109375, "loss": 0.349, "rewards/accuracies": 0.9375, "rewards/chosen": -4.37484884262085, "rewards/margins": 2.1692397594451904, "rewards/rejected": -6.544088840484619, "step": 1411 }, { "epoch": 0.9221977304269736, "grad_norm": 21.26908971504018, "learning_rate": 3.6629121384119664e-09, "logits/chosen": -0.9373629689216614, "logits/rejected": -0.8761916160583496, "logps/chosen": -969.7335205078125, "logps/rejected": -1193.5028076171875, "loss": 0.3883, "rewards/accuracies": 0.9375, "rewards/chosen": -4.746880054473877, "rewards/margins": 2.257174015045166, "rewards/rejected": -7.004053592681885, "step": 1412 }, { "epoch": 0.922850844966936, "grad_norm": 26.66535621429327, "learning_rate": 3.6019799731648704e-09, "logits/chosen": -0.9955894947052002, "logits/rejected": -0.8084239959716797, "logps/chosen": -931.5289306640625, "logps/rejected": -1104.51806640625, "loss": 0.3852, "rewards/accuracies": 0.84375, "rewards/chosen": -4.929518699645996, "rewards/margins": 2.009385108947754, "rewards/rejected": -6.93890380859375, "step": 1413 }, { "epoch": 0.9235039595068986, "grad_norm": 25.588965364341004, "learning_rate": 3.5415495722544874e-09, "logits/chosen": -0.9307757616043091, "logits/rejected": -0.7715475559234619, "logps/chosen": -1015.3209228515625, "logps/rejected": -1181.02978515625, "loss": 0.3766, "rewards/accuracies": 0.875, "rewards/chosen": -4.992738246917725, "rewards/margins": 2.280240058898926, "rewards/rejected": -7.272977828979492, "step": 1414 }, { "epoch": 0.924157074046861, "grad_norm": 21.254439169004378, "learning_rate": 3.4816212502289944e-09, "logits/chosen": -0.9563465714454651, "logits/rejected": -0.9163129329681396, "logps/chosen": -1033.4365234375, "logps/rejected": -1225.145751953125, "loss": 0.3757, "rewards/accuracies": 0.8125, "rewards/chosen": -5.287700176239014, "rewards/margins": 2.000286102294922, "rewards/rejected": -7.287986755371094, "step": 1415 }, { "epoch": 0.9248101885868234, "grad_norm": 25.934486653060304, "learning_rate": 3.422195319023169e-09, "logits/chosen": -0.7184072136878967, "logits/rejected": -0.6855770945549011, "logps/chosen": -935.3275146484375, "logps/rejected": -1191.0899658203125, "loss": 0.3761, "rewards/accuracies": 0.90625, "rewards/chosen": -4.7941179275512695, "rewards/margins": 2.6774706840515137, "rewards/rejected": -7.471588611602783, "step": 1416 }, { "epoch": 0.9254633031267858, "grad_norm": 39.367015906718585, "learning_rate": 3.363272087956759e-09, "logits/chosen": -0.9661139845848083, "logits/rejected": -0.8155698776245117, "logps/chosen": -987.3919677734375, "logps/rejected": -1184.998291015625, "loss": 0.4115, "rewards/accuracies": 0.875, "rewards/chosen": -5.236489295959473, "rewards/margins": 2.2704925537109375, "rewards/rejected": -7.506982803344727, "step": 1417 }, { "epoch": 0.9261164176667483, "grad_norm": 16.74180053580894, "learning_rate": 3.304851863732938e-09, "logits/chosen": -0.814327597618103, "logits/rejected": -0.8117744326591492, "logps/chosen": -920.962890625, "logps/rejected": -1163.671875, "loss": 0.3848, "rewards/accuracies": 0.84375, "rewards/chosen": -4.750037670135498, "rewards/margins": 1.8664512634277344, "rewards/rejected": -6.616489410400391, "step": 1418 }, { "epoch": 0.9267695322067108, "grad_norm": 24.932758940158795, "learning_rate": 3.2469349504366083e-09, "logits/chosen": -0.8866950869560242, "logits/rejected": -0.8952164649963379, "logps/chosen": -1020.4898681640625, "logps/rejected": -1310.095947265625, "loss": 0.3745, "rewards/accuracies": 0.84375, "rewards/chosen": -5.152378082275391, "rewards/margins": 2.8529391288757324, "rewards/rejected": -8.005317687988281, "step": 1419 }, { "epoch": 0.9274226467466732, "grad_norm": 17.473544447319867, "learning_rate": 3.1895216495329114e-09, "logits/chosen": -0.9697394371032715, "logits/rejected": -0.932720959186554, "logps/chosen": -894.943359375, "logps/rejected": -1137.5650634765625, "loss": 0.3651, "rewards/accuracies": 0.875, "rewards/chosen": -4.790702819824219, "rewards/margins": 1.8912099599838257, "rewards/rejected": -6.681912422180176, "step": 1420 }, { "epoch": 0.9280757612866356, "grad_norm": 22.83318699304187, "learning_rate": 3.132612259865597e-09, "logits/chosen": -0.7634729146957397, "logits/rejected": -0.7357776165008545, "logps/chosen": -986.0407104492188, "logps/rejected": -1127.8607177734375, "loss": 0.406, "rewards/accuracies": 0.8125, "rewards/chosen": -5.239987850189209, "rewards/margins": 1.540162444114685, "rewards/rejected": -6.780150413513184, "step": 1421 }, { "epoch": 0.9287288758265981, "grad_norm": 32.45917372041385, "learning_rate": 3.076207077655524e-09, "logits/chosen": -0.8629635572433472, "logits/rejected": -0.8516905307769775, "logps/chosen": -1002.01708984375, "logps/rejected": -1152.742919921875, "loss": 0.4029, "rewards/accuracies": 0.875, "rewards/chosen": -5.313127517700195, "rewards/margins": 1.448228359222412, "rewards/rejected": -6.761356353759766, "step": 1422 }, { "epoch": 0.9293819903665606, "grad_norm": 27.915496949424785, "learning_rate": 3.0203063964990614e-09, "logits/chosen": -0.8668137788772583, "logits/rejected": -0.8934547901153564, "logps/chosen": -950.4146728515625, "logps/rejected": -1186.294921875, "loss": 0.4064, "rewards/accuracies": 0.8125, "rewards/chosen": -5.059856414794922, "rewards/margins": 2.1348183155059814, "rewards/rejected": -7.194675445556641, "step": 1423 }, { "epoch": 0.930035104906523, "grad_norm": 35.81126804895922, "learning_rate": 2.9649105073665583e-09, "logits/chosen": -0.7803625464439392, "logits/rejected": -0.8244260549545288, "logps/chosen": -877.6205444335938, "logps/rejected": -1151.6063232421875, "loss": 0.4328, "rewards/accuracies": 0.84375, "rewards/chosen": -4.741922378540039, "rewards/margins": 2.114166021347046, "rewards/rejected": -6.856088638305664, "step": 1424 }, { "epoch": 0.9306882194464854, "grad_norm": 22.2918690515681, "learning_rate": 2.9100196986009073e-09, "logits/chosen": -0.7256823778152466, "logits/rejected": -0.7021087408065796, "logps/chosen": -923.400146484375, "logps/rejected": -1084.9371337890625, "loss": 0.3417, "rewards/accuracies": 0.90625, "rewards/chosen": -5.120212554931641, "rewards/margins": 1.8833941221237183, "rewards/rejected": -7.003606796264648, "step": 1425 }, { "epoch": 0.9313413339864479, "grad_norm": 32.21256635361222, "learning_rate": 2.8556342559159508e-09, "logits/chosen": -1.0215641260147095, "logits/rejected": -0.8117510676383972, "logps/chosen": -985.9193115234375, "logps/rejected": -1249.2657470703125, "loss": 0.4066, "rewards/accuracies": 0.8125, "rewards/chosen": -4.611599922180176, "rewards/margins": 2.8269217014312744, "rewards/rejected": -7.438521862030029, "step": 1426 }, { "epoch": 0.9319944485264103, "grad_norm": 21.118231191065334, "learning_rate": 2.8017544623950673e-09, "logits/chosen": -0.8426204919815063, "logits/rejected": -0.919529378414154, "logps/chosen": -965.6964721679688, "logps/rejected": -1166.8994140625, "loss": 0.3623, "rewards/accuracies": 0.78125, "rewards/chosen": -4.932854652404785, "rewards/margins": 1.1300407648086548, "rewards/rejected": -6.06289529800415, "step": 1427 }, { "epoch": 0.9326475630663728, "grad_norm": 24.156889442754775, "learning_rate": 2.7483805984896304e-09, "logits/chosen": -0.8445752859115601, "logits/rejected": -0.8731241226196289, "logps/chosen": -1016.215576171875, "logps/rejected": -1198.81689453125, "loss": 0.3818, "rewards/accuracies": 0.84375, "rewards/chosen": -5.215708255767822, "rewards/margins": 1.9218271970748901, "rewards/rejected": -7.13753604888916, "step": 1428 }, { "epoch": 0.9333006776063352, "grad_norm": 19.25063375366721, "learning_rate": 2.6955129420176194e-09, "logits/chosen": -0.9523279070854187, "logits/rejected": -0.8416871428489685, "logps/chosen": -860.4990844726562, "logps/rejected": -992.48876953125, "loss": 0.4255, "rewards/accuracies": 0.9375, "rewards/chosen": -4.198513984680176, "rewards/margins": 1.6185945272445679, "rewards/rejected": -5.817108631134033, "step": 1429 }, { "epoch": 0.9339537921462977, "grad_norm": 20.874432815718333, "learning_rate": 2.6431517681621107e-09, "logits/chosen": -0.9927769303321838, "logits/rejected": -0.9092074632644653, "logps/chosen": -1038.987548828125, "logps/rejected": -1260.34130859375, "loss": 0.3983, "rewards/accuracies": 0.90625, "rewards/chosen": -5.678443431854248, "rewards/margins": 2.3545010089874268, "rewards/rejected": -8.032944679260254, "step": 1430 }, { "epoch": 0.9346069066862601, "grad_norm": 15.08771013125819, "learning_rate": 2.5912973494698785e-09, "logits/chosen": -0.8484581708908081, "logits/rejected": -0.7962210774421692, "logps/chosen": -1113.666015625, "logps/rejected": -1251.378662109375, "loss": 0.3751, "rewards/accuracies": 0.71875, "rewards/chosen": -6.1646318435668945, "rewards/margins": 1.506376028060913, "rewards/rejected": -7.671008586883545, "step": 1431 }, { "epoch": 0.9352600212262225, "grad_norm": 22.592908330898158, "learning_rate": 2.5399499558499847e-09, "logits/chosen": -0.8906291723251343, "logits/rejected": -1.0438287258148193, "logps/chosen": -870.3036499023438, "logps/rejected": -1057.3095703125, "loss": 0.4427, "rewards/accuracies": 0.75, "rewards/chosen": -4.733056545257568, "rewards/margins": 1.3122425079345703, "rewards/rejected": -6.045300006866455, "step": 1432 }, { "epoch": 0.935913135766185, "grad_norm": 24.33564645984773, "learning_rate": 2.4891098545723242e-09, "logits/chosen": -1.0513837337493896, "logits/rejected": -0.9483792781829834, "logps/chosen": -1050.873291015625, "logps/rejected": -1154.373291015625, "loss": 0.4221, "rewards/accuracies": 0.84375, "rewards/chosen": -5.649351596832275, "rewards/margins": 1.523519515991211, "rewards/rejected": -7.1728715896606445, "step": 1433 }, { "epoch": 0.9365662503061475, "grad_norm": 23.735122709661717, "learning_rate": 2.4387773102663157e-09, "logits/chosen": -0.806923508644104, "logits/rejected": -0.8132039308547974, "logps/chosen": -977.7556762695312, "logps/rejected": -1201.8583984375, "loss": 0.411, "rewards/accuracies": 0.90625, "rewards/chosen": -4.82599401473999, "rewards/margins": 1.9918920993804932, "rewards/rejected": -6.8178863525390625, "step": 1434 }, { "epoch": 0.9372193648461099, "grad_norm": 30.38805197625348, "learning_rate": 2.388952584919457e-09, "logits/chosen": -0.9621034264564514, "logits/rejected": -0.9715672731399536, "logps/chosen": -1033.365478515625, "logps/rejected": -1146.636962890625, "loss": 0.3824, "rewards/accuracies": 0.75, "rewards/chosen": -5.168354511260986, "rewards/margins": 1.095251202583313, "rewards/rejected": -6.263606071472168, "step": 1435 }, { "epoch": 0.9378724793860723, "grad_norm": 27.14035171290439, "learning_rate": 2.33963593787595e-09, "logits/chosen": -0.7088282704353333, "logits/rejected": -0.5756462812423706, "logps/chosen": -985.4832763671875, "logps/rejected": -1134.68798828125, "loss": 0.3882, "rewards/accuracies": 0.8125, "rewards/chosen": -5.14437198638916, "rewards/margins": 1.92146635055542, "rewards/rejected": -7.065837860107422, "step": 1436 }, { "epoch": 0.9385255939260347, "grad_norm": 20.965417931035965, "learning_rate": 2.2908276258354343e-09, "logits/chosen": -0.9305647611618042, "logits/rejected": -1.0245065689086914, "logps/chosen": -1019.478271484375, "logps/rejected": -1227.153564453125, "loss": 0.3591, "rewards/accuracies": 0.84375, "rewards/chosen": -4.850151062011719, "rewards/margins": 1.6930534839630127, "rewards/rejected": -6.543204307556152, "step": 1437 }, { "epoch": 0.9391787084659973, "grad_norm": 20.707076213674817, "learning_rate": 2.2425279028515652e-09, "logits/chosen": -0.9270403385162354, "logits/rejected": -0.8247102499008179, "logps/chosen": -1035.72119140625, "logps/rejected": -1167.691162109375, "loss": 0.4176, "rewards/accuracies": 0.84375, "rewards/chosen": -5.455660820007324, "rewards/margins": 1.8896361589431763, "rewards/rejected": -7.345297336578369, "step": 1438 }, { "epoch": 0.9398318230059597, "grad_norm": 17.145480220037275, "learning_rate": 2.194737020330728e-09, "logits/chosen": -0.8578017950057983, "logits/rejected": -0.6604302525520325, "logps/chosen": -1015.2515869140625, "logps/rejected": -1188.9595947265625, "loss": 0.3346, "rewards/accuracies": 0.84375, "rewards/chosen": -5.383336067199707, "rewards/margins": 2.303807497024536, "rewards/rejected": -7.6871442794799805, "step": 1439 }, { "epoch": 0.9404849375459221, "grad_norm": 18.277620474543976, "learning_rate": 2.147455227030748e-09, "logits/chosen": -1.0063140392303467, "logits/rejected": -0.8611899018287659, "logps/chosen": -1026.3895263671875, "logps/rejected": -1207.8428955078125, "loss": 0.3533, "rewards/accuracies": 0.90625, "rewards/chosen": -5.047481536865234, "rewards/margins": 2.0378239154815674, "rewards/rejected": -7.085305213928223, "step": 1440 }, { "epoch": 0.9411380520858845, "grad_norm": 39.94584518067749, "learning_rate": 2.1006827690595473e-09, "logits/chosen": -0.9561357498168945, "logits/rejected": -1.031121850013733, "logps/chosen": -1034.476318359375, "logps/rejected": -1264.8309326171875, "loss": 0.3684, "rewards/accuracies": 0.90625, "rewards/chosen": -5.420408725738525, "rewards/margins": 2.2229015827178955, "rewards/rejected": -7.643310070037842, "step": 1441 }, { "epoch": 0.941791166625847, "grad_norm": 33.800789226780005, "learning_rate": 2.0544198898739263e-09, "logits/chosen": -0.9700863361358643, "logits/rejected": -0.870128870010376, "logps/chosen": -1038.8701171875, "logps/rejected": -1134.0107421875, "loss": 0.4121, "rewards/accuracies": 0.8125, "rewards/chosen": -5.802235126495361, "rewards/margins": 1.2960669994354248, "rewards/rejected": -7.098300933837891, "step": 1442 }, { "epoch": 0.9424442811658095, "grad_norm": 37.25432369178176, "learning_rate": 2.008666830278216e-09, "logits/chosen": -0.9192866086959839, "logits/rejected": -0.8795109391212463, "logps/chosen": -982.6119995117188, "logps/rejected": -1067.9573974609375, "loss": 0.3474, "rewards/accuracies": 0.8125, "rewards/chosen": -4.948570251464844, "rewards/margins": 1.369868516921997, "rewards/rejected": -6.318438529968262, "step": 1443 }, { "epoch": 0.9430973957057719, "grad_norm": 25.570377183088176, "learning_rate": 1.963423828423094e-09, "logits/chosen": -0.7798184156417847, "logits/rejected": -0.7910678386688232, "logps/chosen": -981.431640625, "logps/rejected": -1218.7794189453125, "loss": 0.4121, "rewards/accuracies": 0.84375, "rewards/chosen": -5.284530162811279, "rewards/margins": 2.037384271621704, "rewards/rejected": -7.321914196014404, "step": 1444 }, { "epoch": 0.9437505102457343, "grad_norm": 29.169742339003346, "learning_rate": 1.9186911198043277e-09, "logits/chosen": -0.9367291927337646, "logits/rejected": -0.9252871870994568, "logps/chosen": -1102.89794921875, "logps/rejected": -1253.6875, "loss": 0.4379, "rewards/accuracies": 0.71875, "rewards/chosen": -6.0728302001953125, "rewards/margins": 1.685518503189087, "rewards/rejected": -7.758349418640137, "step": 1445 }, { "epoch": 0.9444036247856968, "grad_norm": 22.556085107470444, "learning_rate": 1.8744689372615308e-09, "logits/chosen": -0.9977632761001587, "logits/rejected": -0.826669454574585, "logps/chosen": -1179.10107421875, "logps/rejected": -1295.236328125, "loss": 0.3242, "rewards/accuracies": 0.90625, "rewards/chosen": -5.635008335113525, "rewards/margins": 1.6209893226623535, "rewards/rejected": -7.255997657775879, "step": 1446 }, { "epoch": 0.9450567393256593, "grad_norm": 23.986548258092327, "learning_rate": 1.8307575109769657e-09, "logits/chosen": -0.7967054843902588, "logits/rejected": -0.8129853010177612, "logps/chosen": -920.0318603515625, "logps/rejected": -1080.5286865234375, "loss": 0.397, "rewards/accuracies": 0.78125, "rewards/chosen": -4.808022499084473, "rewards/margins": 1.5115902423858643, "rewards/rejected": -6.319612503051758, "step": 1447 }, { "epoch": 0.9457098538656217, "grad_norm": 22.43816147534732, "learning_rate": 1.7875570684743323e-09, "logits/chosen": -1.020730972290039, "logits/rejected": -0.9586259722709656, "logps/chosen": -981.6170654296875, "logps/rejected": -1160.238525390625, "loss": 0.3927, "rewards/accuracies": 0.78125, "rewards/chosen": -5.438063621520996, "rewards/margins": 1.8004133701324463, "rewards/rejected": -7.238476276397705, "step": 1448 }, { "epoch": 0.9463629684055841, "grad_norm": 41.87512216168752, "learning_rate": 1.7448678346175915e-09, "logits/chosen": -0.9691171050071716, "logits/rejected": -0.854966402053833, "logps/chosen": -895.74560546875, "logps/rejected": -1024.83935546875, "loss": 0.4039, "rewards/accuracies": 0.84375, "rewards/chosen": -4.703372478485107, "rewards/margins": 1.5487737655639648, "rewards/rejected": -6.252146244049072, "step": 1449 }, { "epoch": 0.9470160829455466, "grad_norm": 20.13570473014106, "learning_rate": 1.7026900316098214e-09, "logits/chosen": -0.7298191785812378, "logits/rejected": -0.7361303567886353, "logps/chosen": -966.2626342773438, "logps/rejected": -1121.1221923828125, "loss": 0.4022, "rewards/accuracies": 0.75, "rewards/chosen": -4.953649520874023, "rewards/margins": 1.2899563312530518, "rewards/rejected": -6.243605136871338, "step": 1450 }, { "epoch": 0.947669197485509, "grad_norm": 26.69200848432792, "learning_rate": 1.6610238789920073e-09, "logits/chosen": -0.8612526059150696, "logits/rejected": -0.9243814945220947, "logps/chosen": -1065.5809326171875, "logps/rejected": -1275.85791015625, "loss": 0.4606, "rewards/accuracies": 0.84375, "rewards/chosen": -5.303916931152344, "rewards/margins": 2.0302257537841797, "rewards/rejected": -7.334142684936523, "step": 1451 }, { "epoch": 0.9483223120254715, "grad_norm": 23.657796736360822, "learning_rate": 1.6198695936419538e-09, "logits/chosen": -0.8324539661407471, "logits/rejected": -0.8346168994903564, "logps/chosen": -916.5712890625, "logps/rejected": -1095.519775390625, "loss": 0.3319, "rewards/accuracies": 0.84375, "rewards/chosen": -4.682393550872803, "rewards/margins": 1.9298332929611206, "rewards/rejected": -6.612226963043213, "step": 1452 }, { "epoch": 0.9489754265654339, "grad_norm": 32.52042528833027, "learning_rate": 1.5792273897730856e-09, "logits/chosen": -0.9859724044799805, "logits/rejected": -0.8119895458221436, "logps/chosen": -1032.33740234375, "logps/rejected": -1179.5372314453125, "loss": 0.406, "rewards/accuracies": 0.6875, "rewards/chosen": -5.521050930023193, "rewards/margins": 2.0502986907958984, "rewards/rejected": -7.57135009765625, "step": 1453 }, { "epoch": 0.9496285411053964, "grad_norm": 23.165414471220327, "learning_rate": 1.5390974789334266e-09, "logits/chosen": -0.8730648756027222, "logits/rejected": -0.8281824588775635, "logps/chosen": -1028.20166015625, "logps/rejected": -1375.81640625, "loss": 0.3187, "rewards/accuracies": 0.875, "rewards/chosen": -5.710119724273682, "rewards/margins": 3.208172082901001, "rewards/rejected": -8.918291091918945, "step": 1454 }, { "epoch": 0.9502816556453588, "grad_norm": 24.673015232487636, "learning_rate": 1.4994800700044219e-09, "logits/chosen": -1.045493721961975, "logits/rejected": -1.0213874578475952, "logps/chosen": -1014.9378662109375, "logps/rejected": -1189.329345703125, "loss": 0.3627, "rewards/accuracies": 0.90625, "rewards/chosen": -4.880398273468018, "rewards/margins": 2.3164453506469727, "rewards/rejected": -7.196844100952148, "step": 1455 }, { "epoch": 0.9509347701853212, "grad_norm": 73.76277605023918, "learning_rate": 1.4603753691998733e-09, "logits/chosen": -0.7908294200897217, "logits/rejected": -0.7953348755836487, "logps/chosen": -932.0293579101562, "logps/rejected": -1270.8121337890625, "loss": 0.416, "rewards/accuracies": 0.875, "rewards/chosen": -4.794908046722412, "rewards/margins": 2.717374801635742, "rewards/rejected": -7.512282848358154, "step": 1456 }, { "epoch": 0.9515878847252837, "grad_norm": 22.972378938308747, "learning_rate": 1.4217835800648837e-09, "logits/chosen": -0.9420186877250671, "logits/rejected": -0.9727803468704224, "logps/chosen": -861.2728271484375, "logps/rejected": -1017.69091796875, "loss": 0.3061, "rewards/accuracies": 0.875, "rewards/chosen": -4.226019859313965, "rewards/margins": 1.59080970287323, "rewards/rejected": -5.816829681396484, "step": 1457 }, { "epoch": 0.9522409992652462, "grad_norm": 71.0682992055047, "learning_rate": 1.3837049034747806e-09, "logits/chosen": -0.9005981087684631, "logits/rejected": -0.8757720589637756, "logps/chosen": -918.2922973632812, "logps/rejected": -1133.818115234375, "loss": 0.4251, "rewards/accuracies": 0.84375, "rewards/chosen": -4.245543003082275, "rewards/margins": 2.0595555305480957, "rewards/rejected": -6.305099010467529, "step": 1458 }, { "epoch": 0.9528941138052086, "grad_norm": 45.29358517020249, "learning_rate": 1.3461395376340501e-09, "logits/chosen": -0.914055585861206, "logits/rejected": -0.7696114778518677, "logps/chosen": -906.3142700195312, "logps/rejected": -1021.6484375, "loss": 0.5118, "rewards/accuracies": 0.78125, "rewards/chosen": -4.326722621917725, "rewards/margins": 1.5497199296951294, "rewards/rejected": -5.876442909240723, "step": 1459 }, { "epoch": 0.953547228345171, "grad_norm": 54.488844427179124, "learning_rate": 1.3090876780753712e-09, "logits/chosen": -0.6701656579971313, "logits/rejected": -0.6975820660591125, "logps/chosen": -914.3140258789062, "logps/rejected": -1174.50048828125, "loss": 0.3897, "rewards/accuracies": 0.84375, "rewards/chosen": -5.02104377746582, "rewards/margins": 2.0837643146514893, "rewards/rejected": -7.104808330535889, "step": 1460 }, { "epoch": 0.9542003428851334, "grad_norm": 25.299453873176645, "learning_rate": 1.2725495176585166e-09, "logits/chosen": -0.7396911382675171, "logits/rejected": -0.6380915641784668, "logps/chosen": -968.8209838867188, "logps/rejected": -1188.0623779296875, "loss": 0.3772, "rewards/accuracies": 0.84375, "rewards/chosen": -5.297964572906494, "rewards/margins": 1.943697452545166, "rewards/rejected": -7.241661548614502, "step": 1461 }, { "epoch": 0.954853457425096, "grad_norm": 27.103808867302387, "learning_rate": 1.2365252465694086e-09, "logits/chosen": -0.863556444644928, "logits/rejected": -0.7426696419715881, "logps/chosen": -987.9991455078125, "logps/rejected": -1103.204345703125, "loss": 0.3622, "rewards/accuracies": 0.84375, "rewards/chosen": -5.20358943939209, "rewards/margins": 1.4331691265106201, "rewards/rejected": -6.636758804321289, "step": 1462 }, { "epoch": 0.9555065719650584, "grad_norm": 17.394073074491665, "learning_rate": 1.2010150523190988e-09, "logits/chosen": -0.9870079159736633, "logits/rejected": -0.9677950739860535, "logps/chosen": -968.0743408203125, "logps/rejected": -1198.095458984375, "loss": 0.3549, "rewards/accuracies": 0.90625, "rewards/chosen": -4.881644248962402, "rewards/margins": 2.15342378616333, "rewards/rejected": -7.035068035125732, "step": 1463 }, { "epoch": 0.9561596865050208, "grad_norm": 24.783037834413452, "learning_rate": 1.1660191197428226e-09, "logits/chosen": -0.7138622999191284, "logits/rejected": -0.7153616547584534, "logps/chosen": -958.835693359375, "logps/rejected": -1190.4322509765625, "loss": 0.327, "rewards/accuracies": 0.84375, "rewards/chosen": -5.410097599029541, "rewards/margins": 1.866647481918335, "rewards/rejected": -7.276744842529297, "step": 1464 }, { "epoch": 0.9568128010449832, "grad_norm": 30.254292249337972, "learning_rate": 1.13153763099898e-09, "logits/chosen": -0.7923008799552917, "logits/rejected": -0.8359322547912598, "logps/chosen": -964.593994140625, "logps/rejected": -1163.7933349609375, "loss": 0.42, "rewards/accuracies": 0.8125, "rewards/chosen": -4.957150459289551, "rewards/margins": 2.0364115238189697, "rewards/rejected": -6.993561744689941, "step": 1465 }, { "epoch": 0.9574659155849458, "grad_norm": 19.941049592800624, "learning_rate": 1.0975707655682453e-09, "logits/chosen": -0.8487153649330139, "logits/rejected": -0.7613034844398499, "logps/chosen": -860.0794677734375, "logps/rejected": -1030.533203125, "loss": 0.433, "rewards/accuracies": 0.78125, "rewards/chosen": -4.2322821617126465, "rewards/margins": 1.7400192022323608, "rewards/rejected": -5.972302436828613, "step": 1466 }, { "epoch": 0.9581190301249082, "grad_norm": 30.31638895507804, "learning_rate": 1.0641187002526142e-09, "logits/chosen": -0.9801903963088989, "logits/rejected": -0.8500902056694031, "logps/chosen": -886.7667236328125, "logps/rejected": -925.6876220703125, "loss": 0.4019, "rewards/accuracies": 0.75, "rewards/chosen": -4.857925891876221, "rewards/margins": 0.8767415881156921, "rewards/rejected": -5.734667778015137, "step": 1467 }, { "epoch": 0.9587721446648706, "grad_norm": 42.16901233596558, "learning_rate": 1.0311816091744697e-09, "logits/chosen": -0.7535737752914429, "logits/rejected": -0.7454916834831238, "logps/chosen": -932.8473510742188, "logps/rejected": -1178.4783935546875, "loss": 0.3957, "rewards/accuracies": 0.875, "rewards/chosen": -4.699604034423828, "rewards/margins": 2.059380531311035, "rewards/rejected": -6.7589850425720215, "step": 1468 }, { "epoch": 0.959425259204833, "grad_norm": 26.651901787055372, "learning_rate": 9.987596637756946e-10, "logits/chosen": -0.8313369154930115, "logits/rejected": -0.6576776504516602, "logps/chosen": -987.6339111328125, "logps/rejected": -1117.3695068359375, "loss": 0.3691, "rewards/accuracies": 0.78125, "rewards/chosen": -4.855637073516846, "rewards/margins": 1.547339677810669, "rewards/rejected": -6.402976989746094, "step": 1469 }, { "epoch": 0.9600783737447955, "grad_norm": 21.683003406272572, "learning_rate": 9.668530328167612e-10, "logits/chosen": -0.812420129776001, "logits/rejected": -0.6992568373680115, "logps/chosen": -993.4514770507812, "logps/rejected": -1210.0225830078125, "loss": 0.3348, "rewards/accuracies": 0.875, "rewards/chosen": -5.396690845489502, "rewards/margins": 2.229128837585449, "rewards/rejected": -7.625819206237793, "step": 1470 }, { "epoch": 0.960731488284758, "grad_norm": 17.28840064330659, "learning_rate": 9.354618823758653e-10, "logits/chosen": -1.02504301071167, "logits/rejected": -0.8524007797241211, "logps/chosen": -944.0267333984375, "logps/rejected": -1072.3201904296875, "loss": 0.3258, "rewards/accuracies": 0.875, "rewards/chosen": -4.7975006103515625, "rewards/margins": 1.4966485500335693, "rewards/rejected": -6.294148921966553, "step": 1471 }, { "epoch": 0.9613846028247204, "grad_norm": 19.607498707495395, "learning_rate": 9.045863758480709e-10, "logits/chosen": -1.0104511976242065, "logits/rejected": -1.0129821300506592, "logps/chosen": -1002.5841064453125, "logps/rejected": -1276.6905517578125, "loss": 0.3336, "rewards/accuracies": 0.84375, "rewards/chosen": -5.3765411376953125, "rewards/margins": 2.1960718631744385, "rewards/rejected": -7.572613716125488, "step": 1472 }, { "epoch": 0.9620377173646828, "grad_norm": 34.47668846556969, "learning_rate": 8.742266739444337e-10, "logits/chosen": -0.9214047193527222, "logits/rejected": -0.9010781645774841, "logps/chosen": -976.4214477539062, "logps/rejected": -1183.3560791015625, "loss": 0.3637, "rewards/accuracies": 0.875, "rewards/chosen": -5.587342739105225, "rewards/margins": 1.8902616500854492, "rewards/rejected": -7.477603912353516, "step": 1473 }, { "epoch": 0.9626908319046453, "grad_norm": 18.40991830007408, "learning_rate": 8.44382934691179e-10, "logits/chosen": -0.920522153377533, "logits/rejected": -0.8545042276382446, "logps/chosen": -954.7621459960938, "logps/rejected": -1119.32666015625, "loss": 0.3603, "rewards/accuracies": 0.8125, "rewards/chosen": -5.054173946380615, "rewards/margins": 1.6411688327789307, "rewards/rejected": -6.695343017578125, "step": 1474 }, { "epoch": 0.9633439464446077, "grad_norm": 30.063682711715693, "learning_rate": 8.150553134289029e-10, "logits/chosen": -0.7451218962669373, "logits/rejected": -0.7642776966094971, "logps/chosen": -958.5621948242188, "logps/rejected": -1134.0743408203125, "loss": 0.3813, "rewards/accuracies": 0.78125, "rewards/chosen": -4.907833099365234, "rewards/margins": 1.4756911993026733, "rewards/rejected": -6.383524417877197, "step": 1475 }, { "epoch": 0.9639970609845702, "grad_norm": 28.63972492266538, "learning_rate": 7.862439628116946e-10, "logits/chosen": -0.9498513340950012, "logits/rejected": -0.8719826936721802, "logps/chosen": -1016.4282836914062, "logps/rejected": -1184.1448974609375, "loss": 0.361, "rewards/accuracies": 0.84375, "rewards/chosen": -5.389301300048828, "rewards/margins": 2.168164014816284, "rewards/rejected": -7.557465553283691, "step": 1476 }, { "epoch": 0.9646501755245326, "grad_norm": 19.29656245173236, "learning_rate": 7.579490328064264e-10, "logits/chosen": -0.628196120262146, "logits/rejected": -0.5795927047729492, "logps/chosen": -942.5938720703125, "logps/rejected": -1108.619873046875, "loss": 0.4409, "rewards/accuracies": 0.84375, "rewards/chosen": -4.877640247344971, "rewards/margins": 1.6121346950531006, "rewards/rejected": -6.48977518081665, "step": 1477 }, { "epoch": 0.9653032900644951, "grad_norm": 22.454212973722687, "learning_rate": 7.301706706919208e-10, "logits/chosen": -0.6595121026039124, "logits/rejected": -0.7709716558456421, "logps/chosen": -993.6766357421875, "logps/rejected": -1450.3536376953125, "loss": 0.4043, "rewards/accuracies": 0.9375, "rewards/chosen": -4.951398849487305, "rewards/margins": 3.6712889671325684, "rewards/rejected": -8.622688293457031, "step": 1478 }, { "epoch": 0.9659564046044575, "grad_norm": 33.820441000688064, "learning_rate": 7.029090210581956e-10, "logits/chosen": -0.8978259563446045, "logits/rejected": -0.7938851118087769, "logps/chosen": -1084.6802978515625, "logps/rejected": -1231.305908203125, "loss": 0.3614, "rewards/accuracies": 0.84375, "rewards/chosen": -5.54904842376709, "rewards/margins": 1.9202641248703003, "rewards/rejected": -7.46931266784668, "step": 1479 }, { "epoch": 0.96660951914442, "grad_norm": 29.055292815530052, "learning_rate": 6.761642258056977e-10, "logits/chosen": -0.8216730952262878, "logits/rejected": -0.8812836408615112, "logps/chosen": -909.160400390625, "logps/rejected": -1194.9307861328125, "loss": 0.43, "rewards/accuracies": 0.8125, "rewards/chosen": -4.529107093811035, "rewards/margins": 1.9421048164367676, "rewards/rejected": -6.471211910247803, "step": 1480 }, { "epoch": 0.9672626336843824, "grad_norm": 28.942362395099714, "learning_rate": 6.499364241446148e-10, "logits/chosen": -0.8953934907913208, "logits/rejected": -0.8497557640075684, "logps/chosen": -954.681884765625, "logps/rejected": -1159.572021484375, "loss": 0.3649, "rewards/accuracies": 0.84375, "rewards/chosen": -4.4994611740112305, "rewards/margins": 2.3079538345336914, "rewards/rejected": -6.80741548538208, "step": 1481 }, { "epoch": 0.9679157482243449, "grad_norm": 35.14445460138126, "learning_rate": 6.242257525940875e-10, "logits/chosen": -0.9045277833938599, "logits/rejected": -0.8981985449790955, "logps/chosen": -969.97998046875, "logps/rejected": -1138.05126953125, "loss": 0.4004, "rewards/accuracies": 0.78125, "rewards/chosen": -5.53133487701416, "rewards/margins": 1.3268393278121948, "rewards/rejected": -6.8581743240356445, "step": 1482 }, { "epoch": 0.9685688627643073, "grad_norm": 24.924893058423713, "learning_rate": 5.990323449815316e-10, "logits/chosen": -0.9852606058120728, "logits/rejected": -0.8136059045791626, "logps/chosen": -1126.402099609375, "logps/rejected": -1153.1251220703125, "loss": 0.3822, "rewards/accuracies": 0.78125, "rewards/chosen": -5.641076564788818, "rewards/margins": 0.9179675579071045, "rewards/rejected": -6.559044361114502, "step": 1483 }, { "epoch": 0.9692219773042697, "grad_norm": 22.221974478367503, "learning_rate": 5.743563324419387e-10, "logits/chosen": -0.9423245787620544, "logits/rejected": -0.7548604011535645, "logps/chosen": -1162.7821044921875, "logps/rejected": -1321.6494140625, "loss": 0.3694, "rewards/accuracies": 0.78125, "rewards/chosen": -6.515698432922363, "rewards/margins": 2.173647403717041, "rewards/rejected": -8.689346313476562, "step": 1484 }, { "epoch": 0.9698750918442322, "grad_norm": 22.796985665535065, "learning_rate": 5.501978434171883e-10, "logits/chosen": -0.8227739930152893, "logits/rejected": -0.6972899436950684, "logps/chosen": -992.651123046875, "logps/rejected": -1125.826416015625, "loss": 0.41, "rewards/accuracies": 0.8125, "rewards/chosen": -5.140564441680908, "rewards/margins": 1.821034550666809, "rewards/rejected": -6.961598873138428, "step": 1485 }, { "epoch": 0.9705282063841947, "grad_norm": 23.18949792218578, "learning_rate": 5.265570036553813e-10, "logits/chosen": -0.8643534779548645, "logits/rejected": -0.7788809537887573, "logps/chosen": -953.4285888671875, "logps/rejected": -1124.8902587890625, "loss": 0.3599, "rewards/accuracies": 0.8125, "rewards/chosen": -4.857746124267578, "rewards/margins": 1.7846039533615112, "rewards/rejected": -6.642350196838379, "step": 1486 }, { "epoch": 0.9711813209241571, "grad_norm": 51.28508939305074, "learning_rate": 5.034339362101958e-10, "logits/chosen": -0.699268639087677, "logits/rejected": -0.8176553249359131, "logps/chosen": -902.2926025390625, "logps/rejected": -1181.4716796875, "loss": 0.32, "rewards/accuracies": 0.84375, "rewards/chosen": -5.03464412689209, "rewards/margins": 1.879096269607544, "rewards/rejected": -6.913740634918213, "step": 1487 }, { "epoch": 0.9718344354641195, "grad_norm": 27.68604192606488, "learning_rate": 4.808287614402218e-10, "logits/chosen": -1.0526833534240723, "logits/rejected": -0.9321053624153137, "logps/chosen": -1100.566162109375, "logps/rejected": -1366.1162109375, "loss": 0.3962, "rewards/accuracies": 0.9375, "rewards/chosen": -5.126470565795898, "rewards/margins": 2.3651087284088135, "rewards/rejected": -7.491579055786133, "step": 1488 }, { "epoch": 0.9724875500040819, "grad_norm": 21.713246091016085, "learning_rate": 4.587415970083719e-10, "logits/chosen": -0.7959469556808472, "logits/rejected": -0.9242347478866577, "logps/chosen": -991.1103515625, "logps/rejected": -1283.2503662109375, "loss": 0.3681, "rewards/accuracies": 0.84375, "rewards/chosen": -5.195345878601074, "rewards/margins": 2.3410024642944336, "rewards/rejected": -7.536348342895508, "step": 1489 }, { "epoch": 0.9731406645440445, "grad_norm": 73.72354463354812, "learning_rate": 4.3717255788121577e-10, "logits/chosen": -1.0061514377593994, "logits/rejected": -0.8946919441223145, "logps/chosen": -1000.9629516601562, "logps/rejected": -1165.5416259765625, "loss": 0.4096, "rewards/accuracies": 0.8125, "rewards/chosen": -5.1635589599609375, "rewards/margins": 1.668057918548584, "rewards/rejected": -6.831617832183838, "step": 1490 }, { "epoch": 0.9737937790840069, "grad_norm": 22.70064351850388, "learning_rate": 4.161217563284469e-10, "logits/chosen": -1.0634889602661133, "logits/rejected": -0.9366331696510315, "logps/chosen": -941.7094116210938, "logps/rejected": -1145.0657958984375, "loss": 0.2551, "rewards/accuracies": 0.875, "rewards/chosen": -4.710692405700684, "rewards/margins": 2.2830896377563477, "rewards/rejected": -6.9937825202941895, "step": 1491 }, { "epoch": 0.9744468936239693, "grad_norm": 39.7036696071761, "learning_rate": 3.9558930192225004e-10, "logits/chosen": -0.8806736469268799, "logits/rejected": -0.9087212085723877, "logps/chosen": -896.1585693359375, "logps/rejected": -1149.5721435546875, "loss": 0.3194, "rewards/accuracies": 0.875, "rewards/chosen": -4.4892706871032715, "rewards/margins": 2.5188791751861572, "rewards/rejected": -7.00814962387085, "step": 1492 }, { "epoch": 0.9751000081639317, "grad_norm": 23.64003160960863, "learning_rate": 3.755753015367236e-10, "logits/chosen": -0.9212431907653809, "logits/rejected": -0.9223770499229431, "logps/chosen": -961.08251953125, "logps/rejected": -1142.14990234375, "loss": 0.3491, "rewards/accuracies": 0.875, "rewards/chosen": -4.822206497192383, "rewards/margins": 1.5445626974105835, "rewards/rejected": -6.366768836975098, "step": 1493 }, { "epoch": 0.9757531227038942, "grad_norm": 26.977497967955454, "learning_rate": 3.560798593473913e-10, "logits/chosen": -1.052283763885498, "logits/rejected": -0.953407347202301, "logps/chosen": -974.070556640625, "logps/rejected": -1134.294921875, "loss": 0.4168, "rewards/accuracies": 0.84375, "rewards/chosen": -4.088522911071777, "rewards/margins": 1.946774959564209, "rewards/rejected": -6.0352983474731445, "step": 1494 }, { "epoch": 0.9764062372438567, "grad_norm": 15.544717499533835, "learning_rate": 3.371030768305583e-10, "logits/chosen": -0.7873333096504211, "logits/rejected": -0.8360827565193176, "logps/chosen": -1022.1478271484375, "logps/rejected": -1273.257568359375, "loss": 0.3568, "rewards/accuracies": 0.875, "rewards/chosen": -5.51603889465332, "rewards/margins": 1.96681809425354, "rewards/rejected": -7.482856750488281, "step": 1495 }, { "epoch": 0.9770593517838191, "grad_norm": 20.11472540145716, "learning_rate": 3.186450527628781e-10, "logits/chosen": -0.741105318069458, "logits/rejected": -0.804491400718689, "logps/chosen": -1016.6484375, "logps/rejected": -1231.481689453125, "loss": 0.368, "rewards/accuracies": 0.90625, "rewards/chosen": -5.079611301422119, "rewards/margins": 1.775844931602478, "rewards/rejected": -6.855456352233887, "step": 1496 }, { "epoch": 0.9777124663237815, "grad_norm": 17.63253263663225, "learning_rate": 3.007058832207976e-10, "logits/chosen": -0.9436084628105164, "logits/rejected": -0.9498050212860107, "logps/chosen": -914.9832763671875, "logps/rejected": -1074.5772705078125, "loss": 0.2992, "rewards/accuracies": 0.875, "rewards/chosen": -4.8038554191589355, "rewards/margins": 1.674473762512207, "rewards/rejected": -6.478329181671143, "step": 1497 }, { "epoch": 0.978365580863744, "grad_norm": 38.54187704615717, "learning_rate": 2.8328566158002386e-10, "logits/chosen": -0.9954776763916016, "logits/rejected": -0.9750261902809143, "logps/chosen": -1050.8953857421875, "logps/rejected": -1183.7637939453125, "loss": 0.4243, "rewards/accuracies": 0.71875, "rewards/chosen": -5.6384124755859375, "rewards/margins": 1.5582480430603027, "rewards/rejected": -7.196660995483398, "step": 1498 }, { "epoch": 0.9790186954037065, "grad_norm": 21.021314956248386, "learning_rate": 2.663844785151248e-10, "logits/chosen": -0.9795162081718445, "logits/rejected": -0.922295093536377, "logps/chosen": -949.4105224609375, "logps/rejected": -1109.667724609375, "loss": 0.3731, "rewards/accuracies": 0.9375, "rewards/chosen": -4.881753444671631, "rewards/margins": 1.8854466676712036, "rewards/rejected": -6.767199993133545, "step": 1499 }, { "epoch": 0.9796718099436689, "grad_norm": 21.956131521928086, "learning_rate": 2.5000242199895163e-10, "logits/chosen": -0.8551816344261169, "logits/rejected": -0.8687204122543335, "logps/chosen": -909.28271484375, "logps/rejected": -1032.6781005859375, "loss": 0.3197, "rewards/accuracies": 0.90625, "rewards/chosen": -4.83204984664917, "rewards/margins": 1.1786363124847412, "rewards/rejected": -6.01068639755249, "step": 1500 }, { "epoch": 0.9796718099436689, "eval_logits/chosen": -0.6721622943878174, "eval_logits/rejected": -0.6120479106903076, "eval_logps/chosen": -988.9237060546875, "eval_logps/rejected": -1153.097900390625, "eval_loss": 0.3866689205169678, "eval_rewards/accuracies": 0.8080000281333923, "eval_rewards/chosen": -5.150165557861328, "eval_rewards/margins": 1.7291275262832642, "eval_rewards/rejected": -6.879292964935303, "eval_runtime": 616.2237, "eval_samples_per_second": 6.491, "eval_steps_per_second": 0.406, "step": 1500 }, { "epoch": 0.9803249244836313, "grad_norm": 40.25994643128802, "learning_rate": 2.341395773022614e-10, "logits/chosen": -0.8601680994033813, "logits/rejected": -0.7846535444259644, "logps/chosen": -1010.7470092773438, "logps/rejected": -1159.432861328125, "loss": 0.4313, "rewards/accuracies": 0.84375, "rewards/chosen": -5.6882429122924805, "rewards/margins": 1.3808894157409668, "rewards/rejected": -7.069132328033447, "step": 1501 }, { "epoch": 0.9809780390235938, "grad_norm": 21.765081447453614, "learning_rate": 2.1879602699325095e-10, "logits/chosen": -0.9793115854263306, "logits/rejected": -0.9564782977104187, "logps/chosen": -968.4996948242188, "logps/rejected": -1242.9364013671875, "loss": 0.3515, "rewards/accuracies": 0.875, "rewards/chosen": -4.951983451843262, "rewards/margins": 2.656872272491455, "rewards/rejected": -7.608855724334717, "step": 1502 }, { "epoch": 0.9816311535635562, "grad_norm": 18.053421531219165, "learning_rate": 2.0397185093710135e-10, "logits/chosen": -0.8464664220809937, "logits/rejected": -0.8488726615905762, "logps/chosen": -921.1729736328125, "logps/rejected": -1140.6810302734375, "loss": 0.3571, "rewards/accuracies": 0.8125, "rewards/chosen": -5.196868419647217, "rewards/margins": 1.8572230339050293, "rewards/rejected": -7.054091453552246, "step": 1503 }, { "epoch": 0.9822842681035187, "grad_norm": 45.38378711687764, "learning_rate": 1.8966712629558956e-10, "logits/chosen": -0.7857608795166016, "logits/rejected": -0.7569239139556885, "logps/chosen": -1089.02783203125, "logps/rejected": -1232.945068359375, "loss": 0.4773, "rewards/accuracies": 0.90625, "rewards/chosen": -5.689802646636963, "rewards/margins": 1.7456611394882202, "rewards/rejected": -7.435463905334473, "step": 1504 }, { "epoch": 0.9829373826434811, "grad_norm": 33.30306187618995, "learning_rate": 1.7588192752669983e-10, "logits/chosen": -0.8797517418861389, "logits/rejected": -0.9536515474319458, "logps/chosen": -915.71240234375, "logps/rejected": -1117.3577880859375, "loss": 0.4076, "rewards/accuracies": 0.875, "rewards/chosen": -5.108963489532471, "rewards/margins": 1.6557157039642334, "rewards/rejected": -6.764679431915283, "step": 1505 }, { "epoch": 0.9835904971834436, "grad_norm": 19.205264223495828, "learning_rate": 1.6261632638419064e-10, "logits/chosen": -0.8795767426490784, "logits/rejected": -0.6969910860061646, "logps/chosen": -948.1537475585938, "logps/rejected": -1066.2906494140625, "loss": 0.33, "rewards/accuracies": 0.90625, "rewards/chosen": -4.414488792419434, "rewards/margins": 1.6049293279647827, "rewards/rejected": -6.019418239593506, "step": 1506 }, { "epoch": 0.984243611723406, "grad_norm": 20.23091846177532, "learning_rate": 1.498703919172506e-10, "logits/chosen": -0.8670247197151184, "logits/rejected": -0.86240154504776, "logps/chosen": -927.6676025390625, "logps/rejected": -1164.9281005859375, "loss": 0.3854, "rewards/accuracies": 0.9375, "rewards/chosen": -4.659823417663574, "rewards/margins": 2.259251117706299, "rewards/rejected": -6.919074058532715, "step": 1507 }, { "epoch": 0.9848967262633684, "grad_norm": 19.609976871290833, "learning_rate": 1.3764419047014307e-10, "logits/chosen": -0.9074758291244507, "logits/rejected": -0.7539640069007874, "logps/chosen": -1016.4730834960938, "logps/rejected": -1181.7667236328125, "loss": 0.3259, "rewards/accuracies": 0.875, "rewards/chosen": -5.008094310760498, "rewards/margins": 2.28283429145813, "rewards/rejected": -7.290928840637207, "step": 1508 }, { "epoch": 0.9855498408033309, "grad_norm": 17.954953795909553, "learning_rate": 1.259377856818622e-10, "logits/chosen": -0.8041942119598389, "logits/rejected": -0.7212069630622864, "logps/chosen": -987.7574462890625, "logps/rejected": -1300.8795166015625, "loss": 0.3057, "rewards/accuracies": 0.96875, "rewards/chosen": -5.226101398468018, "rewards/margins": 2.5325069427490234, "rewards/rejected": -7.758607864379883, "step": 1509 }, { "epoch": 0.9862029553432933, "grad_norm": 35.49175552313481, "learning_rate": 1.147512384857663e-10, "logits/chosen": -0.8118208646774292, "logits/rejected": -0.8001708984375, "logps/chosen": -935.20458984375, "logps/rejected": -1067.834716796875, "loss": 0.3763, "rewards/accuracies": 0.78125, "rewards/chosen": -5.038346767425537, "rewards/margins": 1.3084391355514526, "rewards/rejected": -6.346785545349121, "step": 1510 }, { "epoch": 0.9868560698832558, "grad_norm": 21.866661204153374, "learning_rate": 1.0408460710930045e-10, "logits/chosen": -0.6990280747413635, "logits/rejected": -0.8841565847396851, "logps/chosen": -953.3377685546875, "logps/rejected": -1383.275390625, "loss": 0.359, "rewards/accuracies": 0.875, "rewards/chosen": -4.795334339141846, "rewards/margins": 2.9013049602508545, "rewards/rejected": -7.696639537811279, "step": 1511 }, { "epoch": 0.9875091844232182, "grad_norm": 30.07481661009675, "learning_rate": 9.393794707368563e-11, "logits/chosen": -1.0245449542999268, "logits/rejected": -0.992201566696167, "logps/chosen": -1000.864501953125, "logps/rejected": -1175.2371826171875, "loss": 0.4041, "rewards/accuracies": 0.8125, "rewards/chosen": -5.519082069396973, "rewards/margins": 1.6422016620635986, "rewards/rejected": -7.161284446716309, "step": 1512 }, { "epoch": 0.9881622989631806, "grad_norm": 17.369917763436096, "learning_rate": 8.43113111936189e-11, "logits/chosen": -0.7805638909339905, "logits/rejected": -0.8056908845901489, "logps/chosen": -908.6342163085938, "logps/rejected": -1338.196044921875, "loss": 0.3133, "rewards/accuracies": 0.9375, "rewards/chosen": -4.329880237579346, "rewards/margins": 3.113621711730957, "rewards/rejected": -7.4435014724731445, "step": 1513 }, { "epoch": 0.9888154135031431, "grad_norm": 35.69657571556223, "learning_rate": 7.520474957699585e-11, "logits/chosen": -0.7891541123390198, "logits/rejected": -0.8165507912635803, "logps/chosen": -932.488037109375, "logps/rejected": -1186.4473876953125, "loss": 0.4759, "rewards/accuracies": 0.90625, "rewards/chosen": -4.790083885192871, "rewards/margins": 2.200594425201416, "rewards/rejected": -6.990677833557129, "step": 1514 }, { "epoch": 0.9894685280431056, "grad_norm": 18.050312335076022, "learning_rate": 6.661830962466641e-11, "logits/chosen": -0.8305252194404602, "logits/rejected": -0.8962659239768982, "logps/chosen": -964.072021484375, "logps/rejected": -1240.065673828125, "loss": 0.3223, "rewards/accuracies": 0.9375, "rewards/chosen": -4.888925552368164, "rewards/margins": 2.5497281551361084, "rewards/rejected": -7.438653945922852, "step": 1515 }, { "epoch": 0.990121642583068, "grad_norm": 17.478159809130133, "learning_rate": 5.855203603017945e-11, "logits/chosen": -1.1193163394927979, "logits/rejected": -1.0706626176834106, "logps/chosen": -1132.2928466796875, "logps/rejected": -1322.29248046875, "loss": 0.3302, "rewards/accuracies": 0.8125, "rewards/chosen": -6.359055995941162, "rewards/margins": 1.94695246219635, "rewards/rejected": -8.306008338928223, "step": 1516 }, { "epoch": 0.9907747571230304, "grad_norm": 32.57656802641893, "learning_rate": 5.10059707795496e-11, "logits/chosen": -0.9936408400535583, "logits/rejected": -0.9797040820121765, "logps/chosen": -970.2620849609375, "logps/rejected": -1146.864501953125, "loss": 0.4084, "rewards/accuracies": 0.875, "rewards/chosen": -5.244019508361816, "rewards/margins": 1.6731963157653809, "rewards/rejected": -6.917215824127197, "step": 1517 }, { "epoch": 0.9914278716629928, "grad_norm": 17.499162801776325, "learning_rate": 4.398015315103531e-11, "logits/chosen": -0.9143285155296326, "logits/rejected": -0.8588674068450928, "logps/chosen": -989.8718872070312, "logps/rejected": -1191.24072265625, "loss": 0.3518, "rewards/accuracies": 0.96875, "rewards/chosen": -5.379557132720947, "rewards/margins": 1.8680692911148071, "rewards/rejected": -7.247626304626465, "step": 1518 }, { "epoch": 0.9920809862029554, "grad_norm": 18.95929812206617, "learning_rate": 3.7474619714927827e-11, "logits/chosen": -0.8051342964172363, "logits/rejected": -0.89119952917099, "logps/chosen": -986.9842529296875, "logps/rejected": -1458.8433837890625, "loss": 0.3519, "rewards/accuracies": 0.84375, "rewards/chosen": -4.907092571258545, "rewards/margins": 3.5196523666381836, "rewards/rejected": -8.42674446105957, "step": 1519 }, { "epoch": 0.9927341007429178, "grad_norm": 14.665306889160593, "learning_rate": 3.148940433339575e-11, "logits/chosen": -0.8779653310775757, "logits/rejected": -0.7155288457870483, "logps/chosen": -1045.592041015625, "logps/rejected": -1308.4227294921875, "loss": 0.3158, "rewards/accuracies": 0.9375, "rewards/chosen": -5.181655406951904, "rewards/margins": 2.8715524673461914, "rewards/rejected": -8.053207397460938, "step": 1520 }, { "epoch": 0.9933872152828802, "grad_norm": 18.0393916964981, "learning_rate": 2.6024538160251962e-11, "logits/chosen": -0.9173994660377502, "logits/rejected": -0.8805586099624634, "logps/chosen": -879.5809326171875, "logps/rejected": -1108.05078125, "loss": 0.3632, "rewards/accuracies": 0.9375, "rewards/chosen": -4.384559154510498, "rewards/margins": 1.4541279077529907, "rewards/rejected": -5.838687419891357, "step": 1521 }, { "epoch": 0.9940403298228426, "grad_norm": 29.385526588038253, "learning_rate": 2.1080049640864738e-11, "logits/chosen": -1.0499584674835205, "logits/rejected": -1.0689440965652466, "logps/chosen": -925.3338623046875, "logps/rejected": -1172.06298828125, "loss": 0.3628, "rewards/accuracies": 0.8125, "rewards/chosen": -4.823688507080078, "rewards/margins": 2.227992296218872, "rewards/rejected": -7.051680564880371, "step": 1522 }, { "epoch": 0.9946934443628052, "grad_norm": 27.358046262023528, "learning_rate": 1.665596451193574e-11, "logits/chosen": -0.901128888130188, "logits/rejected": -0.6968897581100464, "logps/chosen": -931.8582763671875, "logps/rejected": -1069.2611083984375, "loss": 0.3809, "rewards/accuracies": 0.8125, "rewards/chosen": -4.321289539337158, "rewards/margins": 2.0190200805664062, "rewards/rejected": -6.340309143066406, "step": 1523 }, { "epoch": 0.9953465589027676, "grad_norm": 45.310244069027156, "learning_rate": 1.2752305801400077e-11, "logits/chosen": -0.9536406993865967, "logits/rejected": -0.9706865549087524, "logps/chosen": -1034.489501953125, "logps/rejected": -1205.436767578125, "loss": 0.399, "rewards/accuracies": 0.84375, "rewards/chosen": -5.685507774353027, "rewards/margins": 1.6740481853485107, "rewards/rejected": -7.359556198120117, "step": 1524 }, { "epoch": 0.99599967344273, "grad_norm": 27.900190174903972, "learning_rate": 9.369093828326402e-12, "logits/chosen": -0.8810493350028992, "logits/rejected": -0.810979962348938, "logps/chosen": -970.609130859375, "logps/rejected": -1154.4971923828125, "loss": 0.2934, "rewards/accuracies": 0.875, "rewards/chosen": -4.879640102386475, "rewards/margins": 1.8229658603668213, "rewards/rejected": -6.702606201171875, "step": 1525 }, { "epoch": 0.9966527879826924, "grad_norm": 34.411440821011766, "learning_rate": 6.506346202772572e-12, "logits/chosen": -0.8746610879898071, "logits/rejected": -0.8008745908737183, "logps/chosen": -990.9203491210938, "logps/rejected": -1058.67333984375, "loss": 0.4343, "rewards/accuracies": 0.625, "rewards/chosen": -5.3791985511779785, "rewards/margins": 0.6661874055862427, "rewards/rejected": -6.045385837554932, "step": 1526 }, { "epoch": 0.9973059025226549, "grad_norm": 31.336013670265675, "learning_rate": 4.164077825707934e-12, "logits/chosen": -0.9213491678237915, "logits/rejected": -0.7777243852615356, "logps/chosen": -913.4546508789062, "logps/rejected": -1077.456787109375, "loss": 0.4079, "rewards/accuracies": 0.78125, "rewards/chosen": -4.575063705444336, "rewards/margins": 1.3315820693969727, "rewards/rejected": -5.906645774841309, "step": 1527 }, { "epoch": 0.9979590170626174, "grad_norm": 23.47986482520017, "learning_rate": 2.3423008889467134e-12, "logits/chosen": -0.9849967360496521, "logits/rejected": -0.9053511619567871, "logps/chosen": -968.4962158203125, "logps/rejected": -1164.7457275390625, "loss": 0.3579, "rewards/accuracies": 0.8125, "rewards/chosen": -4.731381416320801, "rewards/margins": 2.134218215942383, "rewards/rejected": -6.865600109100342, "step": 1528 }, { "epoch": 0.9986121316025798, "grad_norm": 20.62158211293607, "learning_rate": 1.0410248750925e-12, "logits/chosen": -0.8682329654693604, "logits/rejected": -0.8483768105506897, "logps/chosen": -1060.15673828125, "logps/rejected": -1233.65478515625, "loss": 0.4167, "rewards/accuracies": 0.84375, "rewards/chosen": -5.726959228515625, "rewards/margins": 1.6892329454421997, "rewards/rejected": -7.416192054748535, "step": 1529 }, { "epoch": 0.9992652461425422, "grad_norm": 23.44058985395152, "learning_rate": 2.6025655743833196e-13, "logits/chosen": -0.7166305184364319, "logits/rejected": -0.8835107684135437, "logps/chosen": -1093.7403564453125, "logps/rejected": -1359.48291015625, "loss": 0.3416, "rewards/accuracies": 0.84375, "rewards/chosen": -5.947076797485352, "rewards/margins": 1.7625958919525146, "rewards/rejected": -7.709671974182129, "step": 1530 }, { "epoch": 0.9999183606825047, "grad_norm": 18.690941266109423, "learning_rate": 0.0, "logits/chosen": -0.8586152195930481, "logits/rejected": -0.7749535441398621, "logps/chosen": -904.07470703125, "logps/rejected": -1040.50537109375, "loss": 0.352, "rewards/accuracies": 0.78125, "rewards/chosen": -4.729012489318848, "rewards/margins": 1.549518346786499, "rewards/rejected": -6.278531074523926, "step": 1531 }, { "epoch": 0.9999183606825047, "step": 1531, "total_flos": 0.0, "train_loss": 0.46421666473520107, "train_runtime": 86694.5826, "train_samples_per_second": 2.261, "train_steps_per_second": 0.018 } ], "logging_steps": 1, "max_steps": 1531, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }