PyTorch
llama
alignment-handbook
Generated from Trainer
Mamba2InLlama_0_875 / trainer_state.json
Junxiong Wang
add models
e5a6ffa
raw
history blame contribute delete
No virus
223 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 2000,
"global_step": 4168,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002399232245681382,
"grad_norm": 20.90535270812656,
"learning_rate": 1.199040767386091e-09,
"logits/chosen": -0.48379573225975037,
"logits/rejected": -0.48017197847366333,
"logps/chosen": -250.1331329345703,
"logps/rejected": -232.6839141845703,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.0023992322456813818,
"grad_norm": 20.316799458165775,
"learning_rate": 1.199040767386091e-08,
"logits/chosen": -0.4963577091693878,
"logits/rejected": -0.5276286005973816,
"logps/chosen": -441.6046142578125,
"logps/rejected": -363.4785461425781,
"loss": 0.693,
"rewards/accuracies": 0.4722222089767456,
"rewards/chosen": 0.0010866652010008693,
"rewards/margins": 0.0004253386869095266,
"rewards/rejected": 0.0006613265140913427,
"step": 10
},
{
"epoch": 0.0047984644913627635,
"grad_norm": 20.614630611685648,
"learning_rate": 2.398081534772182e-08,
"logits/chosen": -0.5616664290428162,
"logits/rejected": -0.5348426103591919,
"logps/chosen": -311.93389892578125,
"logps/rejected": -278.0029602050781,
"loss": 0.6934,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": 0.00013055796443950385,
"rewards/margins": 0.0002189161314163357,
"rewards/rejected": -8.835792687023059e-05,
"step": 20
},
{
"epoch": 0.007197696737044146,
"grad_norm": 19.284588490453608,
"learning_rate": 3.597122302158273e-08,
"logits/chosen": -0.5190974473953247,
"logits/rejected": -0.5706892013549805,
"logps/chosen": -319.748779296875,
"logps/rejected": -331.7994689941406,
"loss": 0.6931,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0010061769280582666,
"rewards/margins": 0.0013537806225940585,
"rewards/rejected": -0.0003476037527434528,
"step": 30
},
{
"epoch": 0.009596928982725527,
"grad_norm": 19.442113506121437,
"learning_rate": 4.796163069544364e-08,
"logits/chosen": -0.5577880144119263,
"logits/rejected": -0.5859715938568115,
"logps/chosen": -338.12628173828125,
"logps/rejected": -314.81982421875,
"loss": 0.6928,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.00019955830066464841,
"rewards/margins": -5.653758853441104e-05,
"rewards/rejected": 0.000256095954682678,
"step": 40
},
{
"epoch": 0.01199616122840691,
"grad_norm": 20.77025303650937,
"learning_rate": 5.995203836930455e-08,
"logits/chosen": -0.5782157182693481,
"logits/rejected": -0.5549123287200928,
"logps/chosen": -335.87646484375,
"logps/rejected": -289.2035217285156,
"loss": 0.6934,
"rewards/accuracies": 0.42500001192092896,
"rewards/chosen": 0.0005626773927360773,
"rewards/margins": -0.00042233389103785157,
"rewards/rejected": 0.0009850109927356243,
"step": 50
},
{
"epoch": 0.014395393474088292,
"grad_norm": 19.993953212894812,
"learning_rate": 7.194244604316546e-08,
"logits/chosen": -0.5433920621871948,
"logits/rejected": -0.49929919838905334,
"logps/chosen": -355.02740478515625,
"logps/rejected": -338.33148193359375,
"loss": 0.693,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.0006808604812249541,
"rewards/margins": -0.0011384403333067894,
"rewards/rejected": 0.0004575795610435307,
"step": 60
},
{
"epoch": 0.016794625719769675,
"grad_norm": 17.979585480540507,
"learning_rate": 8.393285371702638e-08,
"logits/chosen": -0.49723702669143677,
"logits/rejected": -0.48305654525756836,
"logps/chosen": -353.8045654296875,
"logps/rejected": -327.27716064453125,
"loss": 0.6927,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": 0.002497387584298849,
"rewards/margins": 0.0025346879847347736,
"rewards/rejected": -3.7300120311556384e-05,
"step": 70
},
{
"epoch": 0.019193857965451054,
"grad_norm": 25.26789980160209,
"learning_rate": 9.592326139088728e-08,
"logits/chosen": -0.5563893914222717,
"logits/rejected": -0.4884260594844818,
"logps/chosen": -261.504638671875,
"logps/rejected": -315.64349365234375,
"loss": 0.6929,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.002656942466273904,
"rewards/margins": 0.0009416007669642568,
"rewards/rejected": 0.0017153415828943253,
"step": 80
},
{
"epoch": 0.021593090211132437,
"grad_norm": 18.57391434910598,
"learning_rate": 1.0791366906474819e-07,
"logits/chosen": -0.5545334815979004,
"logits/rejected": -0.5614916086196899,
"logps/chosen": -396.33416748046875,
"logps/rejected": -342.1172180175781,
"loss": 0.6924,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": 0.004076135344803333,
"rewards/margins": 0.0013626832515001297,
"rewards/rejected": 0.002713452558964491,
"step": 90
},
{
"epoch": 0.02399232245681382,
"grad_norm": 20.078943593256316,
"learning_rate": 1.199040767386091e-07,
"logits/chosen": -0.5230361819267273,
"logits/rejected": -0.4858153760433197,
"logps/chosen": -326.3655090332031,
"logps/rejected": -351.19390869140625,
"loss": 0.6923,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.0061579798348248005,
"rewards/margins": 0.001345540746115148,
"rewards/rejected": 0.004812438972294331,
"step": 100
},
{
"epoch": 0.026391554702495202,
"grad_norm": 20.875381676857184,
"learning_rate": 1.3189448441247004e-07,
"logits/chosen": -0.5459330677986145,
"logits/rejected": -0.5579243898391724,
"logps/chosen": -287.3791198730469,
"logps/rejected": -290.72393798828125,
"loss": 0.6915,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.009638044983148575,
"rewards/margins": 0.00030891623464412987,
"rewards/rejected": 0.009329128079116344,
"step": 110
},
{
"epoch": 0.028790786948176585,
"grad_norm": 20.107786625562355,
"learning_rate": 1.4388489208633092e-07,
"logits/chosen": -0.5211232900619507,
"logits/rejected": -0.5448856353759766,
"logps/chosen": -363.11431884765625,
"logps/rejected": -348.9471435546875,
"loss": 0.6917,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.014917564578354359,
"rewards/margins": 0.003776032943278551,
"rewards/rejected": 0.01114153116941452,
"step": 120
},
{
"epoch": 0.031190019193857964,
"grad_norm": 17.419578673944976,
"learning_rate": 1.5587529976019183e-07,
"logits/chosen": -0.5541412830352783,
"logits/rejected": -0.5439847707748413,
"logps/chosen": -273.02838134765625,
"logps/rejected": -365.01483154296875,
"loss": 0.6901,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.019415050745010376,
"rewards/margins": 0.00881609134376049,
"rewards/rejected": 0.01059896033257246,
"step": 130
},
{
"epoch": 0.03358925143953935,
"grad_norm": 20.19183366811833,
"learning_rate": 1.6786570743405277e-07,
"logits/chosen": -0.45227426290512085,
"logits/rejected": -0.45624417066574097,
"logps/chosen": -366.0704040527344,
"logps/rejected": -355.80474853515625,
"loss": 0.6892,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.015087930485606194,
"rewards/margins": 0.005366227589547634,
"rewards/rejected": 0.009721704758703709,
"step": 140
},
{
"epoch": 0.03598848368522073,
"grad_norm": 19.066102175382554,
"learning_rate": 1.7985611510791365e-07,
"logits/chosen": -0.5239461064338684,
"logits/rejected": -0.5222934484481812,
"logps/chosen": -282.2486267089844,
"logps/rejected": -280.42718505859375,
"loss": 0.6902,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": 0.020530493929982185,
"rewards/margins": 0.0036348134744912386,
"rewards/rejected": 0.016895681619644165,
"step": 150
},
{
"epoch": 0.03838771593090211,
"grad_norm": 19.027643825440478,
"learning_rate": 1.9184652278177456e-07,
"logits/chosen": -0.46417126059532166,
"logits/rejected": -0.47142887115478516,
"logps/chosen": -372.260009765625,
"logps/rejected": -299.72418212890625,
"loss": 0.6861,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.024696629494428635,
"rewards/margins": 0.01829499378800392,
"rewards/rejected": 0.006401637103408575,
"step": 160
},
{
"epoch": 0.040786948176583494,
"grad_norm": 20.05176445155851,
"learning_rate": 2.038369304556355e-07,
"logits/chosen": -0.4728211760520935,
"logits/rejected": -0.4653477072715759,
"logps/chosen": -410.3612365722656,
"logps/rejected": -395.3166198730469,
"loss": 0.6874,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": 0.0359230674803257,
"rewards/margins": 0.016820725053548813,
"rewards/rejected": 0.019102338701486588,
"step": 170
},
{
"epoch": 0.04318618042226487,
"grad_norm": 19.717298290033078,
"learning_rate": 2.1582733812949638e-07,
"logits/chosen": -0.5537582039833069,
"logits/rejected": -0.5516412854194641,
"logps/chosen": -294.61224365234375,
"logps/rejected": -295.9138488769531,
"loss": 0.6861,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": 0.02052612230181694,
"rewards/margins": 0.01923990622162819,
"rewards/rejected": 0.0012862167786806822,
"step": 180
},
{
"epoch": 0.04558541266794626,
"grad_norm": 22.451123090776118,
"learning_rate": 2.278177458033573e-07,
"logits/chosen": -0.4669855535030365,
"logits/rejected": -0.46975016593933105,
"logps/chosen": -386.79052734375,
"logps/rejected": -322.21063232421875,
"loss": 0.6855,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": 0.0220388974994421,
"rewards/margins": 0.006877691484987736,
"rewards/rejected": 0.015161206014454365,
"step": 190
},
{
"epoch": 0.04798464491362764,
"grad_norm": 18.561601949682256,
"learning_rate": 2.398081534772182e-07,
"logits/chosen": -0.5395389199256897,
"logits/rejected": -0.4788607060909271,
"logps/chosen": -370.59832763671875,
"logps/rejected": -354.6778869628906,
"loss": 0.6811,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.026123318821191788,
"rewards/margins": 0.02886904776096344,
"rewards/rejected": -0.002745730336755514,
"step": 200
},
{
"epoch": 0.05038387715930902,
"grad_norm": 22.094980613810247,
"learning_rate": 2.517985611510791e-07,
"logits/chosen": -0.5375515818595886,
"logits/rejected": -0.547138512134552,
"logps/chosen": -304.6062927246094,
"logps/rejected": -330.10687255859375,
"loss": 0.6814,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.03125152364373207,
"rewards/margins": 0.03389766812324524,
"rewards/rejected": -0.002646142616868019,
"step": 210
},
{
"epoch": 0.052783109404990404,
"grad_norm": 18.489557098025607,
"learning_rate": 2.637889688249401e-07,
"logits/chosen": -0.5321250557899475,
"logits/rejected": -0.5411959886550903,
"logps/chosen": -391.7878723144531,
"logps/rejected": -377.56280517578125,
"loss": 0.6834,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.012276771478354931,
"rewards/margins": -0.0032779511529952288,
"rewards/rejected": 0.01555472332984209,
"step": 220
},
{
"epoch": 0.05518234165067178,
"grad_norm": 22.525575101088698,
"learning_rate": 2.7577937649880093e-07,
"logits/chosen": -0.5341587066650391,
"logits/rejected": -0.5006336569786072,
"logps/chosen": -303.1698913574219,
"logps/rejected": -340.33331298828125,
"loss": 0.6738,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": 0.005457176826894283,
"rewards/margins": 0.024495940655469894,
"rewards/rejected": -0.019038762897253036,
"step": 230
},
{
"epoch": 0.05758157389635317,
"grad_norm": 20.562033234462188,
"learning_rate": 2.8776978417266184e-07,
"logits/chosen": -0.5429738759994507,
"logits/rejected": -0.5385856032371521,
"logps/chosen": -357.30609130859375,
"logps/rejected": -311.60260009765625,
"loss": 0.6723,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": 0.030288681387901306,
"rewards/margins": 0.048918746411800385,
"rewards/rejected": -0.01863006316125393,
"step": 240
},
{
"epoch": 0.05998080614203455,
"grad_norm": 22.776979550503004,
"learning_rate": 2.997601918465228e-07,
"logits/chosen": -0.5102118253707886,
"logits/rejected": -0.5135980844497681,
"logps/chosen": -294.0608825683594,
"logps/rejected": -275.83673095703125,
"loss": 0.6674,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.005173470359295607,
"rewards/margins": 0.06198770925402641,
"rewards/rejected": -0.05681424215435982,
"step": 250
},
{
"epoch": 0.06238003838771593,
"grad_norm": 21.890329640256528,
"learning_rate": 3.1175059952038366e-07,
"logits/chosen": -0.5791837573051453,
"logits/rejected": -0.5334831476211548,
"logps/chosen": -353.4739074707031,
"logps/rejected": -343.4547119140625,
"loss": 0.6719,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.02404799312353134,
"rewards/margins": 0.04426788166165352,
"rewards/rejected": -0.06831587105989456,
"step": 260
},
{
"epoch": 0.0647792706333973,
"grad_norm": 20.523920744785585,
"learning_rate": 3.2374100719424457e-07,
"logits/chosen": -0.49191370606422424,
"logits/rejected": -0.5529422163963318,
"logps/chosen": -347.00494384765625,
"logps/rejected": -282.3544921875,
"loss": 0.6609,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.03170743212103844,
"rewards/margins": 0.039711810648441315,
"rewards/rejected": -0.07141923159360886,
"step": 270
},
{
"epoch": 0.0671785028790787,
"grad_norm": 20.886438274884952,
"learning_rate": 3.3573141486810554e-07,
"logits/chosen": -0.5886783599853516,
"logits/rejected": -0.5640865564346313,
"logps/chosen": -364.08575439453125,
"logps/rejected": -354.1321105957031,
"loss": 0.6504,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.020602982491254807,
"rewards/margins": 0.0775846317410469,
"rewards/rejected": -0.09818761050701141,
"step": 280
},
{
"epoch": 0.06957773512476008,
"grad_norm": 19.345700277409666,
"learning_rate": 3.477218225419664e-07,
"logits/chosen": -0.5530000329017639,
"logits/rejected": -0.5117976665496826,
"logps/chosen": -350.86199951171875,
"logps/rejected": -327.6963806152344,
"loss": 0.6513,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.05797697231173515,
"rewards/margins": 0.062293171882629395,
"rewards/rejected": -0.12027014791965485,
"step": 290
},
{
"epoch": 0.07197696737044146,
"grad_norm": 22.275769826792928,
"learning_rate": 3.597122302158273e-07,
"logits/chosen": -0.6038728952407837,
"logits/rejected": -0.6336754560470581,
"logps/chosen": -332.75714111328125,
"logps/rejected": -356.898193359375,
"loss": 0.6532,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.10048200935125351,
"rewards/margins": 0.09853404760360718,
"rewards/rejected": -0.1990160346031189,
"step": 300
},
{
"epoch": 0.07437619961612284,
"grad_norm": 22.363116052564926,
"learning_rate": 3.7170263788968827e-07,
"logits/chosen": -0.5675481557846069,
"logits/rejected": -0.6176060438156128,
"logps/chosen": -353.2454833984375,
"logps/rejected": -325.49066162109375,
"loss": 0.6582,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.0909217894077301,
"rewards/margins": 0.13223211467266083,
"rewards/rejected": -0.22315391898155212,
"step": 310
},
{
"epoch": 0.07677543186180422,
"grad_norm": 24.95798686492851,
"learning_rate": 3.836930455635491e-07,
"logits/chosen": -0.6077001094818115,
"logits/rejected": -0.609139621257782,
"logps/chosen": -343.24127197265625,
"logps/rejected": -309.5650634765625,
"loss": 0.6432,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.055519819259643555,
"rewards/margins": 0.10148320347070694,
"rewards/rejected": -0.1570030152797699,
"step": 320
},
{
"epoch": 0.07917466410748561,
"grad_norm": 21.14907440323966,
"learning_rate": 3.9568345323741003e-07,
"logits/chosen": -0.5618354082107544,
"logits/rejected": -0.5163384079933167,
"logps/chosen": -333.4284973144531,
"logps/rejected": -383.4358825683594,
"loss": 0.6351,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.13757416605949402,
"rewards/margins": 0.1750974953174591,
"rewards/rejected": -0.3126716911792755,
"step": 330
},
{
"epoch": 0.08157389635316699,
"grad_norm": 23.505161956514563,
"learning_rate": 4.07673860911271e-07,
"logits/chosen": -0.5577572584152222,
"logits/rejected": -0.5682773590087891,
"logps/chosen": -311.04046630859375,
"logps/rejected": -350.06011962890625,
"loss": 0.6331,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.10792098939418793,
"rewards/margins": 0.2111283242702484,
"rewards/rejected": -0.31904932856559753,
"step": 340
},
{
"epoch": 0.08397312859884837,
"grad_norm": 24.141694779806222,
"learning_rate": 4.1966426858513185e-07,
"logits/chosen": -0.6674095392227173,
"logits/rejected": -0.6525458097457886,
"logps/chosen": -385.8694152832031,
"logps/rejected": -387.1976013183594,
"loss": 0.6451,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.24417324364185333,
"rewards/margins": 0.12837204337120056,
"rewards/rejected": -0.3725453317165375,
"step": 350
},
{
"epoch": 0.08637236084452975,
"grad_norm": 25.804069948612213,
"learning_rate": 4.3165467625899276e-07,
"logits/chosen": -0.5833398699760437,
"logits/rejected": -0.6397580504417419,
"logps/chosen": -350.5534973144531,
"logps/rejected": -299.1941833496094,
"loss": 0.6415,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2430131882429123,
"rewards/margins": 0.10048626363277435,
"rewards/rejected": -0.34349945187568665,
"step": 360
},
{
"epoch": 0.08877159309021113,
"grad_norm": 31.03793475076566,
"learning_rate": 4.436450839328537e-07,
"logits/chosen": -0.5922696590423584,
"logits/rejected": -0.5713749527931213,
"logps/chosen": -338.27667236328125,
"logps/rejected": -366.77166748046875,
"loss": 0.6267,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.27600157260894775,
"rewards/margins": 0.22378632426261902,
"rewards/rejected": -0.49978795647621155,
"step": 370
},
{
"epoch": 0.09117082533589252,
"grad_norm": 21.29384938198475,
"learning_rate": 4.556354916067146e-07,
"logits/chosen": -0.6022308468818665,
"logits/rejected": -0.5682617425918579,
"logps/chosen": -323.4892272949219,
"logps/rejected": -347.95111083984375,
"loss": 0.6042,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.2343587428331375,
"rewards/margins": 0.22317072749137878,
"rewards/rejected": -0.4575294554233551,
"step": 380
},
{
"epoch": 0.0935700575815739,
"grad_norm": 23.74603674013515,
"learning_rate": 4.676258992805755e-07,
"logits/chosen": -0.5804970860481262,
"logits/rejected": -0.5728699564933777,
"logps/chosen": -381.6591796875,
"logps/rejected": -358.8669128417969,
"loss": 0.6172,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3958897590637207,
"rewards/margins": 0.13457268476486206,
"rewards/rejected": -0.5304625034332275,
"step": 390
},
{
"epoch": 0.09596928982725528,
"grad_norm": 22.488076430906265,
"learning_rate": 4.796163069544364e-07,
"logits/chosen": -0.6037659049034119,
"logits/rejected": -0.6473450660705566,
"logps/chosen": -350.5100402832031,
"logps/rejected": -356.1009826660156,
"loss": 0.6226,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.3837326467037201,
"rewards/margins": 0.2828107476234436,
"rewards/rejected": -0.6665433645248413,
"step": 400
},
{
"epoch": 0.09836852207293666,
"grad_norm": 28.205747441162394,
"learning_rate": 4.916067146282974e-07,
"logits/chosen": -0.6226581335067749,
"logits/rejected": -0.606611430644989,
"logps/chosen": -347.90966796875,
"logps/rejected": -401.1394958496094,
"loss": 0.6069,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.3836399018764496,
"rewards/margins": 0.22059115767478943,
"rewards/rejected": -0.6042311191558838,
"step": 410
},
{
"epoch": 0.10076775431861804,
"grad_norm": 24.57218142171684,
"learning_rate": 4.999992108529978e-07,
"logits/chosen": -0.5291169881820679,
"logits/rejected": -0.5468065142631531,
"logps/chosen": -444.72589111328125,
"logps/rejected": -439.670654296875,
"loss": 0.5982,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.48520898818969727,
"rewards/margins": 0.3270217478275299,
"rewards/rejected": -0.8122307062149048,
"step": 420
},
{
"epoch": 0.10316698656429943,
"grad_norm": 34.823244275804946,
"learning_rate": 4.999851817115532e-07,
"logits/chosen": -0.6540865302085876,
"logits/rejected": -0.5904898047447205,
"logps/chosen": -351.2285461425781,
"logps/rejected": -386.56890869140625,
"loss": 0.6093,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4589855670928955,
"rewards/margins": 0.3846796751022339,
"rewards/rejected": -0.8436653017997742,
"step": 430
},
{
"epoch": 0.10556621880998081,
"grad_norm": 33.199263240349794,
"learning_rate": 4.999536171027889e-07,
"logits/chosen": -0.5496717691421509,
"logits/rejected": -0.5985559225082397,
"logps/chosen": -409.6986389160156,
"logps/rejected": -411.371826171875,
"loss": 0.604,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5133577585220337,
"rewards/margins": 0.20064587891101837,
"rewards/rejected": -0.7140035629272461,
"step": 440
},
{
"epoch": 0.10796545105566219,
"grad_norm": 28.95788929645283,
"learning_rate": 4.999045192408369e-07,
"logits/chosen": -0.5078392028808594,
"logits/rejected": -0.4781821370124817,
"logps/chosen": -352.42578125,
"logps/rejected": -345.4123840332031,
"loss": 0.6122,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.5539526343345642,
"rewards/margins": 0.1369965374469757,
"rewards/rejected": -0.6909492611885071,
"step": 450
},
{
"epoch": 0.11036468330134357,
"grad_norm": 24.918675200058328,
"learning_rate": 4.998378915697171e-07,
"logits/chosen": -0.5960583090782166,
"logits/rejected": -0.5872009992599487,
"logps/chosen": -367.5823669433594,
"logps/rejected": -395.4332580566406,
"loss": 0.583,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.3061702251434326,
"rewards/margins": 0.42768678069114685,
"rewards/rejected": -0.7338569164276123,
"step": 460
},
{
"epoch": 0.11276391554702495,
"grad_norm": 24.386517807951574,
"learning_rate": 4.997537387630958e-07,
"logits/chosen": -0.5429798364639282,
"logits/rejected": -0.5464817881584167,
"logps/chosen": -310.02203369140625,
"logps/rejected": -340.80865478515625,
"loss": 0.5738,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.4528660178184509,
"rewards/margins": 0.3107239603996277,
"rewards/rejected": -0.7635899782180786,
"step": 470
},
{
"epoch": 0.11516314779270634,
"grad_norm": 27.255184142896073,
"learning_rate": 4.996520667239582e-07,
"logits/chosen": -0.6526015996932983,
"logits/rejected": -0.6507179737091064,
"logps/chosen": -353.07098388671875,
"logps/rejected": -445.12237548828125,
"loss": 0.5705,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6045628786087036,
"rewards/margins": 0.3956468403339386,
"rewards/rejected": -1.0002095699310303,
"step": 480
},
{
"epoch": 0.11756238003838772,
"grad_norm": 32.84348746795199,
"learning_rate": 4.995328825841939e-07,
"logits/chosen": -0.4966016709804535,
"logits/rejected": -0.49989452958106995,
"logps/chosen": -317.2383117675781,
"logps/rejected": -374.27508544921875,
"loss": 0.5899,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4817740321159363,
"rewards/margins": 0.5593485236167908,
"rewards/rejected": -1.041122555732727,
"step": 490
},
{
"epoch": 0.1199616122840691,
"grad_norm": 29.672640234170935,
"learning_rate": 4.993961947040967e-07,
"logits/chosen": -0.525520920753479,
"logits/rejected": -0.5563070178031921,
"logps/chosen": -427.6673889160156,
"logps/rejected": -412.65008544921875,
"loss": 0.5935,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7312201261520386,
"rewards/margins": 0.3187289237976074,
"rewards/rejected": -1.0499489307403564,
"step": 500
},
{
"epoch": 0.12236084452975048,
"grad_norm": 27.84738833817779,
"learning_rate": 4.992420126717784e-07,
"logits/chosen": -0.5528146028518677,
"logits/rejected": -0.5479222536087036,
"logps/chosen": -356.24041748046875,
"logps/rejected": -422.6918029785156,
"loss": 0.5781,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.4140992760658264,
"rewards/margins": 0.6510533094406128,
"rewards/rejected": -1.065152645111084,
"step": 510
},
{
"epoch": 0.12476007677543186,
"grad_norm": 32.48041619734842,
"learning_rate": 4.990703473024958e-07,
"logits/chosen": -0.45184358954429626,
"logits/rejected": -0.48187708854675293,
"logps/chosen": -417.84405517578125,
"logps/rejected": -444.81353759765625,
"loss": 0.5991,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.7138451933860779,
"rewards/margins": 0.37000906467437744,
"rewards/rejected": -1.083854079246521,
"step": 520
},
{
"epoch": 0.12715930902111325,
"grad_norm": 28.128177801840295,
"learning_rate": 4.98881210637893e-07,
"logits/chosen": -0.42285671830177307,
"logits/rejected": -0.4019806385040283,
"logps/chosen": -320.9397277832031,
"logps/rejected": -411.93255615234375,
"loss": 0.5818,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.4967781603336334,
"rewards/margins": 0.4927561283111572,
"rewards/rejected": -0.9895342588424683,
"step": 530
},
{
"epoch": 0.1295585412667946,
"grad_norm": 21.381239649867126,
"learning_rate": 4.986746159451553e-07,
"logits/chosen": -0.29445725679397583,
"logits/rejected": -0.2827056646347046,
"logps/chosen": -360.28509521484375,
"logps/rejected": -394.31768798828125,
"loss": 0.5976,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.4901258945465088,
"rewards/margins": 0.35657569766044617,
"rewards/rejected": -0.8467016220092773,
"step": 540
},
{
"epoch": 0.131957773512476,
"grad_norm": 23.053578304971253,
"learning_rate": 4.984505777160795e-07,
"logits/chosen": -0.2335212230682373,
"logits/rejected": -0.2651960253715515,
"logps/chosen": -433.4956970214844,
"logps/rejected": -464.2955017089844,
"loss": 0.5984,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6037947535514832,
"rewards/margins": 0.3152288496494293,
"rewards/rejected": -0.9190236330032349,
"step": 550
},
{
"epoch": 0.1343570057581574,
"grad_norm": 28.905417781337384,
"learning_rate": 4.982091116660574e-07,
"logits/chosen": -0.321607768535614,
"logits/rejected": -0.3338220715522766,
"logps/chosen": -305.93658447265625,
"logps/rejected": -300.59124755859375,
"loss": 0.6154,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5145214796066284,
"rewards/margins": 0.22794541716575623,
"rewards/rejected": -0.7424668669700623,
"step": 560
},
{
"epoch": 0.13675623800383876,
"grad_norm": 32.795386120218325,
"learning_rate": 4.979502347329732e-07,
"logits/chosen": -0.23663392663002014,
"logits/rejected": -0.24166357517242432,
"logps/chosen": -423.2027282714844,
"logps/rejected": -491.15106201171875,
"loss": 0.5998,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6477493643760681,
"rewards/margins": 0.4360308051109314,
"rewards/rejected": -1.08378005027771,
"step": 570
},
{
"epoch": 0.13915547024952016,
"grad_norm": 37.84072378443296,
"learning_rate": 4.976739650760151e-07,
"logits/chosen": -0.29570311307907104,
"logits/rejected": -0.3070180118083954,
"logps/chosen": -375.17962646484375,
"logps/rejected": -388.2039489746094,
"loss": 0.586,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.4810148775577545,
"rewards/margins": 0.3502056300640106,
"rewards/rejected": -0.8312205076217651,
"step": 580
},
{
"epoch": 0.14155470249520152,
"grad_norm": 45.677768580981564,
"learning_rate": 4.97380322074402e-07,
"logits/chosen": -0.2370149791240692,
"logits/rejected": -0.25640061497688293,
"logps/chosen": -349.70941162109375,
"logps/rejected": -374.61456298828125,
"loss": 0.5895,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6930117607116699,
"rewards/margins": 0.2920604646205902,
"rewards/rejected": -0.985072135925293,
"step": 590
},
{
"epoch": 0.14395393474088292,
"grad_norm": 31.065347761695264,
"learning_rate": 4.970693263260237e-07,
"logits/chosen": -0.26885563135147095,
"logits/rejected": -0.3041172921657562,
"logps/chosen": -403.6191101074219,
"logps/rejected": -410.84967041015625,
"loss": 0.5853,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.5729845762252808,
"rewards/margins": 0.46888118982315063,
"rewards/rejected": -1.0418657064437866,
"step": 600
},
{
"epoch": 0.1463531669865643,
"grad_norm": 29.080698158567,
"learning_rate": 4.967409996459966e-07,
"logits/chosen": -0.2872675359249115,
"logits/rejected": -0.3306855261325836,
"logps/chosen": -405.076904296875,
"logps/rejected": -423.62664794921875,
"loss": 0.5751,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.49526625871658325,
"rewards/margins": 0.3949028551578522,
"rewards/rejected": -0.8901691436767578,
"step": 610
},
{
"epoch": 0.14875239923224567,
"grad_norm": 27.549771571534542,
"learning_rate": 4.963953650651326e-07,
"logits/chosen": -0.15485969185829163,
"logits/rejected": -0.16681411862373352,
"logps/chosen": -478.8113708496094,
"logps/rejected": -422.03955078125,
"loss": 0.569,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.63862544298172,
"rewards/margins": 0.40379634499549866,
"rewards/rejected": -1.042421817779541,
"step": 620
},
{
"epoch": 0.15115163147792707,
"grad_norm": 28.772933296866565,
"learning_rate": 4.960324468283248e-07,
"logits/chosen": -0.20728620886802673,
"logits/rejected": -0.2060108482837677,
"logps/chosen": -367.0924377441406,
"logps/rejected": -390.70458984375,
"loss": 0.5636,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8087286949157715,
"rewards/margins": 0.3034602999687195,
"rewards/rejected": -1.1121888160705566,
"step": 630
},
{
"epoch": 0.15355086372360843,
"grad_norm": 29.609222546231578,
"learning_rate": 4.956522703928451e-07,
"logits/chosen": -0.06690754741430283,
"logits/rejected": -0.06723584234714508,
"logps/chosen": -370.3538818359375,
"logps/rejected": -409.23065185546875,
"loss": 0.5409,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.788346529006958,
"rewards/margins": 0.4131564199924469,
"rewards/rejected": -1.2015029191970825,
"step": 640
},
{
"epoch": 0.15595009596928983,
"grad_norm": 38.41016264507651,
"learning_rate": 4.952548624265606e-07,
"logits/chosen": -0.03009071573615074,
"logits/rejected": 0.02059212513267994,
"logps/chosen": -436.8095703125,
"logps/rejected": -453.0166931152344,
"loss": 0.6085,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8183758854866028,
"rewards/margins": 0.35901501774787903,
"rewards/rejected": -1.1773908138275146,
"step": 650
},
{
"epoch": 0.15834932821497122,
"grad_norm": 25.869613582575887,
"learning_rate": 4.948402508060607e-07,
"logits/chosen": -0.0018309459555894136,
"logits/rejected": -0.01893061026930809,
"logps/chosen": -356.6624755859375,
"logps/rejected": -409.0708923339844,
"loss": 0.6026,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6256797313690186,
"rewards/margins": 0.601173460483551,
"rewards/rejected": -1.2268530130386353,
"step": 660
},
{
"epoch": 0.16074856046065258,
"grad_norm": 35.22680312796026,
"learning_rate": 4.944084646147038e-07,
"logits/chosen": 0.0020178346894681454,
"logits/rejected": 0.031680598855018616,
"logps/chosen": -452.8055114746094,
"logps/rejected": -465.51678466796875,
"loss": 0.5999,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.6981381177902222,
"rewards/margins": 0.3360704779624939,
"rewards/rejected": -1.0342086553573608,
"step": 670
},
{
"epoch": 0.16314779270633398,
"grad_norm": 28.0016914634874,
"learning_rate": 4.939595341405754e-07,
"logits/chosen": -0.039152443408966064,
"logits/rejected": -0.05885768681764603,
"logps/chosen": -401.278564453125,
"logps/rejected": -409.3609924316406,
"loss": 0.57,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7071236968040466,
"rewards/margins": 0.3430066704750061,
"rewards/rejected": -1.0501302480697632,
"step": 680
},
{
"epoch": 0.16554702495201534,
"grad_norm": 30.023172826044828,
"learning_rate": 4.93493490874365e-07,
"logits/chosen": -0.00025105997337959707,
"logits/rejected": 0.005772613920271397,
"logps/chosen": -390.638427734375,
"logps/rejected": -424.7112731933594,
"loss": 0.5461,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.7401353716850281,
"rewards/margins": 0.30253323912620544,
"rewards/rejected": -1.0426685810089111,
"step": 690
},
{
"epoch": 0.16794625719769674,
"grad_norm": 41.393690265481474,
"learning_rate": 4.93010367507156e-07,
"logits/chosen": -0.051719047129154205,
"logits/rejected": -0.06900392472743988,
"logps/chosen": -346.08837890625,
"logps/rejected": -374.042724609375,
"loss": 0.5537,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7158280611038208,
"rewards/margins": 0.5490631461143494,
"rewards/rejected": -1.264891266822815,
"step": 700
},
{
"epoch": 0.17034548944337813,
"grad_norm": 33.6357655925115,
"learning_rate": 4.925101979281332e-07,
"logits/chosen": 0.02222558856010437,
"logits/rejected": 0.006278800778090954,
"logps/chosen": -424.63726806640625,
"logps/rejected": -441.04644775390625,
"loss": 0.5799,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.6551335453987122,
"rewards/margins": 0.6523554921150208,
"rewards/rejected": -1.3074891567230225,
"step": 710
},
{
"epoch": 0.1727447216890595,
"grad_norm": 32.69679455555672,
"learning_rate": 4.919930172222054e-07,
"logits/chosen": -0.12917150557041168,
"logits/rejected": -0.12720082700252533,
"logps/chosen": -402.8379821777344,
"logps/rejected": -441.12677001953125,
"loss": 0.5295,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7797117829322815,
"rewards/margins": 0.4280470311641693,
"rewards/rejected": -1.2077586650848389,
"step": 720
},
{
"epoch": 0.1751439539347409,
"grad_norm": 38.43936411357028,
"learning_rate": 4.914588616675445e-07,
"logits/chosen": -0.17864573001861572,
"logits/rejected": -0.20761199295520782,
"logps/chosen": -344.26312255859375,
"logps/rejected": -408.573486328125,
"loss": 0.5897,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.555050253868103,
"rewards/margins": 0.5430852174758911,
"rewards/rejected": -1.098135232925415,
"step": 730
},
{
"epoch": 0.17754318618042225,
"grad_norm": 36.631671098915504,
"learning_rate": 4.909077687330404e-07,
"logits/chosen": -0.11447083950042725,
"logits/rejected": -0.09544442594051361,
"logps/chosen": -418.90838623046875,
"logps/rejected": -417.09844970703125,
"loss": 0.5492,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.7083614468574524,
"rewards/margins": 0.3665878176689148,
"rewards/rejected": -1.0749492645263672,
"step": 740
},
{
"epoch": 0.17994241842610365,
"grad_norm": 32.57669985590322,
"learning_rate": 4.903397770756729e-07,
"logits/chosen": -0.06074325367808342,
"logits/rejected": -0.08299403637647629,
"logps/chosen": -401.88800048828125,
"logps/rejected": -449.99169921875,
"loss": 0.5669,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.6976863145828247,
"rewards/margins": 0.6443861722946167,
"rewards/rejected": -1.342072606086731,
"step": 750
},
{
"epoch": 0.18234165067178504,
"grad_norm": 27.495851946761935,
"learning_rate": 4.897549265378004e-07,
"logits/chosen": -0.18077705800533295,
"logits/rejected": -0.15703561902046204,
"logps/chosen": -486.8914489746094,
"logps/rejected": -522.5025024414062,
"loss": 0.559,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.925071120262146,
"rewards/margins": 0.4355601668357849,
"rewards/rejected": -1.3606312274932861,
"step": 760
},
{
"epoch": 0.1847408829174664,
"grad_norm": 32.88428335628656,
"learning_rate": 4.891532581443643e-07,
"logits/chosen": -0.10509393364191055,
"logits/rejected": -0.13191482424736023,
"logps/chosen": -433.39697265625,
"logps/rejected": -510.9073791503906,
"loss": 0.5335,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.7138081789016724,
"rewards/margins": 0.8751919865608215,
"rewards/rejected": -1.5890003442764282,
"step": 770
},
{
"epoch": 0.1871401151631478,
"grad_norm": 34.828556425360944,
"learning_rate": 4.885348141000122e-07,
"logits/chosen": -0.02448561228811741,
"logits/rejected": -0.08334103226661682,
"logps/chosen": -395.90985107421875,
"logps/rejected": -471.1104431152344,
"loss": 0.56,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.878553569316864,
"rewards/margins": 0.5636481046676636,
"rewards/rejected": -1.4422016143798828,
"step": 780
},
{
"epoch": 0.18953934740882916,
"grad_norm": 34.102875954970415,
"learning_rate": 4.878996377861367e-07,
"logits/chosen": -0.04264168441295624,
"logits/rejected": -0.09717553108930588,
"logps/chosen": -374.21063232421875,
"logps/rejected": -424.62701416015625,
"loss": 0.5366,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0093395709991455,
"rewards/margins": 0.43130987882614136,
"rewards/rejected": -1.4406496286392212,
"step": 790
},
{
"epoch": 0.19193857965451055,
"grad_norm": 34.09209485411543,
"learning_rate": 4.872477737578327e-07,
"logits/chosen": -0.023031553253531456,
"logits/rejected": -0.03800968453288078,
"logps/chosen": -431.33782958984375,
"logps/rejected": -534.5132446289062,
"loss": 0.5265,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.8465608358383179,
"rewards/margins": 1.0085922479629517,
"rewards/rejected": -1.8551530838012695,
"step": 800
},
{
"epoch": 0.19433781190019195,
"grad_norm": 44.78458025907374,
"learning_rate": 4.865792677407718e-07,
"logits/chosen": -0.09794610738754272,
"logits/rejected": -0.08297122269868851,
"logps/chosen": -404.9684143066406,
"logps/rejected": -423.43896484375,
"loss": 0.5785,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8548682928085327,
"rewards/margins": 0.37679168581962585,
"rewards/rejected": -1.2316598892211914,
"step": 810
},
{
"epoch": 0.1967370441458733,
"grad_norm": 37.30397170950818,
"learning_rate": 4.858941666279955e-07,
"logits/chosen": -0.20108501613140106,
"logits/rejected": -0.15580318868160248,
"logps/chosen": -440.3353576660156,
"logps/rejected": -437.89337158203125,
"loss": 0.5843,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.7743014693260193,
"rewards/margins": 0.3654334843158722,
"rewards/rejected": -1.1397349834442139,
"step": 820
},
{
"epoch": 0.1991362763915547,
"grad_norm": 37.97880335267858,
"learning_rate": 4.851925184766247e-07,
"logits/chosen": -0.07934032380580902,
"logits/rejected": -0.06675902754068375,
"logps/chosen": -400.7498779296875,
"logps/rejected": -435.03387451171875,
"loss": 0.5744,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8046972155570984,
"rewards/margins": 0.6158983707427979,
"rewards/rejected": -1.4205955266952515,
"step": 830
},
{
"epoch": 0.20153550863723607,
"grad_norm": 34.877131137485314,
"learning_rate": 4.844743725044897e-07,
"logits/chosen": -0.1209510788321495,
"logits/rejected": -0.12060485780239105,
"logps/chosen": -390.33575439453125,
"logps/rejected": -407.412841796875,
"loss": 0.5509,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7886186838150024,
"rewards/margins": 0.5530378222465515,
"rewards/rejected": -1.3416564464569092,
"step": 840
},
{
"epoch": 0.20393474088291746,
"grad_norm": 40.19277289158246,
"learning_rate": 4.837397790866774e-07,
"logits/chosen": -0.07084405422210693,
"logits/rejected": -0.10281334072351456,
"logps/chosen": -429.7625427246094,
"logps/rejected": -490.245361328125,
"loss": 0.5474,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -0.7108091115951538,
"rewards/margins": 0.9831811785697937,
"rewards/rejected": -1.6939903497695923,
"step": 850
},
{
"epoch": 0.20633397312859886,
"grad_norm": 35.43380411461513,
"learning_rate": 4.829887897519974e-07,
"logits/chosen": 0.014303353615105152,
"logits/rejected": -0.007743634283542633,
"logps/chosen": -381.1875,
"logps/rejected": -453.6239318847656,
"loss": 0.5809,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8310438990592957,
"rewards/margins": 0.5285369157791138,
"rewards/rejected": -1.3595808744430542,
"step": 860
},
{
"epoch": 0.20873320537428022,
"grad_norm": 30.773099092132018,
"learning_rate": 4.82221457179368e-07,
"logits/chosen": 0.005006339401006699,
"logits/rejected": -0.01996953971683979,
"logps/chosen": -400.3504638671875,
"logps/rejected": -444.50653076171875,
"loss": 0.5516,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.6430622339248657,
"rewards/margins": 0.6689059138298035,
"rewards/rejected": -1.3119680881500244,
"step": 870
},
{
"epoch": 0.21113243761996162,
"grad_norm": 30.941839797295746,
"learning_rate": 4.814378351941206e-07,
"logits/chosen": -0.03190199285745621,
"logits/rejected": -0.032009296119213104,
"logps/chosen": -378.9139099121094,
"logps/rejected": -410.24896240234375,
"loss": 0.5687,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.6023445129394531,
"rewards/margins": 0.4468112885951996,
"rewards/rejected": -1.049155831336975,
"step": 880
},
{
"epoch": 0.21353166986564298,
"grad_norm": 30.2018268544055,
"learning_rate": 4.806379787642241e-07,
"logits/chosen": 0.03415294736623764,
"logits/rejected": -0.008319585584104061,
"logps/chosen": -373.29327392578125,
"logps/rejected": -426.83856201171875,
"loss": 0.6009,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.6300404071807861,
"rewards/margins": 0.5418477058410645,
"rewards/rejected": -1.171887993812561,
"step": 890
},
{
"epoch": 0.21593090211132437,
"grad_norm": 30.69767076541483,
"learning_rate": 4.798219439964293e-07,
"logits/chosen": -0.022300051525235176,
"logits/rejected": -0.07942859828472137,
"logps/chosen": -382.47088623046875,
"logps/rejected": -428.5862731933594,
"loss": 0.5421,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.79583740234375,
"rewards/margins": 0.3617878556251526,
"rewards/rejected": -1.1576253175735474,
"step": 900
},
{
"epoch": 0.21833013435700577,
"grad_norm": 64.36072526993395,
"learning_rate": 4.78989788132333e-07,
"logits/chosen": -0.07167644053697586,
"logits/rejected": -0.07725416123867035,
"logps/chosen": -351.6461181640625,
"logps/rejected": -432.82916259765625,
"loss": 0.5306,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7295014262199402,
"rewards/margins": 0.7431732416152954,
"rewards/rejected": -1.4726746082305908,
"step": 910
},
{
"epoch": 0.22072936660268713,
"grad_norm": 31.37167746375858,
"learning_rate": 4.781415695443631e-07,
"logits/chosen": 0.07153941690921783,
"logits/rejected": 0.1024637222290039,
"logps/chosen": -490.06402587890625,
"logps/rejected": -516.02294921875,
"loss": 0.5824,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.3680822849273682,
"rewards/margins": 0.20954158902168274,
"rewards/rejected": -1.5776238441467285,
"step": 920
},
{
"epoch": 0.22312859884836853,
"grad_norm": 29.004390037425598,
"learning_rate": 4.772773477316836e-07,
"logits/chosen": 0.03397312015295029,
"logits/rejected": 0.03711385652422905,
"logps/chosen": -467.2877502441406,
"logps/rejected": -509.09716796875,
"loss": 0.5473,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1240530014038086,
"rewards/margins": 0.45423418283462524,
"rewards/rejected": -1.578287124633789,
"step": 930
},
{
"epoch": 0.2255278310940499,
"grad_norm": 41.30297632121209,
"learning_rate": 4.7639718331602117e-07,
"logits/chosen": 0.08684961497783661,
"logits/rejected": 0.05986959859728813,
"logps/chosen": -420.59814453125,
"logps/rejected": -490.409912109375,
"loss": 0.5313,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9049364924430847,
"rewards/margins": 0.7652468681335449,
"rewards/rejected": -1.6701834201812744,
"step": 940
},
{
"epoch": 0.22792706333973128,
"grad_norm": 39.96596996172262,
"learning_rate": 4.7550113803741275e-07,
"logits/chosen": 0.13893774151802063,
"logits/rejected": 0.16821300983428955,
"logps/chosen": -432.7577209472656,
"logps/rejected": -411.2371520996094,
"loss": 0.5632,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9445673823356628,
"rewards/margins": 0.5061396360397339,
"rewards/rejected": -1.450706958770752,
"step": 950
},
{
"epoch": 0.23032629558541268,
"grad_norm": 35.26144835245691,
"learning_rate": 4.7458927474987454e-07,
"logits/chosen": 0.13862411677837372,
"logits/rejected": 0.17463508248329163,
"logps/chosen": -470.4085998535156,
"logps/rejected": -434.1971740722656,
"loss": 0.5406,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.8276304006576538,
"rewards/margins": 0.33768096566200256,
"rewards/rejected": -1.1653112173080444,
"step": 960
},
{
"epoch": 0.23272552783109404,
"grad_norm": 34.457370464422794,
"learning_rate": 4.7366165741699347e-07,
"logits/chosen": 0.06780462712049484,
"logits/rejected": 0.033076416701078415,
"logps/chosen": -474.2489318847656,
"logps/rejected": -491.00421142578125,
"loss": 0.5508,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -0.8541940450668335,
"rewards/margins": 0.41960257291793823,
"rewards/rejected": -1.2737966775894165,
"step": 970
},
{
"epoch": 0.23512476007677544,
"grad_norm": 50.081994664008306,
"learning_rate": 4.727183511074401e-07,
"logits/chosen": 0.12627606093883514,
"logits/rejected": 0.1392831802368164,
"logps/chosen": -422.95947265625,
"logps/rejected": -465.1092834472656,
"loss": 0.5446,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -0.8909593820571899,
"rewards/margins": 0.47495001554489136,
"rewards/rejected": -1.3659093379974365,
"step": 980
},
{
"epoch": 0.2375239923224568,
"grad_norm": 33.56933991120958,
"learning_rate": 4.717594219904043e-07,
"logits/chosen": 0.11548285186290741,
"logits/rejected": 0.17751248180866241,
"logps/chosen": -428.980224609375,
"logps/rejected": -429.6700134277344,
"loss": 0.553,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9127674102783203,
"rewards/margins": 0.5175878405570984,
"rewards/rejected": -1.4303553104400635,
"step": 990
},
{
"epoch": 0.2399232245681382,
"grad_norm": 38.76253931692222,
"learning_rate": 4.7078493733095393e-07,
"logits/chosen": 0.07841446250677109,
"logits/rejected": 0.07714001089334488,
"logps/chosen": -396.0744323730469,
"logps/rejected": -459.9576721191406,
"loss": 0.5356,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.7617571353912354,
"rewards/margins": 0.5821165442466736,
"rewards/rejected": -1.3438737392425537,
"step": 1000
},
{
"epoch": 0.2423224568138196,
"grad_norm": 40.40221370647514,
"learning_rate": 4.6979496548531614e-07,
"logits/chosen": 0.282027930021286,
"logits/rejected": 0.2300875186920166,
"logps/chosen": -417.5662536621094,
"logps/rejected": -517.0253295898438,
"loss": 0.556,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0011545419692993,
"rewards/margins": 0.4213111400604248,
"rewards/rejected": -1.4224655628204346,
"step": 1010
},
{
"epoch": 0.24472168905950095,
"grad_norm": 38.63551061711667,
"learning_rate": 4.6878957589608293e-07,
"logits/chosen": 0.15491922199726105,
"logits/rejected": 0.10176967084407806,
"logps/chosen": -423.72412109375,
"logps/rejected": -521.2840576171875,
"loss": 0.5489,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.8717803955078125,
"rewards/margins": 0.6004728078842163,
"rewards/rejected": -1.4722532033920288,
"step": 1020
},
{
"epoch": 0.24712092130518235,
"grad_norm": 33.04785124844753,
"learning_rate": 4.6776883908733956e-07,
"logits/chosen": 0.3141445815563202,
"logits/rejected": 0.40079420804977417,
"logps/chosen": -444.20037841796875,
"logps/rejected": -440.73992919921875,
"loss": 0.5367,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -0.9564323425292969,
"rewards/margins": 0.6285899877548218,
"rewards/rejected": -1.5850223302841187,
"step": 1030
},
{
"epoch": 0.2495201535508637,
"grad_norm": 56.89197225086117,
"learning_rate": 4.667328266597178e-07,
"logits/chosen": 0.32467955350875854,
"logits/rejected": 0.3737574815750122,
"logps/chosen": -425.14764404296875,
"logps/rejected": -474.8291015625,
"loss": 0.5182,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9110026359558105,
"rewards/margins": 0.6014169454574585,
"rewards/rejected": -1.5124194622039795,
"step": 1040
},
{
"epoch": 0.2519193857965451,
"grad_norm": 42.632075100473685,
"learning_rate": 4.6568161128537354e-07,
"logits/chosen": 0.23409466445446014,
"logits/rejected": 0.4846338629722595,
"logps/chosen": -437.87469482421875,
"logps/rejected": -416.75433349609375,
"loss": 0.5475,
"rewards/accuracies": 0.44999998807907104,
"rewards/chosen": -1.0662238597869873,
"rewards/margins": 0.3336094319820404,
"rewards/rejected": -1.3998332023620605,
"step": 1050
},
{
"epoch": 0.2543186180422265,
"grad_norm": 46.72130069794758,
"learning_rate": 4.6461526670288877e-07,
"logits/chosen": 0.4986523687839508,
"logits/rejected": 0.5356402397155762,
"logps/chosen": -453.13543701171875,
"logps/rejected": -487.8929748535156,
"loss": 0.5798,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0444475412368774,
"rewards/margins": 0.7141000032424927,
"rewards/rejected": -1.7585475444793701,
"step": 1060
},
{
"epoch": 0.2567178502879079,
"grad_norm": 30.901707992623376,
"learning_rate": 4.635338677120994e-07,
"logits/chosen": 0.6319410800933838,
"logits/rejected": 0.5878476500511169,
"logps/chosen": -435.103271484375,
"logps/rejected": -522.3030395507812,
"loss": 0.5107,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.001800775527954,
"rewards/margins": 0.7891290187835693,
"rewards/rejected": -1.7909300327301025,
"step": 1070
},
{
"epoch": 0.2591170825335892,
"grad_norm": 35.62993699091359,
"learning_rate": 4.6243749016884835e-07,
"logits/chosen": 0.6645074486732483,
"logits/rejected": 0.6307970285415649,
"logps/chosen": -460.1568298339844,
"logps/rejected": -596.9638671875,
"loss": 0.554,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2296950817108154,
"rewards/margins": 0.8476268649101257,
"rewards/rejected": -2.077322006225586,
"step": 1080
},
{
"epoch": 0.2615163147792706,
"grad_norm": 55.48184293718509,
"learning_rate": 4.613262109796645e-07,
"logits/chosen": 0.5279312133789062,
"logits/rejected": 0.44912824034690857,
"logps/chosen": -445.187744140625,
"logps/rejected": -569.0379028320312,
"loss": 0.5184,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.0215142965316772,
"rewards/margins": 0.903215765953064,
"rewards/rejected": -1.9247299432754517,
"step": 1090
},
{
"epoch": 0.263915547024952,
"grad_norm": 35.82390141377677,
"learning_rate": 4.602001080963678e-07,
"logits/chosen": 0.5199450254440308,
"logits/rejected": 0.580736517906189,
"logps/chosen": -457.9462890625,
"logps/rejected": -484.3863830566406,
"loss": 0.5432,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.047241449356079,
"rewards/margins": 0.6471258997917175,
"rewards/rejected": -1.6943671703338623,
"step": 1100
},
{
"epoch": 0.2663147792706334,
"grad_norm": 51.55318372805118,
"learning_rate": 4.590592605106017e-07,
"logits/chosen": 0.34312915802001953,
"logits/rejected": 0.3462589979171753,
"logps/chosen": -462.97137451171875,
"logps/rejected": -475.85235595703125,
"loss": 0.5757,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8622667193412781,
"rewards/margins": 0.5276774168014526,
"rewards/rejected": -1.389944076538086,
"step": 1110
},
{
"epoch": 0.2687140115163148,
"grad_norm": 46.55805600175398,
"learning_rate": 4.5790374824829165e-07,
"logits/chosen": 0.5497294068336487,
"logits/rejected": 0.5141938924789429,
"logps/chosen": -329.0898132324219,
"logps/rejected": -395.189208984375,
"loss": 0.5305,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.8122035264968872,
"rewards/margins": 0.555601179599762,
"rewards/rejected": -1.367804765701294,
"step": 1120
},
{
"epoch": 0.27111324376199614,
"grad_norm": 41.30209061097155,
"learning_rate": 4.5673365236403216e-07,
"logits/chosen": 0.5173945426940918,
"logits/rejected": 0.538547158241272,
"logps/chosen": -337.64508056640625,
"logps/rejected": -434.7604064941406,
"loss": 0.5406,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.6650754809379578,
"rewards/margins": 0.7895157337188721,
"rewards/rejected": -1.454591155052185,
"step": 1130
},
{
"epoch": 0.27351247600767753,
"grad_norm": 35.39722075486902,
"learning_rate": 4.5554905493540075e-07,
"logits/chosen": 0.7431238293647766,
"logits/rejected": 0.720431923866272,
"logps/chosen": -369.39825439453125,
"logps/rejected": -469.0665588378906,
"loss": 0.4965,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8106037974357605,
"rewards/margins": 0.9515643119812012,
"rewards/rejected": -1.762168288230896,
"step": 1140
},
{
"epoch": 0.2759117082533589,
"grad_norm": 80.31107636026294,
"learning_rate": 4.5435003905720074e-07,
"logits/chosen": 0.6994370222091675,
"logits/rejected": 0.7717106938362122,
"logps/chosen": -467.26922607421875,
"logps/rejected": -501.1815490722656,
"loss": 0.5376,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.1343969106674194,
"rewards/margins": 0.7365877628326416,
"rewards/rejected": -1.870984673500061,
"step": 1150
},
{
"epoch": 0.2783109404990403,
"grad_norm": 50.544256487524144,
"learning_rate": 4.531366888356324e-07,
"logits/chosen": 0.604827880859375,
"logits/rejected": 0.5405411720275879,
"logps/chosen": -349.20196533203125,
"logps/rejected": -467.71160888671875,
"loss": 0.5206,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9413111805915833,
"rewards/margins": 0.8840651512145996,
"rewards/rejected": -1.8253761529922485,
"step": 1160
},
{
"epoch": 0.2807101727447217,
"grad_norm": 48.679810649088054,
"learning_rate": 4.519090893823931e-07,
"logits/chosen": 0.7196705341339111,
"logits/rejected": 0.7503910660743713,
"logps/chosen": -434.93377685546875,
"logps/rejected": -479.38836669921875,
"loss": 0.5375,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1369459629058838,
"rewards/margins": 0.6235243678092957,
"rewards/rejected": -1.7604703903198242,
"step": 1170
},
{
"epoch": 0.28310940499040305,
"grad_norm": 42.39683927792113,
"learning_rate": 4.5066732680870734e-07,
"logits/chosen": 0.7495613694190979,
"logits/rejected": 0.7793896794319153,
"logps/chosen": -413.90557861328125,
"logps/rejected": -447.45452880859375,
"loss": 0.5181,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0040034055709839,
"rewards/margins": 0.8347917795181274,
"rewards/rejected": -1.8387953042984009,
"step": 1180
},
{
"epoch": 0.28550863723608444,
"grad_norm": 53.13082069754931,
"learning_rate": 4.494114882192862e-07,
"logits/chosen": 0.4293566644191742,
"logits/rejected": 0.44527220726013184,
"logps/chosen": -425.08538818359375,
"logps/rejected": -490.40765380859375,
"loss": 0.504,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9455874562263489,
"rewards/margins": 1.0670359134674072,
"rewards/rejected": -2.0126233100891113,
"step": 1190
},
{
"epoch": 0.28790786948176583,
"grad_norm": 49.87269165648676,
"learning_rate": 4.4814166170621735e-07,
"logits/chosen": 0.6792656183242798,
"logits/rejected": 0.6856303811073303,
"logps/chosen": -430.4750061035156,
"logps/rejected": -503.75634765625,
"loss": 0.5405,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.135371446609497,
"rewards/margins": 1.0104650259017944,
"rewards/rejected": -2.145836591720581,
"step": 1200
},
{
"epoch": 0.2903071017274472,
"grad_norm": 37.72444617932776,
"learning_rate": 4.468579363427858e-07,
"logits/chosen": 0.41752809286117554,
"logits/rejected": 0.4529293477535248,
"logps/chosen": -450.9541931152344,
"logps/rejected": -478.5503845214844,
"loss": 0.554,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3859989643096924,
"rewards/margins": 0.5862727165222168,
"rewards/rejected": -1.9722716808319092,
"step": 1210
},
{
"epoch": 0.2927063339731286,
"grad_norm": 49.47973747014418,
"learning_rate": 4.4556040217722555e-07,
"logits/chosen": 0.6199735403060913,
"logits/rejected": 0.5173524618148804,
"logps/chosen": -390.5383605957031,
"logps/rejected": -529.1012573242188,
"loss": 0.5218,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9236103892326355,
"rewards/margins": 0.9195195436477661,
"rewards/rejected": -1.8431298732757568,
"step": 1220
},
{
"epoch": 0.29510556621880996,
"grad_norm": 43.77387295728714,
"learning_rate": 4.442491502264033e-07,
"logits/chosen": 0.5372000932693481,
"logits/rejected": 0.5111404061317444,
"logps/chosen": -398.32928466796875,
"logps/rejected": -427.90142822265625,
"loss": 0.5579,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.1015335321426392,
"rewards/margins": 0.36569902300834656,
"rewards/rejected": -1.467232584953308,
"step": 1230
},
{
"epoch": 0.29750479846449135,
"grad_norm": 35.2179506302823,
"learning_rate": 4.429242724694338e-07,
"logits/chosen": 0.596865177154541,
"logits/rejected": 0.5551019906997681,
"logps/chosen": -403.04803466796875,
"logps/rejected": -482.10455322265625,
"loss": 0.5353,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8973898887634277,
"rewards/margins": 0.7202876806259155,
"rewards/rejected": -1.6176776885986328,
"step": 1240
},
{
"epoch": 0.29990403071017274,
"grad_norm": 35.54165989722752,
"learning_rate": 4.4158586184122817e-07,
"logits/chosen": 0.6986425518989563,
"logits/rejected": 0.7786028385162354,
"logps/chosen": -455.0581970214844,
"logps/rejected": -487.45220947265625,
"loss": 0.5169,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9782212376594543,
"rewards/margins": 0.773512065410614,
"rewards/rejected": -1.751733422279358,
"step": 1250
},
{
"epoch": 0.30230326295585414,
"grad_norm": 34.976845816469115,
"learning_rate": 4.4023401222597443e-07,
"logits/chosen": 0.5812339782714844,
"logits/rejected": 0.6533055305480957,
"logps/chosen": -456.7413635253906,
"logps/rejected": -492.81500244140625,
"loss": 0.4972,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0948355197906494,
"rewards/margins": 0.6430230140686035,
"rewards/rejected": -1.737858533859253,
"step": 1260
},
{
"epoch": 0.30470249520153553,
"grad_norm": 52.15846550296518,
"learning_rate": 4.3886881845055235e-07,
"logits/chosen": 0.6851636171340942,
"logits/rejected": 0.7039676904678345,
"logps/chosen": -395.2878723144531,
"logps/rejected": -475.46319580078125,
"loss": 0.5178,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.8985649347305298,
"rewards/margins": 0.9367402195930481,
"rewards/rejected": -1.8353052139282227,
"step": 1270
},
{
"epoch": 0.30710172744721687,
"grad_norm": 35.79256631055672,
"learning_rate": 4.374903762778814e-07,
"logits/chosen": 0.6985992193222046,
"logits/rejected": 0.6866432428359985,
"logps/chosen": -429.9559020996094,
"logps/rejected": -467.31231689453125,
"loss": 0.5222,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0913515090942383,
"rewards/margins": 0.6997131109237671,
"rewards/rejected": -1.7910646200180054,
"step": 1280
},
{
"epoch": 0.30950095969289826,
"grad_norm": 68.77406798145645,
"learning_rate": 4.3609878240020356e-07,
"logits/chosen": 0.45225849747657776,
"logits/rejected": 0.5497337579727173,
"logps/chosen": -510.07659912109375,
"logps/rejected": -510.8426208496094,
"loss": 0.5356,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.366571307182312,
"rewards/margins": 0.6978545784950256,
"rewards/rejected": -2.0644257068634033,
"step": 1290
},
{
"epoch": 0.31190019193857965,
"grad_norm": 36.68902407720006,
"learning_rate": 4.346941344323005e-07,
"logits/chosen": 0.585986316204071,
"logits/rejected": 0.6672986745834351,
"logps/chosen": -437.39324951171875,
"logps/rejected": -430.4087829589844,
"loss": 0.5562,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.3243210315704346,
"rewards/margins": 0.4856715798377991,
"rewards/rejected": -1.8099925518035889,
"step": 1300
},
{
"epoch": 0.31429942418426104,
"grad_norm": 38.77370809872286,
"learning_rate": 4.332765309046467e-07,
"logits/chosen": 0.7318406105041504,
"logits/rejected": 0.7771567106246948,
"logps/chosen": -450.19427490234375,
"logps/rejected": -471.3865661621094,
"loss": 0.5554,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.116821527481079,
"rewards/margins": 0.6649090051651001,
"rewards/rejected": -1.7817304134368896,
"step": 1310
},
{
"epoch": 0.31669865642994244,
"grad_norm": 45.82746891169888,
"learning_rate": 4.3184607125649754e-07,
"logits/chosen": 0.49596285820007324,
"logits/rejected": 0.5003286600112915,
"logps/chosen": -430.02996826171875,
"logps/rejected": -527.7648315429688,
"loss": 0.5201,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8098013997077942,
"rewards/margins": 0.9345352053642273,
"rewards/rejected": -1.744336485862732,
"step": 1320
},
{
"epoch": 0.3190978886756238,
"grad_norm": 37.74246685501154,
"learning_rate": 4.304028558289141e-07,
"logits/chosen": 0.38717252016067505,
"logits/rejected": 0.39220350980758667,
"logps/chosen": -416.00494384765625,
"logps/rejected": -460.89739990234375,
"loss": 0.4969,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7099177241325378,
"rewards/margins": 0.6884833574295044,
"rewards/rejected": -1.3984010219573975,
"step": 1330
},
{
"epoch": 0.32149712092130517,
"grad_norm": 34.11722436437858,
"learning_rate": 4.28946985857725e-07,
"logits/chosen": 0.5080984234809875,
"logits/rejected": 0.4866611063480377,
"logps/chosen": -444.2494201660156,
"logps/rejected": -542.6705932617188,
"loss": 0.4997,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.9977186918258667,
"rewards/margins": 1.2409141063690186,
"rewards/rejected": -2.2386326789855957,
"step": 1340
},
{
"epoch": 0.32389635316698656,
"grad_norm": 38.18058435916063,
"learning_rate": 4.2747856346642445e-07,
"logits/chosen": 0.4064346253871918,
"logits/rejected": 0.4254288077354431,
"logps/chosen": -389.40472412109375,
"logps/rejected": -465.85906982421875,
"loss": 0.4983,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9736809730529785,
"rewards/margins": 0.8628204464912415,
"rewards/rejected": -1.8365013599395752,
"step": 1350
},
{
"epoch": 0.32629558541266795,
"grad_norm": 45.560969124424204,
"learning_rate": 4.2599769165900933e-07,
"logits/chosen": 0.4976237714290619,
"logits/rejected": 0.4918050765991211,
"logps/chosen": -478.25140380859375,
"logps/rejected": -495.969482421875,
"loss": 0.5657,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.5931600332260132,
"rewards/margins": 0.48865580558776855,
"rewards/rejected": -2.0818159580230713,
"step": 1360
},
{
"epoch": 0.32869481765834935,
"grad_norm": 35.94258808540943,
"learning_rate": 4.245044743127535e-07,
"logits/chosen": 0.5548725128173828,
"logits/rejected": 0.46006709337234497,
"logps/chosen": -428.947021484375,
"logps/rejected": -524.8760986328125,
"loss": 0.5207,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1485120058059692,
"rewards/margins": 0.7874538898468018,
"rewards/rejected": -1.9359657764434814,
"step": 1370
},
{
"epoch": 0.3310940499040307,
"grad_norm": 42.265392991866655,
"learning_rate": 4.229990161709214e-07,
"logits/chosen": 0.547171950340271,
"logits/rejected": 0.4217755198478699,
"logps/chosen": -401.33447265625,
"logps/rejected": -532.9482421875,
"loss": 0.5491,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.8861383199691772,
"rewards/margins": 1.117545485496521,
"rewards/rejected": -2.0036838054656982,
"step": 1380
},
{
"epoch": 0.3334932821497121,
"grad_norm": 32.39940263140558,
"learning_rate": 4.214814228354204e-07,
"logits/chosen": 0.4310382008552551,
"logits/rejected": 0.47493353486061096,
"logps/chosen": -467.65216064453125,
"logps/rejected": -549.2056884765625,
"loss": 0.5295,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1547861099243164,
"rewards/margins": 1.1499736309051514,
"rewards/rejected": -2.304759979248047,
"step": 1390
},
{
"epoch": 0.33589251439539347,
"grad_norm": 37.76328528326702,
"learning_rate": 4.1995180075939375e-07,
"logits/chosen": 0.6290279626846313,
"logits/rejected": 0.5864508748054504,
"logps/chosen": -488.428466796875,
"logps/rejected": -547.3450927734375,
"loss": 0.5114,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2981139421463013,
"rewards/margins": 0.8224126100540161,
"rewards/rejected": -2.1205263137817383,
"step": 1400
},
{
"epoch": 0.33829174664107486,
"grad_norm": 41.27479960235454,
"learning_rate": 4.1841025723975297e-07,
"logits/chosen": 0.42811208963394165,
"logits/rejected": 0.41358089447021484,
"logps/chosen": -445.8192443847656,
"logps/rejected": -505.38653564453125,
"loss": 0.4953,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9554083943367004,
"rewards/margins": 0.7210197448730469,
"rewards/rejected": -1.676428198814392,
"step": 1410
},
{
"epoch": 0.34069097888675626,
"grad_norm": 37.87482486935292,
"learning_rate": 4.168569004096516e-07,
"logits/chosen": 0.4879208207130432,
"logits/rejected": 0.37299996614456177,
"logps/chosen": -421.3837890625,
"logps/rejected": -540.9444580078125,
"loss": 0.4969,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2045072317123413,
"rewards/margins": 1.0760588645935059,
"rewards/rejected": -2.2805662155151367,
"step": 1420
},
{
"epoch": 0.3430902111324376,
"grad_norm": 34.143233451160405,
"learning_rate": 4.152918392308997e-07,
"logits/chosen": 0.4631095826625824,
"logits/rejected": 0.44977670907974243,
"logps/chosen": -420.1924743652344,
"logps/rejected": -457.46673583984375,
"loss": 0.4946,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1693400144577026,
"rewards/margins": 0.540154218673706,
"rewards/rejected": -1.7094943523406982,
"step": 1430
},
{
"epoch": 0.345489443378119,
"grad_norm": 79.72399784718598,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": 0.30308836698532104,
"logits/rejected": 0.19191868603229523,
"logps/chosen": -421.8958435058594,
"logps/rejected": -534.3394775390625,
"loss": 0.5464,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1978578567504883,
"rewards/margins": 0.8276159167289734,
"rewards/rejected": -2.0254738330841064,
"step": 1440
},
{
"epoch": 0.3478886756238004,
"grad_norm": 53.81472585528722,
"learning_rate": 4.121270437720526e-07,
"logits/chosen": 0.2503531575202942,
"logits/rejected": 0.20632532238960266,
"logps/chosen": -388.9275817871094,
"logps/rejected": -504.99627685546875,
"loss": 0.5436,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.158739447593689,
"rewards/margins": 0.6908172965049744,
"rewards/rejected": -1.8495569229125977,
"step": 1450
},
{
"epoch": 0.3502879078694818,
"grad_norm": 45.6784642712931,
"learning_rate": 4.105275314897852e-07,
"logits/chosen": 0.48888054490089417,
"logits/rejected": 0.3766574263572693,
"logps/chosen": -397.5686340332031,
"logps/rejected": -535.5113525390625,
"loss": 0.5107,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.021269679069519,
"rewards/margins": 1.1115610599517822,
"rewards/rejected": -2.1328306198120117,
"step": 1460
},
{
"epoch": 0.35268714011516317,
"grad_norm": 42.50127277305204,
"learning_rate": 4.089167588389508e-07,
"logits/chosen": 0.35595473647117615,
"logits/rejected": 0.4420366883277893,
"logps/chosen": -525.4200439453125,
"logps/rejected": -575.6399536132812,
"loss": 0.5315,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.174477219581604,
"rewards/margins": 0.9557849168777466,
"rewards/rejected": -2.1302618980407715,
"step": 1470
},
{
"epoch": 0.3550863723608445,
"grad_norm": 66.9134896066362,
"learning_rate": 4.072948388088515e-07,
"logits/chosen": 0.4660380482673645,
"logits/rejected": 0.48526984453201294,
"logps/chosen": -472.48773193359375,
"logps/rejected": -540.60546875,
"loss": 0.5512,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.309525966644287,
"rewards/margins": 0.729021430015564,
"rewards/rejected": -2.0385475158691406,
"step": 1480
},
{
"epoch": 0.3574856046065259,
"grad_norm": 48.14455914875948,
"learning_rate": 4.056618851707334e-07,
"logits/chosen": 0.3936781585216522,
"logits/rejected": 0.37658897042274475,
"logps/chosen": -417.9375915527344,
"logps/rejected": -505.556396484375,
"loss": 0.4962,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.8807679414749146,
"rewards/margins": 0.8251503109931946,
"rewards/rejected": -1.7059180736541748,
"step": 1490
},
{
"epoch": 0.3598848368522073,
"grad_norm": 44.71962261776299,
"learning_rate": 4.0401801246980675e-07,
"logits/chosen": 0.2104732245206833,
"logits/rejected": 0.22102966904640198,
"logps/chosen": -413.0994567871094,
"logps/rejected": -452.8529357910156,
"loss": 0.5258,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.222390055656433,
"rewards/margins": 0.6135789155960083,
"rewards/rejected": -1.8359689712524414,
"step": 1500
},
{
"epoch": 0.3622840690978887,
"grad_norm": 38.59038142711945,
"learning_rate": 4.0236333601721043e-07,
"logits/chosen": 0.36115556955337524,
"logits/rejected": 0.27192938327789307,
"logps/chosen": -518.89306640625,
"logps/rejected": -567.1900024414062,
"loss": 0.5451,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.524860143661499,
"rewards/margins": 0.5048703551292419,
"rewards/rejected": -2.0297303199768066,
"step": 1510
},
{
"epoch": 0.3646833013435701,
"grad_norm": 48.99560916590031,
"learning_rate": 4.0069797188192364e-07,
"logits/chosen": 0.2493390589952469,
"logits/rejected": 0.25582900643348694,
"logps/chosen": -457.5439453125,
"logps/rejected": -511.53466796875,
"loss": 0.5255,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0995330810546875,
"rewards/margins": 0.8029910326004028,
"rewards/rejected": -1.9025242328643799,
"step": 1520
},
{
"epoch": 0.3670825335892514,
"grad_norm": 43.28635521609486,
"learning_rate": 3.9902203688262417e-07,
"logits/chosen": 0.24590995907783508,
"logits/rejected": 0.2573690414428711,
"logps/chosen": -447.503173828125,
"logps/rejected": -495.9049377441406,
"loss": 0.5052,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1626179218292236,
"rewards/margins": 0.7147835493087769,
"rewards/rejected": -1.87740159034729,
"step": 1530
},
{
"epoch": 0.3694817658349328,
"grad_norm": 75.4063331165295,
"learning_rate": 3.9733564857949365e-07,
"logits/chosen": 0.36004549264907837,
"logits/rejected": 0.39339983463287354,
"logps/chosen": -538.8134765625,
"logps/rejected": -569.4513549804688,
"loss": 0.5428,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.430328369140625,
"rewards/margins": 0.773267388343811,
"rewards/rejected": -2.2035956382751465,
"step": 1540
},
{
"epoch": 0.3718809980806142,
"grad_norm": 47.00943225874421,
"learning_rate": 3.9563892526597177e-07,
"logits/chosen": 0.38262271881103516,
"logits/rejected": 0.3127327561378479,
"logps/chosen": -405.52008056640625,
"logps/rejected": -523.7188720703125,
"loss": 0.5275,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2701631784439087,
"rewards/margins": 0.673926591873169,
"rewards/rejected": -1.944089651107788,
"step": 1550
},
{
"epoch": 0.3742802303262956,
"grad_norm": 44.77491303021576,
"learning_rate": 3.9393198596045795e-07,
"logits/chosen": 0.2474546879529953,
"logits/rejected": 0.1317511945962906,
"logps/chosen": -421.62994384765625,
"logps/rejected": -519.5099487304688,
"loss": 0.5383,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2208540439605713,
"rewards/margins": 0.7671472430229187,
"rewards/rejected": -1.9880012273788452,
"step": 1560
},
{
"epoch": 0.376679462571977,
"grad_norm": 37.95179606415185,
"learning_rate": 3.922149503979628e-07,
"logits/chosen": 0.2700248658657074,
"logits/rejected": 0.21610090136528015,
"logps/chosen": -471.33056640625,
"logps/rejected": -593.8278198242188,
"loss": 0.5122,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2792729139328003,
"rewards/margins": 1.1995410919189453,
"rewards/rejected": -2.4788146018981934,
"step": 1570
},
{
"epoch": 0.3790786948176583,
"grad_norm": 55.896865397911,
"learning_rate": 3.904879390217095e-07,
"logits/chosen": 0.12995900213718414,
"logits/rejected": 0.12265945971012115,
"logps/chosen": -443.599365234375,
"logps/rejected": -492.68450927734375,
"loss": 0.528,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2007102966308594,
"rewards/margins": 0.6766700744628906,
"rewards/rejected": -1.87738037109375,
"step": 1580
},
{
"epoch": 0.3814779270633397,
"grad_norm": 49.93484321544338,
"learning_rate": 3.8875107297468463e-07,
"logits/chosen": 0.20564258098602295,
"logits/rejected": 0.0780414491891861,
"logps/chosen": -411.8665466308594,
"logps/rejected": -573.74951171875,
"loss": 0.5229,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.0018932819366455,
"rewards/margins": 1.2348394393920898,
"rewards/rejected": -2.2367329597473145,
"step": 1590
},
{
"epoch": 0.3838771593090211,
"grad_norm": 38.069521505621516,
"learning_rate": 3.87004474091141e-07,
"logits/chosen": 0.3447803258895874,
"logits/rejected": 0.3082936704158783,
"logps/chosen": -405.9560852050781,
"logps/rejected": -489.4607849121094,
"loss": 0.541,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.0926564931869507,
"rewards/margins": 0.7233616709709167,
"rewards/rejected": -1.8160178661346436,
"step": 1600
},
{
"epoch": 0.3862763915547025,
"grad_norm": 44.24824114407542,
"learning_rate": 3.8524826488805114e-07,
"logits/chosen": 0.3052324950695038,
"logits/rejected": 0.3181813657283783,
"logps/chosen": -473.97796630859375,
"logps/rejected": -500.7769470214844,
"loss": 0.547,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2699439525604248,
"rewards/margins": 0.7647022008895874,
"rewards/rejected": -2.0346462726593018,
"step": 1610
},
{
"epoch": 0.3886756238003839,
"grad_norm": 47.309718786937964,
"learning_rate": 3.834825685565133e-07,
"logits/chosen": 0.33559301495552063,
"logits/rejected": 0.3656995892524719,
"logps/chosen": -414.19256591796875,
"logps/rejected": -421.0203552246094,
"loss": 0.4953,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.065473198890686,
"rewards/margins": 0.5177011489868164,
"rewards/rejected": -1.5831743478775024,
"step": 1620
},
{
"epoch": 0.39107485604606523,
"grad_norm": 42.86172629937328,
"learning_rate": 3.8170750895311007e-07,
"logits/chosen": 0.1855572611093521,
"logits/rejected": 0.17679139971733093,
"logps/chosen": -452.2789001464844,
"logps/rejected": -500.349609375,
"loss": 0.4908,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -0.9900191426277161,
"rewards/margins": 0.7418977618217468,
"rewards/rejected": -1.7319167852401733,
"step": 1630
},
{
"epoch": 0.3934740882917466,
"grad_norm": 45.001223140761674,
"learning_rate": 3.7992321059122045e-07,
"logits/chosen": 0.2781444787979126,
"logits/rejected": 0.30307430028915405,
"logps/chosen": -414.05523681640625,
"logps/rejected": -462.14239501953125,
"loss": 0.5237,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1657673120498657,
"rewards/margins": 0.6909239888191223,
"rewards/rejected": -1.8566913604736328,
"step": 1640
},
{
"epoch": 0.395873320537428,
"grad_norm": 60.544325020503095,
"learning_rate": 3.7812979863228576e-07,
"logits/chosen": 0.2274487465620041,
"logits/rejected": 0.16551566123962402,
"logps/chosen": -405.3382873535156,
"logps/rejected": -493.6697692871094,
"loss": 0.4928,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2897754907608032,
"rewards/margins": 0.8161094784736633,
"rewards/rejected": -2.1058847904205322,
"step": 1650
},
{
"epoch": 0.3982725527831094,
"grad_norm": 50.449927443360075,
"learning_rate": 3.763273988770296e-07,
"logits/chosen": 0.40345683693885803,
"logits/rejected": 0.39551275968551636,
"logps/chosen": -453.79803466796875,
"logps/rejected": -535.8180541992188,
"loss": 0.4965,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3234025239944458,
"rewards/margins": 0.8552868962287903,
"rewards/rejected": -2.178689479827881,
"step": 1660
},
{
"epoch": 0.4006717850287908,
"grad_norm": 45.22606638463477,
"learning_rate": 3.7451613775663405e-07,
"logits/chosen": 0.2254648655653,
"logits/rejected": 0.15715382993221283,
"logps/chosen": -444.4361267089844,
"logps/rejected": -565.7696533203125,
"loss": 0.5436,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3264541625976562,
"rewards/margins": 1.2607003450393677,
"rewards/rejected": -2.5871543884277344,
"step": 1670
},
{
"epoch": 0.40307101727447214,
"grad_norm": 56.89213037695923,
"learning_rate": 3.726961423238706e-07,
"logits/chosen": 0.2933524250984192,
"logits/rejected": 0.212088942527771,
"logps/chosen": -426.76080322265625,
"logps/rejected": -546.6845703125,
"loss": 0.5149,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.2648175954818726,
"rewards/margins": 1.0558243989944458,
"rewards/rejected": -2.3206419944763184,
"step": 1680
},
{
"epoch": 0.40547024952015354,
"grad_norm": 48.93178210300578,
"learning_rate": 3.708675402441882e-07,
"logits/chosen": 0.2865277826786041,
"logits/rejected": 0.37102895975112915,
"logps/chosen": -484.88519287109375,
"logps/rejected": -502.8384704589844,
"loss": 0.5365,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2438859939575195,
"rewards/margins": 0.6120424270629883,
"rewards/rejected": -1.855928659439087,
"step": 1690
},
{
"epoch": 0.40786948176583493,
"grad_norm": 41.4789900308926,
"learning_rate": 3.6903045978675775e-07,
"logits/chosen": 0.3034370541572571,
"logits/rejected": 0.2625337243080139,
"logps/chosen": -386.1392517089844,
"logps/rejected": -470.08135986328125,
"loss": 0.5013,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9028989672660828,
"rewards/margins": 1.0875600576400757,
"rewards/rejected": -1.9904590845108032,
"step": 1700
},
{
"epoch": 0.4102687140115163,
"grad_norm": 35.84427094735192,
"learning_rate": 3.6718502981547474e-07,
"logits/chosen": 0.385175883769989,
"logits/rejected": 0.2869270443916321,
"logps/chosen": -436.753662109375,
"logps/rejected": -548.1320190429688,
"loss": 0.512,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1051702499389648,
"rewards/margins": 0.6421515345573425,
"rewards/rejected": -1.7473220825195312,
"step": 1710
},
{
"epoch": 0.4126679462571977,
"grad_norm": 36.76356281345392,
"learning_rate": 3.6533137977991986e-07,
"logits/chosen": 0.2681284248828888,
"logits/rejected": 0.27597135305404663,
"logps/chosen": -444.15826416015625,
"logps/rejected": -524.8231201171875,
"loss": 0.5344,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -0.9779054522514343,
"rewards/margins": 0.62675940990448,
"rewards/rejected": -1.6046650409698486,
"step": 1720
},
{
"epoch": 0.41506717850287905,
"grad_norm": 42.054568712185926,
"learning_rate": 3.6346963970627865e-07,
"logits/chosen": 0.3877958655357361,
"logits/rejected": 0.2975226044654846,
"logps/chosen": -420.9158630371094,
"logps/rejected": -515.4686889648438,
"loss": 0.5036,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0269657373428345,
"rewards/margins": 0.7597817182540894,
"rewards/rejected": -1.7867473363876343,
"step": 1730
},
{
"epoch": 0.41746641074856045,
"grad_norm": 44.183218072360475,
"learning_rate": 3.615999401882207e-07,
"logits/chosen": 0.5101007223129272,
"logits/rejected": 0.44053035974502563,
"logps/chosen": -388.8902282714844,
"logps/rejected": -512.8978881835938,
"loss": 0.5121,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2427517175674438,
"rewards/margins": 0.933813214302063,
"rewards/rejected": -2.1765646934509277,
"step": 1740
},
{
"epoch": 0.41986564299424184,
"grad_norm": 38.84095884357132,
"learning_rate": 3.597224123777389e-07,
"logits/chosen": 0.4116114675998688,
"logits/rejected": 0.3661612570285797,
"logps/chosen": -430.21990966796875,
"logps/rejected": -544.9900512695312,
"loss": 0.4882,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1606342792510986,
"rewards/margins": 1.0220921039581299,
"rewards/rejected": -2.1827263832092285,
"step": 1750
},
{
"epoch": 0.42226487523992323,
"grad_norm": 48.48200071110331,
"learning_rate": 3.5783718797595e-07,
"logits/chosen": 0.3250165581703186,
"logits/rejected": 0.41192755103111267,
"logps/chosen": -487.70404052734375,
"logps/rejected": -506.49530029296875,
"loss": 0.5418,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.321012020111084,
"rewards/margins": 0.7061235308647156,
"rewards/rejected": -2.0271353721618652,
"step": 1760
},
{
"epoch": 0.4246641074856046,
"grad_norm": 41.976087972205285,
"learning_rate": 3.559443992238558e-07,
"logits/chosen": 0.38490504026412964,
"logits/rejected": 0.3506616950035095,
"logps/chosen": -414.7301330566406,
"logps/rejected": -553.3479614257812,
"loss": 0.5277,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -0.9950034022331238,
"rewards/margins": 1.1345813274383545,
"rewards/rejected": -2.129584550857544,
"step": 1770
},
{
"epoch": 0.42706333973128596,
"grad_norm": 44.81649257476256,
"learning_rate": 3.540441788930673e-07,
"logits/chosen": 0.3962218165397644,
"logits/rejected": 0.325061172246933,
"logps/chosen": -467.65155029296875,
"logps/rejected": -539.2551879882812,
"loss": 0.4933,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1214022636413574,
"rewards/margins": 1.1308571100234985,
"rewards/rejected": -2.2522594928741455,
"step": 1780
},
{
"epoch": 0.42946257197696736,
"grad_norm": 45.2856769850179,
"learning_rate": 3.5213666027649123e-07,
"logits/chosen": 0.33266204595565796,
"logits/rejected": 0.3824441134929657,
"logps/chosen": -480.2119140625,
"logps/rejected": -476.08984375,
"loss": 0.5238,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3972989320755005,
"rewards/margins": 0.541749119758606,
"rewards/rejected": -1.9390478134155273,
"step": 1790
},
{
"epoch": 0.43186180422264875,
"grad_norm": 54.06455040727181,
"learning_rate": 3.5022197717898017e-07,
"logits/chosen": 0.19602210819721222,
"logits/rejected": 0.23719044029712677,
"logps/chosen": -394.2027587890625,
"logps/rejected": -459.33221435546875,
"loss": 0.4769,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.173718810081482,
"rewards/margins": 0.8884655833244324,
"rewards/rejected": -2.0621845722198486,
"step": 1800
},
{
"epoch": 0.43426103646833014,
"grad_norm": 36.13993495552892,
"learning_rate": 3.4830026390794633e-07,
"logits/chosen": 0.16905806958675385,
"logits/rejected": 0.14926643669605255,
"logps/chosen": -505.55126953125,
"logps/rejected": -551.2828979492188,
"loss": 0.4745,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3583290576934814,
"rewards/margins": 1.004029631614685,
"rewards/rejected": -2.362358570098877,
"step": 1810
},
{
"epoch": 0.43666026871401153,
"grad_norm": 32.25356254003183,
"learning_rate": 3.4637165526394104e-07,
"logits/chosen": 0.23928511142730713,
"logits/rejected": 0.22237971425056458,
"logps/chosen": -415.0269470214844,
"logps/rejected": -494.46405029296875,
"loss": 0.5278,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.063435435295105,
"rewards/margins": 0.7565540671348572,
"rewards/rejected": -1.819989562034607,
"step": 1820
},
{
"epoch": 0.43905950095969287,
"grad_norm": 34.4419746511506,
"learning_rate": 3.4443628653119814e-07,
"logits/chosen": 0.27581119537353516,
"logits/rejected": 0.24289298057556152,
"logps/chosen": -466.73529052734375,
"logps/rejected": -645.5926513671875,
"loss": 0.5157,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2824543714523315,
"rewards/margins": 1.4018195867538452,
"rewards/rejected": -2.684274196624756,
"step": 1830
},
{
"epoch": 0.44145873320537427,
"grad_norm": 41.075704870340594,
"learning_rate": 3.424942934681453e-07,
"logits/chosen": 0.27590471506118774,
"logits/rejected": 0.33334219455718994,
"logps/chosen": -408.6986083984375,
"logps/rejected": -506.6996154785156,
"loss": 0.5031,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9812175035476685,
"rewards/margins": 1.1212607622146606,
"rewards/rejected": -2.10247802734375,
"step": 1840
},
{
"epoch": 0.44385796545105566,
"grad_norm": 51.35010815764105,
"learning_rate": 3.405458122978804e-07,
"logits/chosen": 0.28459858894348145,
"logits/rejected": 0.24139773845672607,
"logps/chosen": -467.11932373046875,
"logps/rejected": -536.8555908203125,
"loss": 0.4962,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.151214838027954,
"rewards/margins": 0.9230279922485352,
"rewards/rejected": -2.0742428302764893,
"step": 1850
},
{
"epoch": 0.44625719769673705,
"grad_norm": 58.94555175884757,
"learning_rate": 3.3859097969861633e-07,
"logits/chosen": 0.3147757649421692,
"logits/rejected": 0.296464741230011,
"logps/chosen": -475.61175537109375,
"logps/rejected": -521.8074951171875,
"loss": 0.5255,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.389574646949768,
"rewards/margins": 0.8023090362548828,
"rewards/rejected": -2.1918835639953613,
"step": 1860
},
{
"epoch": 0.44865642994241844,
"grad_norm": 43.58295874945141,
"learning_rate": 3.366299327940936e-07,
"logits/chosen": 0.2593730092048645,
"logits/rejected": 0.1364545077085495,
"logps/chosen": -485.0771484375,
"logps/rejected": -578.6021728515625,
"loss": 0.512,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2985343933105469,
"rewards/margins": 0.7179661989212036,
"rewards/rejected": -2.01650071144104,
"step": 1870
},
{
"epoch": 0.4510556621880998,
"grad_norm": 33.90433109567072,
"learning_rate": 3.3466280914396117e-07,
"logits/chosen": 0.17524075508117676,
"logits/rejected": 0.12327942997217178,
"logps/chosen": -436.56536865234375,
"logps/rejected": -551.0841064453125,
"loss": 0.5111,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3039577007293701,
"rewards/margins": 0.9412840604782104,
"rewards/rejected": -2.24524188041687,
"step": 1880
},
{
"epoch": 0.4534548944337812,
"grad_norm": 48.57573160920276,
"learning_rate": 3.326897467341281e-07,
"logits/chosen": 0.10545514523983002,
"logits/rejected": 0.10264859348535538,
"logps/chosen": -394.76776123046875,
"logps/rejected": -494.9923400878906,
"loss": 0.525,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1733514070510864,
"rewards/margins": 0.8517268896102905,
"rewards/rejected": -2.025078296661377,
"step": 1890
},
{
"epoch": 0.45585412667946257,
"grad_norm": 52.20592211080183,
"learning_rate": 3.3071088396708335e-07,
"logits/chosen": 0.16945740580558777,
"logits/rejected": 0.12676987051963806,
"logps/chosen": -370.2272033691406,
"logps/rejected": -489.0442810058594,
"loss": 0.503,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9613567590713501,
"rewards/margins": 1.0801159143447876,
"rewards/rejected": -2.0414726734161377,
"step": 1900
},
{
"epoch": 0.45825335892514396,
"grad_norm": 39.11646762477483,
"learning_rate": 3.2872635965218824e-07,
"logits/chosen": 0.36154884099960327,
"logits/rejected": 0.3266182541847229,
"logps/chosen": -472.8251037597656,
"logps/rejected": -570.266845703125,
"loss": 0.524,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5062012672424316,
"rewards/margins": 0.7707003355026245,
"rewards/rejected": -2.2769012451171875,
"step": 1910
},
{
"epoch": 0.46065259117082535,
"grad_norm": 33.57125004563553,
"learning_rate": 3.2673631299593905e-07,
"logits/chosen": 0.20721454918384552,
"logits/rejected": 0.24143996834754944,
"logps/chosen": -474.7398376464844,
"logps/rejected": -535.4746704101562,
"loss": 0.5204,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3117005825042725,
"rewards/margins": 0.8077449798583984,
"rewards/rejected": -2.119445562362671,
"step": 1920
},
{
"epoch": 0.4630518234165067,
"grad_norm": 49.103327518032735,
"learning_rate": 3.247408835922024e-07,
"logits/chosen": 0.3439037799835205,
"logits/rejected": 0.2654734253883362,
"logps/chosen": -527.8143310546875,
"logps/rejected": -611.921875,
"loss": 0.506,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5940402746200562,
"rewards/margins": 0.856569766998291,
"rewards/rejected": -2.4506099224090576,
"step": 1930
},
{
"epoch": 0.4654510556621881,
"grad_norm": 49.876953655813104,
"learning_rate": 3.2274021141242306e-07,
"logits/chosen": 0.43298500776290894,
"logits/rejected": 0.4217461049556732,
"logps/chosen": -458.80535888671875,
"logps/rejected": -546.6775512695312,
"loss": 0.4867,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2974848747253418,
"rewards/margins": 0.8723615407943726,
"rewards/rejected": -2.169846534729004,
"step": 1940
},
{
"epoch": 0.4678502879078695,
"grad_norm": 59.65377964792493,
"learning_rate": 3.2073443679580613e-07,
"logits/chosen": 0.2417244166135788,
"logits/rejected": 0.23499338328838348,
"logps/chosen": -469.7822265625,
"logps/rejected": -545.030517578125,
"loss": 0.4895,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3010753393173218,
"rewards/margins": 0.6901552081108093,
"rewards/rejected": -1.9912303686141968,
"step": 1950
},
{
"epoch": 0.47024952015355087,
"grad_norm": 48.010287716369675,
"learning_rate": 3.1872370043947194e-07,
"logits/chosen": 0.44626665115356445,
"logits/rejected": 0.40652981400489807,
"logps/chosen": -418.04608154296875,
"logps/rejected": -536.6441650390625,
"loss": 0.464,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.8487616777420044,
"rewards/margins": 1.3238131999969482,
"rewards/rejected": -2.1725752353668213,
"step": 1960
},
{
"epoch": 0.47264875239923226,
"grad_norm": 46.983751656355615,
"learning_rate": 3.167081433885874e-07,
"logits/chosen": 0.4636153280735016,
"logits/rejected": 0.4149314761161804,
"logps/chosen": -560.605712890625,
"logps/rejected": -700.2041015625,
"loss": 0.4653,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.5527485609054565,
"rewards/margins": 0.9842365384101868,
"rewards/rejected": -2.536984920501709,
"step": 1970
},
{
"epoch": 0.4750479846449136,
"grad_norm": 46.832783839770435,
"learning_rate": 3.14687907026472e-07,
"logits/chosen": 0.34756892919540405,
"logits/rejected": 0.3631365895271301,
"logps/chosen": -452.46533203125,
"logps/rejected": -579.3941650390625,
"loss": 0.5059,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.4736191034317017,
"rewards/margins": 1.1666433811187744,
"rewards/rejected": -2.6402623653411865,
"step": 1980
},
{
"epoch": 0.477447216890595,
"grad_norm": 51.65835958499199,
"learning_rate": 3.126631330646801e-07,
"logits/chosen": 0.28329282999038696,
"logits/rejected": 0.23793701827526093,
"logps/chosen": -574.2579956054688,
"logps/rejected": -644.44677734375,
"loss": 0.5244,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -2.125730037689209,
"rewards/margins": 0.6223888397216797,
"rewards/rejected": -2.7481188774108887,
"step": 1990
},
{
"epoch": 0.4798464491362764,
"grad_norm": 56.94846979096699,
"learning_rate": 3.1063396353306097e-07,
"logits/chosen": 0.370736300945282,
"logits/rejected": 0.43973660469055176,
"logps/chosen": -495.07171630859375,
"logps/rejected": -519.3072509765625,
"loss": 0.5009,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2859165668487549,
"rewards/margins": 0.9244076609611511,
"rewards/rejected": -2.2103240489959717,
"step": 2000
},
{
"epoch": 0.4798464491362764,
"eval_logits/chosen": 0.5136142373085022,
"eval_logits/rejected": 0.4682252109050751,
"eval_logps/chosen": -468.3976135253906,
"eval_logps/rejected": -586.2582397460938,
"eval_loss": 0.49979615211486816,
"eval_rewards/accuracies": 0.7803571224212646,
"eval_rewards/chosen": -1.4972540140151978,
"eval_rewards/margins": 1.1174662113189697,
"eval_rewards/rejected": -2.614720106124878,
"eval_runtime": 185.1772,
"eval_samples_per_second": 24.09,
"eval_steps_per_second": 0.378,
"step": 2000
},
{
"epoch": 0.4822456813819578,
"grad_norm": 61.64399490626884,
"learning_rate": 3.0860054076979535e-07,
"logits/chosen": 0.34754273295402527,
"logits/rejected": 0.3305366635322571,
"logps/chosen": -490.9940490722656,
"logps/rejected": -570.1460571289062,
"loss": 0.4823,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.5707345008850098,
"rewards/margins": 1.0668280124664307,
"rewards/rejected": -2.6375622749328613,
"step": 2010
},
{
"epoch": 0.4846449136276392,
"grad_norm": 54.18063166333372,
"learning_rate": 3.065630074114115e-07,
"logits/chosen": 0.3459337651729584,
"logits/rejected": 0.36747267842292786,
"logps/chosen": -486.1033630371094,
"logps/rejected": -554.5687866210938,
"loss": 0.5237,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3654518127441406,
"rewards/margins": 1.115387201309204,
"rewards/rejected": -2.4808387756347656,
"step": 2020
},
{
"epoch": 0.4870441458733205,
"grad_norm": 48.37481533662561,
"learning_rate": 3.0452150638277947e-07,
"logits/chosen": 0.3800879120826721,
"logits/rejected": 0.3224307894706726,
"logps/chosen": -418.9652404785156,
"logps/rejected": -507.47747802734375,
"loss": 0.5304,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2180391550064087,
"rewards/margins": 0.8009985685348511,
"rewards/rejected": -2.0190374851226807,
"step": 2030
},
{
"epoch": 0.4894433781190019,
"grad_norm": 35.82205388348395,
"learning_rate": 3.024761808870856e-07,
"logits/chosen": 0.43575650453567505,
"logits/rejected": 0.3246951997280121,
"logps/chosen": -394.49700927734375,
"logps/rejected": -528.8324584960938,
"loss": 0.4962,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9007269740104675,
"rewards/margins": 1.4299715757369995,
"rewards/rejected": -2.3306984901428223,
"step": 2040
},
{
"epoch": 0.4918426103646833,
"grad_norm": 69.46282819499118,
"learning_rate": 3.004271743957875e-07,
"logits/chosen": 0.1757555603981018,
"logits/rejected": 0.11282005161046982,
"logps/chosen": -492.5611877441406,
"logps/rejected": -602.2737426757812,
"loss": 0.5181,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.534406304359436,
"rewards/margins": 0.9544004201889038,
"rewards/rejected": -2.48880672454834,
"step": 2050
},
{
"epoch": 0.4942418426103647,
"grad_norm": 45.79139932334549,
"learning_rate": 2.983746306385499e-07,
"logits/chosen": 0.26720863580703735,
"logits/rejected": 0.22653250396251678,
"logps/chosen": -450.50970458984375,
"logps/rejected": -577.3936767578125,
"loss": 0.481,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3142211437225342,
"rewards/margins": 1.129241704940796,
"rewards/rejected": -2.44346284866333,
"step": 2060
},
{
"epoch": 0.4966410748560461,
"grad_norm": 43.321422686785745,
"learning_rate": 2.963186935931628e-07,
"logits/chosen": 0.3077266812324524,
"logits/rejected": 0.2476225346326828,
"logps/chosen": -480.34619140625,
"logps/rejected": -587.4752197265625,
"loss": 0.489,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.302478551864624,
"rewards/margins": 1.1229597330093384,
"rewards/rejected": -2.425438404083252,
"step": 2070
},
{
"epoch": 0.4990403071017274,
"grad_norm": 45.25903246638121,
"learning_rate": 2.9425950747544176e-07,
"logits/chosen": 0.2362133264541626,
"logits/rejected": 0.20862069725990295,
"logps/chosen": -528.6657104492188,
"logps/rejected": -640.0977172851562,
"loss": 0.4865,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.531398057937622,
"rewards/margins": 1.434666633605957,
"rewards/rejected": -2.966064929962158,
"step": 2080
},
{
"epoch": 0.5014395393474088,
"grad_norm": 46.57096517661485,
"learning_rate": 2.921972167291119e-07,
"logits/chosen": 0.1148526519536972,
"logits/rejected": 0.0883648619055748,
"logps/chosen": -483.4339904785156,
"logps/rejected": -601.8978271484375,
"loss": 0.4907,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3184032440185547,
"rewards/margins": 0.981913685798645,
"rewards/rejected": -2.3003170490264893,
"step": 2090
},
{
"epoch": 0.5038387715930902,
"grad_norm": 39.63447443095947,
"learning_rate": 2.9013196601567567e-07,
"logits/chosen": 0.08627250045537949,
"logits/rejected": 0.10158304125070572,
"logps/chosen": -421.615478515625,
"logps/rejected": -528.127685546875,
"loss": 0.5409,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1398550271987915,
"rewards/margins": 0.95441073179245,
"rewards/rejected": -2.0942656993865967,
"step": 2100
},
{
"epoch": 0.5062380038387716,
"grad_norm": 36.05433569174663,
"learning_rate": 2.8806390020426555e-07,
"logits/chosen": 0.0710478127002716,
"logits/rejected": 0.05162844806909561,
"logps/chosen": -453.2339782714844,
"logps/rejected": -557.0374145507812,
"loss": 0.4948,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1150130033493042,
"rewards/margins": 1.007678747177124,
"rewards/rejected": -2.1226916313171387,
"step": 2110
},
{
"epoch": 0.508637236084453,
"grad_norm": 50.26433068268168,
"learning_rate": 2.8599316436148187e-07,
"logits/chosen": 0.24139384925365448,
"logits/rejected": 0.21716871857643127,
"logps/chosen": -447.4005432128906,
"logps/rejected": -504.754638671875,
"loss": 0.4837,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.421812653541565,
"rewards/margins": 0.5791618227958679,
"rewards/rejected": -2.000974655151367,
"step": 2120
},
{
"epoch": 0.5110364683301344,
"grad_norm": 48.79279161854594,
"learning_rate": 2.8391990374121723e-07,
"logits/chosen": 0.14107191562652588,
"logits/rejected": 0.05996360257267952,
"logps/chosen": -447.6856384277344,
"logps/rejected": -572.6692504882812,
"loss": 0.5309,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3457109928131104,
"rewards/margins": 1.006974458694458,
"rewards/rejected": -2.3526854515075684,
"step": 2130
},
{
"epoch": 0.5134357005758158,
"grad_norm": 49.554326824350056,
"learning_rate": 2.818442637744669e-07,
"logits/chosen": 0.14974358677864075,
"logits/rejected": 0.07151228934526443,
"logps/chosen": -468.7398986816406,
"logps/rejected": -561.0519409179688,
"loss": 0.529,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4705628156661987,
"rewards/margins": 0.9486227035522461,
"rewards/rejected": -2.4191856384277344,
"step": 2140
},
{
"epoch": 0.5158349328214972,
"grad_norm": 49.03414708222374,
"learning_rate": 2.797663900591284e-07,
"logits/chosen": 0.12192866951227188,
"logits/rejected": 0.1623045951128006,
"logps/chosen": -484.0595703125,
"logps/rejected": -536.2966918945312,
"loss": 0.4726,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.466080904006958,
"rewards/margins": 0.9258912205696106,
"rewards/rejected": -2.3919718265533447,
"step": 2150
},
{
"epoch": 0.5182341650671785,
"grad_norm": 47.623947511820035,
"learning_rate": 2.776864283497874e-07,
"logits/chosen": 0.2551673352718353,
"logits/rejected": 0.25919514894485474,
"logps/chosen": -450.1902770996094,
"logps/rejected": -603.4628295898438,
"loss": 0.5047,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.427159309387207,
"rewards/margins": 1.6337556838989258,
"rewards/rejected": -3.0609147548675537,
"step": 2160
},
{
"epoch": 0.5206333973128598,
"grad_norm": 39.32544622434657,
"learning_rate": 2.756045245474943e-07,
"logits/chosen": 0.1113271713256836,
"logits/rejected": 0.06980106979608536,
"logps/chosen": -477.89288330078125,
"logps/rejected": -590.3850708007812,
"loss": 0.5136,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3194444179534912,
"rewards/margins": 0.8708696365356445,
"rewards/rejected": -2.1903140544891357,
"step": 2170
},
{
"epoch": 0.5230326295585412,
"grad_norm": 41.32084808239206,
"learning_rate": 2.7352082468952977e-07,
"logits/chosen": 0.18554797768592834,
"logits/rejected": 0.10465570539236069,
"logps/chosen": -464.09027099609375,
"logps/rejected": -614.1561279296875,
"loss": 0.5172,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5660805702209473,
"rewards/margins": 1.2777998447418213,
"rewards/rejected": -2.8438801765441895,
"step": 2180
},
{
"epoch": 0.5254318618042226,
"grad_norm": 67.92788558845768,
"learning_rate": 2.7143547493916e-07,
"logits/chosen": 0.18377096951007843,
"logits/rejected": 0.10271792113780975,
"logps/chosen": -409.5820007324219,
"logps/rejected": -570.1399536132812,
"loss": 0.4842,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.100378394126892,
"rewards/margins": 1.5573487281799316,
"rewards/rejected": -2.657727003097534,
"step": 2190
},
{
"epoch": 0.527831094049904,
"grad_norm": 50.707037903665324,
"learning_rate": 2.693486215753853e-07,
"logits/chosen": 0.12866708636283875,
"logits/rejected": 0.06411238014698029,
"logps/chosen": -418.26715087890625,
"logps/rejected": -512.0023193359375,
"loss": 0.5342,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2301840782165527,
"rewards/margins": 1.0781285762786865,
"rewards/rejected": -2.30831241607666,
"step": 2200
},
{
"epoch": 0.5302303262955854,
"grad_norm": 46.86231101360818,
"learning_rate": 2.6726041098267805e-07,
"logits/chosen": -0.031896281987428665,
"logits/rejected": -0.030716899782419205,
"logps/chosen": -487.5838928222656,
"logps/rejected": -485.93646240234375,
"loss": 0.5532,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3748475313186646,
"rewards/margins": 0.40763726830482483,
"rewards/rejected": -1.7824846506118774,
"step": 2210
},
{
"epoch": 0.5326295585412668,
"grad_norm": 72.55957637434223,
"learning_rate": 2.6517098964071507e-07,
"logits/chosen": 0.23304399847984314,
"logits/rejected": 0.22425612807273865,
"logps/chosen": -444.8831481933594,
"logps/rejected": -506.5235900878906,
"loss": 0.546,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1518399715423584,
"rewards/margins": 0.4722941517829895,
"rewards/rejected": -1.6241340637207031,
"step": 2220
},
{
"epoch": 0.5350287907869482,
"grad_norm": 43.727526427782365,
"learning_rate": 2.630805041141023e-07,
"logits/chosen": 0.2689264118671417,
"logits/rejected": 0.2254217565059662,
"logps/chosen": -403.8577880859375,
"logps/rejected": -512.3160400390625,
"loss": 0.5046,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.058393120765686,
"rewards/margins": 0.9750925302505493,
"rewards/rejected": -2.0334856510162354,
"step": 2230
},
{
"epoch": 0.5374280230326296,
"grad_norm": 49.11098211804801,
"learning_rate": 2.609891010420941e-07,
"logits/chosen": 0.18164226412773132,
"logits/rejected": 0.17124636471271515,
"logps/chosen": -454.2110290527344,
"logps/rejected": -558.4046630859375,
"loss": 0.465,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.1852877140045166,
"rewards/margins": 1.1636625528335571,
"rewards/rejected": -2.3489503860473633,
"step": 2240
},
{
"epoch": 0.539827255278311,
"grad_norm": 43.26353709722887,
"learning_rate": 2.5889692712830674e-07,
"logits/chosen": 0.052560679614543915,
"logits/rejected": 0.03842206671833992,
"logps/chosen": -396.25408935546875,
"logps/rejected": -478.73236083984375,
"loss": 0.4734,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -0.9645735621452332,
"rewards/margins": 0.9311714172363281,
"rewards/rejected": -1.895745038986206,
"step": 2250
},
{
"epoch": 0.5422264875239923,
"grad_norm": 47.10267589353339,
"learning_rate": 2.5680412913042843e-07,
"logits/chosen": 0.23019644618034363,
"logits/rejected": 0.179383784532547,
"logps/chosen": -421.8323669433594,
"logps/rejected": -528.0892333984375,
"loss": 0.5044,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2181730270385742,
"rewards/margins": 1.0768169164657593,
"rewards/rejected": -2.294990062713623,
"step": 2260
},
{
"epoch": 0.5446257197696737,
"grad_norm": 49.07465366967735,
"learning_rate": 2.5471085384992404e-07,
"logits/chosen": 0.21075716614723206,
"logits/rejected": 0.0905676931142807,
"logps/chosen": -402.1150207519531,
"logps/rejected": -584.5428466796875,
"loss": 0.4919,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.061937928199768,
"rewards/margins": 1.6330102682113647,
"rewards/rejected": -2.694948196411133,
"step": 2270
},
{
"epoch": 0.5470249520153551,
"grad_norm": 47.06580983617911,
"learning_rate": 2.526172481217381e-07,
"logits/chosen": 0.28002408146858215,
"logits/rejected": 0.19437995553016663,
"logps/chosen": -421.2408142089844,
"logps/rejected": -556.0794067382812,
"loss": 0.5198,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5300331115722656,
"rewards/margins": 1.192158579826355,
"rewards/rejected": -2.722191572189331,
"step": 2280
},
{
"epoch": 0.5494241842610365,
"grad_norm": 42.697690556320396,
"learning_rate": 2.5052345880399456e-07,
"logits/chosen": 0.336375892162323,
"logits/rejected": 0.33653944730758667,
"logps/chosen": -417.27496337890625,
"logps/rejected": -494.6957092285156,
"loss": 0.4616,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.37350594997406,
"rewards/margins": 0.7126041650772095,
"rewards/rejected": -2.0861101150512695,
"step": 2290
},
{
"epoch": 0.5518234165067178,
"grad_norm": 44.24690759792965,
"learning_rate": 2.4842963276769555e-07,
"logits/chosen": 0.46479305624961853,
"logits/rejected": 0.34474366903305054,
"logps/chosen": -428.14227294921875,
"logps/rejected": -594.80224609375,
"loss": 0.5059,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.475941777229309,
"rewards/margins": 1.2353615760803223,
"rewards/rejected": -2.711303472518921,
"step": 2300
},
{
"epoch": 0.5542226487523992,
"grad_norm": 42.732671934213585,
"learning_rate": 2.463359168864189e-07,
"logits/chosen": 0.25363442301750183,
"logits/rejected": 0.3057165741920471,
"logps/chosen": -501.9913635253906,
"logps/rejected": -549.3098754882812,
"loss": 0.5308,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.290345311164856,
"rewards/margins": 1.010578989982605,
"rewards/rejected": -2.300924301147461,
"step": 2310
},
{
"epoch": 0.5566218809980806,
"grad_norm": 56.4377037562831,
"learning_rate": 2.4424245802601555e-07,
"logits/chosen": 0.2584269642829895,
"logits/rejected": 0.18541845679283142,
"logps/chosen": -429.4263610839844,
"logps/rejected": -564.8827514648438,
"loss": 0.4823,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1343291997909546,
"rewards/margins": 0.8212429285049438,
"rewards/rejected": -1.9555721282958984,
"step": 2320
},
{
"epoch": 0.559021113243762,
"grad_norm": 43.562067174648554,
"learning_rate": 2.421494030343072e-07,
"logits/chosen": 0.3927503228187561,
"logits/rejected": 0.4579402506351471,
"logps/chosen": -454.2933044433594,
"logps/rejected": -463.19879150390625,
"loss": 0.5602,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2649152278900146,
"rewards/margins": 0.6680216193199158,
"rewards/rejected": -1.9329369068145752,
"step": 2330
},
{
"epoch": 0.5614203454894434,
"grad_norm": 58.65475476508653,
"learning_rate": 2.400568987307861e-07,
"logits/chosen": 0.4964686334133148,
"logits/rejected": 0.5107001662254333,
"logps/chosen": -432.31341552734375,
"logps/rejected": -462.37957763671875,
"loss": 0.4724,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.243574857711792,
"rewards/margins": 0.5827276110649109,
"rewards/rejected": -1.8263022899627686,
"step": 2340
},
{
"epoch": 0.5638195777351248,
"grad_norm": 58.315209990127244,
"learning_rate": 2.379650918963156e-07,
"logits/chosen": 0.3746911585330963,
"logits/rejected": 0.3054753541946411,
"logps/chosen": -421.2218322753906,
"logps/rejected": -541.8524169921875,
"loss": 0.4768,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4736093282699585,
"rewards/margins": 1.0763620138168335,
"rewards/rejected": -2.549971342086792,
"step": 2350
},
{
"epoch": 0.5662188099808061,
"grad_norm": 48.959910400597586,
"learning_rate": 2.3587412926283438e-07,
"logits/chosen": 0.35963717103004456,
"logits/rejected": 0.28781235218048096,
"logps/chosen": -480.2315368652344,
"logps/rejected": -566.6637573242188,
"loss": 0.5414,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1883009672164917,
"rewards/margins": 1.270986795425415,
"rewards/rejected": -2.459287643432617,
"step": 2360
},
{
"epoch": 0.5686180422264875,
"grad_norm": 30.962931166603095,
"learning_rate": 2.337841575030642e-07,
"logits/chosen": 0.35713425278663635,
"logits/rejected": 0.30424803495407104,
"logps/chosen": -492.8209533691406,
"logps/rejected": -574.771240234375,
"loss": 0.5061,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.396402359008789,
"rewards/margins": 0.7614862322807312,
"rewards/rejected": -2.157888889312744,
"step": 2370
},
{
"epoch": 0.5710172744721689,
"grad_norm": 42.85766086532651,
"learning_rate": 2.316953232202206e-07,
"logits/chosen": 0.550395131111145,
"logits/rejected": 0.6783905029296875,
"logps/chosen": -430.48162841796875,
"logps/rejected": -421.599853515625,
"loss": 0.485,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2630993127822876,
"rewards/margins": 0.6726707220077515,
"rewards/rejected": -1.9357702732086182,
"step": 2380
},
{
"epoch": 0.5734165067178503,
"grad_norm": 38.41377863495817,
"learning_rate": 2.2960777293772958e-07,
"logits/chosen": 0.5615749359130859,
"logits/rejected": 0.6018954515457153,
"logps/chosen": -397.6216125488281,
"logps/rejected": -480.299560546875,
"loss": 0.4712,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1725983619689941,
"rewards/margins": 0.9747906923294067,
"rewards/rejected": -2.1473889350891113,
"step": 2390
},
{
"epoch": 0.5758157389635317,
"grad_norm": 40.03422172905619,
"learning_rate": 2.2752165308894974e-07,
"logits/chosen": 0.46104907989501953,
"logits/rejected": 0.44198736548423767,
"logps/chosen": -378.5218505859375,
"logps/rejected": -456.65576171875,
"loss": 0.479,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1959477663040161,
"rewards/margins": 0.9340691566467285,
"rewards/rejected": -2.130016803741455,
"step": 2400
},
{
"epoch": 0.5782149712092131,
"grad_norm": 54.11084171812038,
"learning_rate": 2.254371100069005e-07,
"logits/chosen": 0.457451730966568,
"logits/rejected": 0.320446252822876,
"logps/chosen": -431.6908264160156,
"logps/rejected": -538.86865234375,
"loss": 0.4874,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1469987630844116,
"rewards/margins": 0.8605014085769653,
"rewards/rejected": -2.007500171661377,
"step": 2410
},
{
"epoch": 0.5806142034548945,
"grad_norm": 54.79494157401916,
"learning_rate": 2.2335428991399725e-07,
"logits/chosen": 0.47143587470054626,
"logits/rejected": 0.4143534302711487,
"logps/chosen": -398.853271484375,
"logps/rejected": -594.490966796875,
"loss": 0.5197,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.449357509613037,
"rewards/margins": 1.8377138376235962,
"rewards/rejected": -3.287071704864502,
"step": 2420
},
{
"epoch": 0.5830134357005758,
"grad_norm": 47.37355935293041,
"learning_rate": 2.2127333891179458e-07,
"logits/chosen": 0.4510342478752136,
"logits/rejected": 0.36793094873428345,
"logps/chosen": -419.24163818359375,
"logps/rejected": -571.1874389648438,
"loss": 0.5193,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3948280811309814,
"rewards/margins": 1.2814536094665527,
"rewards/rejected": -2.676281690597534,
"step": 2430
},
{
"epoch": 0.5854126679462572,
"grad_norm": 65.89730578952388,
"learning_rate": 2.1919440297073782e-07,
"logits/chosen": 0.3510410785675049,
"logits/rejected": 0.3182118535041809,
"logps/chosen": -415.6324157714844,
"logps/rejected": -527.152099609375,
"loss": 0.5265,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.5300906896591187,
"rewards/margins": 1.0138437747955322,
"rewards/rejected": -2.5439343452453613,
"step": 2440
},
{
"epoch": 0.5878119001919386,
"grad_norm": 42.30705937238165,
"learning_rate": 2.1711762791992368e-07,
"logits/chosen": 0.43873363733291626,
"logits/rejected": 0.46004414558410645,
"logps/chosen": -474.68341064453125,
"logps/rejected": -534.9579467773438,
"loss": 0.5254,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1993557214736938,
"rewards/margins": 0.8993379473686218,
"rewards/rejected": -2.098693370819092,
"step": 2450
},
{
"epoch": 0.5902111324376199,
"grad_norm": 45.61922027456477,
"learning_rate": 2.1504315943687114e-07,
"logits/chosen": 0.18166793882846832,
"logits/rejected": 0.07724637538194656,
"logps/chosen": -408.96893310546875,
"logps/rejected": -581.2828979492188,
"loss": 0.4625,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.0965474843978882,
"rewards/margins": 1.2784209251403809,
"rewards/rejected": -2.3749685287475586,
"step": 2460
},
{
"epoch": 0.5926103646833013,
"grad_norm": 53.517726559327514,
"learning_rate": 2.1297114303730248e-07,
"logits/chosen": 0.3896231949329376,
"logits/rejected": 0.2409631460905075,
"logps/chosen": -423.19287109375,
"logps/rejected": -586.328857421875,
"loss": 0.5399,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.327804446220398,
"rewards/margins": 1.076370120048523,
"rewards/rejected": -2.404174327850342,
"step": 2470
},
{
"epoch": 0.5950095969289827,
"grad_norm": 41.01802265556329,
"learning_rate": 2.1090172406493616e-07,
"logits/chosen": 0.3331597149372101,
"logits/rejected": 0.2225189208984375,
"logps/chosen": -397.5386657714844,
"logps/rejected": -519.840087890625,
"loss": 0.4603,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.131462812423706,
"rewards/margins": 0.950838565826416,
"rewards/rejected": -2.082301378250122,
"step": 2480
},
{
"epoch": 0.5974088291746641,
"grad_norm": 60.21604361600221,
"learning_rate": 2.0883504768129146e-07,
"logits/chosen": 0.30570241808891296,
"logits/rejected": 0.24165184795856476,
"logps/chosen": -461.2522888183594,
"logps/rejected": -565.5457763671875,
"loss": 0.5168,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1941462755203247,
"rewards/margins": 1.1086372137069702,
"rewards/rejected": -2.302783489227295,
"step": 2490
},
{
"epoch": 0.5998080614203455,
"grad_norm": 45.67541969535949,
"learning_rate": 2.0677125885550571e-07,
"logits/chosen": 0.4085448384284973,
"logits/rejected": 0.48327702283859253,
"logps/chosen": -436.59857177734375,
"logps/rejected": -471.85498046875,
"loss": 0.4864,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.352922797203064,
"rewards/margins": 0.7829147577285767,
"rewards/rejected": -2.1358375549316406,
"step": 2500
},
{
"epoch": 0.6022072936660269,
"grad_norm": 60.523710599155514,
"learning_rate": 2.0471050235416587e-07,
"logits/chosen": 0.14623039960861206,
"logits/rejected": 0.19062075018882751,
"logps/chosen": -451.1435546875,
"logps/rejected": -491.1160583496094,
"loss": 0.4579,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3129417896270752,
"rewards/margins": 0.9290571212768555,
"rewards/rejected": -2.2419991493225098,
"step": 2510
},
{
"epoch": 0.6046065259117083,
"grad_norm": 52.23271499985374,
"learning_rate": 2.026529227311532e-07,
"logits/chosen": 0.29617246985435486,
"logits/rejected": 0.2822147011756897,
"logps/chosen": -423.54315185546875,
"logps/rejected": -501.5276794433594,
"loss": 0.5351,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.437732458114624,
"rewards/margins": 0.6575521230697632,
"rewards/rejected": -2.0952847003936768,
"step": 2520
},
{
"epoch": 0.6070057581573897,
"grad_norm": 44.954316731149845,
"learning_rate": 2.005986643175036e-07,
"logits/chosen": 0.3328186571598053,
"logits/rejected": 0.2537630498409271,
"logps/chosen": -454.51580810546875,
"logps/rejected": -574.1419677734375,
"loss": 0.4529,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1176398992538452,
"rewards/margins": 1.3029248714447021,
"rewards/rejected": -2.420564889907837,
"step": 2530
},
{
"epoch": 0.6094049904030711,
"grad_norm": 63.16908223607974,
"learning_rate": 1.9854787121128328e-07,
"logits/chosen": 0.31036069989204407,
"logits/rejected": 0.34982046484947205,
"logps/chosen": -397.23980712890625,
"logps/rejected": -403.78509521484375,
"loss": 0.5048,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.217882752418518,
"rewards/margins": 0.5209786891937256,
"rewards/rejected": -1.7388614416122437,
"step": 2540
},
{
"epoch": 0.6118042226487524,
"grad_norm": 54.60861450055549,
"learning_rate": 1.9650068726748106e-07,
"logits/chosen": 0.3659752309322357,
"logits/rejected": 0.35895493626594543,
"logps/chosen": -461.1573181152344,
"logps/rejected": -573.6448364257812,
"loss": 0.5214,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.474023699760437,
"rewards/margins": 1.084517240524292,
"rewards/rejected": -2.5585405826568604,
"step": 2550
},
{
"epoch": 0.6142034548944337,
"grad_norm": 60.46600684768552,
"learning_rate": 1.9445725608791718e-07,
"logits/chosen": 0.34406715631484985,
"logits/rejected": 0.28216245770454407,
"logps/chosen": -460.77978515625,
"logps/rejected": -619.3160400390625,
"loss": 0.497,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3329025506973267,
"rewards/margins": 1.6671111583709717,
"rewards/rejected": -3.000014066696167,
"step": 2560
},
{
"epoch": 0.6166026871401151,
"grad_norm": 47.40884309447939,
"learning_rate": 1.924177210111705e-07,
"logits/chosen": 0.29457220435142517,
"logits/rejected": 0.29915186762809753,
"logps/chosen": -407.5345153808594,
"logps/rejected": -541.1593017578125,
"loss": 0.5182,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2408500909805298,
"rewards/margins": 1.2775036096572876,
"rewards/rejected": -2.5183534622192383,
"step": 2570
},
{
"epoch": 0.6190019193857965,
"grad_norm": 45.99798362644753,
"learning_rate": 1.9038222510252364e-07,
"logits/chosen": 0.25425729155540466,
"logits/rejected": 0.24261541664600372,
"logps/chosen": -444.73992919921875,
"logps/rejected": -504.9520568847656,
"loss": 0.499,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1787656545639038,
"rewards/margins": 0.8464619517326355,
"rewards/rejected": -2.0252277851104736,
"step": 2580
},
{
"epoch": 0.6214011516314779,
"grad_norm": 52.02289887758591,
"learning_rate": 1.883509111439277e-07,
"logits/chosen": 0.3976004123687744,
"logits/rejected": 0.28759509325027466,
"logps/chosen": -438.90008544921875,
"logps/rejected": -641.5413818359375,
"loss": 0.5341,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.4814860820770264,
"rewards/margins": 1.3605000972747803,
"rewards/rejected": -2.8419861793518066,
"step": 2590
},
{
"epoch": 0.6238003838771593,
"grad_norm": 32.303683781858304,
"learning_rate": 1.8632392162398665e-07,
"logits/chosen": 0.23672600090503693,
"logits/rejected": 0.15976786613464355,
"logps/chosen": -484.119873046875,
"logps/rejected": -645.5545654296875,
"loss": 0.4698,
"rewards/accuracies": 0.9750000238418579,
"rewards/chosen": -1.1118555068969727,
"rewards/margins": 1.7390865087509155,
"rewards/rejected": -2.8509418964385986,
"step": 2600
},
{
"epoch": 0.6261996161228407,
"grad_norm": 45.29250569251351,
"learning_rate": 1.84301398727962e-07,
"logits/chosen": 0.4794914722442627,
"logits/rejected": 0.37679189443588257,
"logps/chosen": -368.31109619140625,
"logps/rejected": -579.0586547851562,
"loss": 0.496,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.0693645477294922,
"rewards/margins": 1.7801597118377686,
"rewards/rejected": -2.8495242595672607,
"step": 2610
},
{
"epoch": 0.6285988483685221,
"grad_norm": 62.710557092048646,
"learning_rate": 1.8228348432779966e-07,
"logits/chosen": 0.2735206186771393,
"logits/rejected": 0.24361078441143036,
"logps/chosen": -426.20111083984375,
"logps/rejected": -496.5086364746094,
"loss": 0.5445,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3294744491577148,
"rewards/margins": 0.8788881301879883,
"rewards/rejected": -2.208362579345703,
"step": 2620
},
{
"epoch": 0.6309980806142035,
"grad_norm": 73.52463716987671,
"learning_rate": 1.8027031997217773e-07,
"logits/chosen": 0.3862006962299347,
"logits/rejected": 0.27332574129104614,
"logps/chosen": -411.3087463378906,
"logps/rejected": -538.5333251953125,
"loss": 0.4613,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4152649641036987,
"rewards/margins": 1.0575337409973145,
"rewards/rejected": -2.4727988243103027,
"step": 2630
},
{
"epoch": 0.6333973128598849,
"grad_norm": 47.969317887923054,
"learning_rate": 1.7826204687657758e-07,
"logits/chosen": 0.2865044176578522,
"logits/rejected": 0.33233708143234253,
"logps/chosen": -468.67108154296875,
"logps/rejected": -502.1822814941406,
"loss": 0.4738,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1685740947723389,
"rewards/margins": 0.8533760905265808,
"rewards/rejected": -2.0219502449035645,
"step": 2640
},
{
"epoch": 0.6357965451055663,
"grad_norm": 46.75296720560617,
"learning_rate": 1.762588059133781e-07,
"logits/chosen": 0.3442167043685913,
"logits/rejected": 0.4161573350429535,
"logps/chosen": -470.08807373046875,
"logps/rejected": -546.142578125,
"loss": 0.4768,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.227698564529419,
"rewards/margins": 1.140878677368164,
"rewards/rejected": -2.368577003479004,
"step": 2650
},
{
"epoch": 0.6381957773512476,
"grad_norm": 50.401385190215464,
"learning_rate": 1.7426073760197406e-07,
"logits/chosen": 0.10545764863491058,
"logits/rejected": 0.012745514512062073,
"logps/chosen": -432.08685302734375,
"logps/rejected": -608.319091796875,
"loss": 0.4988,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.235762119293213,
"rewards/margins": 1.4522688388824463,
"rewards/rejected": -2.688030958175659,
"step": 2660
},
{
"epoch": 0.6405950095969289,
"grad_norm": 45.71956495654583,
"learning_rate": 1.7226798209891935e-07,
"logits/chosen": 0.20430830121040344,
"logits/rejected": 0.2912927269935608,
"logps/chosen": -453.98944091796875,
"logps/rejected": -510.1253967285156,
"loss": 0.4619,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.4355871677398682,
"rewards/margins": 1.2053136825561523,
"rewards/rejected": -2.6409008502960205,
"step": 2670
},
{
"epoch": 0.6429942418426103,
"grad_norm": 45.15170026766255,
"learning_rate": 1.7028067918809535e-07,
"logits/chosen": 0.3014266788959503,
"logits/rejected": 0.22763225436210632,
"logps/chosen": -384.4528503417969,
"logps/rejected": -595.7291870117188,
"loss": 0.4945,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.171205997467041,
"rewards/margins": 1.6346750259399414,
"rewards/rejected": -2.8058810234069824,
"step": 2680
},
{
"epoch": 0.6453934740882917,
"grad_norm": 64.83143640863342,
"learning_rate": 1.6829896827090584e-07,
"logits/chosen": 0.21800704300403595,
"logits/rejected": 0.220147043466568,
"logps/chosen": -443.7188415527344,
"logps/rejected": -480.36907958984375,
"loss": 0.5184,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3957051038742065,
"rewards/margins": 0.6850441694259644,
"rewards/rejected": -2.080749273300171,
"step": 2690
},
{
"epoch": 0.6477927063339731,
"grad_norm": 37.274772223125495,
"learning_rate": 1.6632298835649844e-07,
"logits/chosen": 0.3046364486217499,
"logits/rejected": 0.18461188673973083,
"logps/chosen": -469.7455139160156,
"logps/rejected": -623.6341552734375,
"loss": 0.477,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3073039054870605,
"rewards/margins": 1.1427420377731323,
"rewards/rejected": -2.4500460624694824,
"step": 2700
},
{
"epoch": 0.6501919385796545,
"grad_norm": 91.33316289592031,
"learning_rate": 1.6435287805201364e-07,
"logits/chosen": 0.46277904510498047,
"logits/rejected": 0.40250563621520996,
"logps/chosen": -462.3423767089844,
"logps/rejected": -543.9591064453125,
"loss": 0.5314,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.518293023109436,
"rewards/margins": 0.8617793321609497,
"rewards/rejected": -2.3800723552703857,
"step": 2710
},
{
"epoch": 0.6525911708253359,
"grad_norm": 44.87565160003366,
"learning_rate": 1.6238877555286207e-07,
"logits/chosen": 0.35751184821128845,
"logits/rejected": 0.29743391275405884,
"logps/chosen": -469.69561767578125,
"logps/rejected": -606.2034301757812,
"loss": 0.4479,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2520592212677002,
"rewards/margins": 1.2662980556488037,
"rewards/rejected": -2.518357276916504,
"step": 2720
},
{
"epoch": 0.6549904030710173,
"grad_norm": 44.32240198316999,
"learning_rate": 1.60430818633031e-07,
"logits/chosen": 0.16691644489765167,
"logits/rejected": 0.14331945776939392,
"logps/chosen": -449.48876953125,
"logps/rejected": -561.8910522460938,
"loss": 0.4539,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.131084680557251,
"rewards/margins": 1.254732370376587,
"rewards/rejected": -2.385816812515259,
"step": 2730
},
{
"epoch": 0.6573896353166987,
"grad_norm": 39.01641038215159,
"learning_rate": 1.5847914463541939e-07,
"logits/chosen": 0.3676902651786804,
"logits/rejected": 0.34273606538772583,
"logps/chosen": -374.8681640625,
"logps/rejected": -478.32330322265625,
"loss": 0.4745,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1049184799194336,
"rewards/margins": 0.8594606518745422,
"rewards/rejected": -1.964379072189331,
"step": 2740
},
{
"epoch": 0.6597888675623801,
"grad_norm": 35.89167519955917,
"learning_rate": 1.5653389046220427e-07,
"logits/chosen": 0.3571329414844513,
"logits/rejected": 0.27262359857559204,
"logps/chosen": -399.9897155761719,
"logps/rejected": -535.5849609375,
"loss": 0.4737,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.1511871814727783,
"rewards/margins": 1.0949671268463135,
"rewards/rejected": -2.246154308319092,
"step": 2750
},
{
"epoch": 0.6621880998080614,
"grad_norm": 74.88730166916955,
"learning_rate": 1.545951925652375e-07,
"logits/chosen": 0.3250289559364319,
"logits/rejected": 0.39422863721847534,
"logps/chosen": -507.34735107421875,
"logps/rejected": -564.6778564453125,
"loss": 0.4953,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2604314088821411,
"rewards/margins": 1.2383835315704346,
"rewards/rejected": -2.4988150596618652,
"step": 2760
},
{
"epoch": 0.6645873320537428,
"grad_norm": 43.31844151941509,
"learning_rate": 1.5266318693647423e-07,
"logits/chosen": 0.38096925616264343,
"logits/rejected": 0.4018251299858093,
"logps/chosen": -460.6954040527344,
"logps/rejected": -567.9718017578125,
"loss": 0.4693,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2558958530426025,
"rewards/margins": 1.0993343591690063,
"rewards/rejected": -2.3552298545837402,
"step": 2770
},
{
"epoch": 0.6669865642994242,
"grad_norm": 72.85191786475721,
"learning_rate": 1.5073800909843353e-07,
"logits/chosen": 0.25220975279808044,
"logits/rejected": 0.3646177649497986,
"logps/chosen": -450.23193359375,
"logps/rejected": -503.99127197265625,
"loss": 0.4707,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2164779901504517,
"rewards/margins": 1.0934855937957764,
"rewards/rejected": -2.3099634647369385,
"step": 2780
},
{
"epoch": 0.6693857965451055,
"grad_norm": 63.40322968247712,
"learning_rate": 1.488197940946922e-07,
"logits/chosen": 0.23376190662384033,
"logits/rejected": 0.22447574138641357,
"logps/chosen": -456.8228454589844,
"logps/rejected": -523.5567626953125,
"loss": 0.4689,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1302400827407837,
"rewards/margins": 1.2681411504745483,
"rewards/rejected": -2.398381233215332,
"step": 2790
},
{
"epoch": 0.6717850287907869,
"grad_norm": 66.69199487516,
"learning_rate": 1.4690867648041167e-07,
"logits/chosen": 0.16230645775794983,
"logits/rejected": 0.1882302314043045,
"logps/chosen": -434.5381774902344,
"logps/rejected": -552.4396362304688,
"loss": 0.5021,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.1278281211853027,
"rewards/margins": 1.5061180591583252,
"rewards/rejected": -2.633945941925049,
"step": 2800
},
{
"epoch": 0.6741842610364683,
"grad_norm": 46.91530215907862,
"learning_rate": 1.4500479031289987e-07,
"logits/chosen": 0.15237310528755188,
"logits/rejected": 0.1518753319978714,
"logps/chosen": -466.0179138183594,
"logps/rejected": -572.3275146484375,
"loss": 0.517,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2266777753829956,
"rewards/margins": 1.2364604473114014,
"rewards/rejected": -2.4631385803222656,
"step": 2810
},
{
"epoch": 0.6765834932821497,
"grad_norm": 55.81983630093274,
"learning_rate": 1.4310826914220747e-07,
"logits/chosen": 0.17195823788642883,
"logits/rejected": 0.16844519972801208,
"logps/chosen": -536.1735229492188,
"logps/rejected": -609.2791748046875,
"loss": 0.5282,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.5774872303009033,
"rewards/margins": 0.9089029431343079,
"rewards/rejected": -2.4863903522491455,
"step": 2820
},
{
"epoch": 0.6789827255278311,
"grad_norm": 53.80796549341863,
"learning_rate": 1.412192460017597e-07,
"logits/chosen": 0.1955575793981552,
"logits/rejected": 0.12785163521766663,
"logps/chosen": -444.7312927246094,
"logps/rejected": -568.8245849609375,
"loss": 0.5024,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.395450472831726,
"rewards/margins": 1.22873055934906,
"rewards/rejected": -2.624181032180786,
"step": 2830
},
{
"epoch": 0.6813819577735125,
"grad_norm": 44.72827252256254,
"learning_rate": 1.3933785339902504e-07,
"logits/chosen": 0.27861329913139343,
"logits/rejected": 0.13766932487487793,
"logps/chosen": -376.69805908203125,
"logps/rejected": -530.3146362304688,
"loss": 0.5003,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1893621683120728,
"rewards/margins": 1.1048251390457153,
"rewards/rejected": -2.294187307357788,
"step": 2840
},
{
"epoch": 0.6837811900191939,
"grad_norm": 38.90141505727746,
"learning_rate": 1.374642233062197e-07,
"logits/chosen": 0.1925538331270218,
"logits/rejected": 0.17995335161685944,
"logps/chosen": -486.0704650878906,
"logps/rejected": -545.6419067382812,
"loss": 0.5175,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2210705280303955,
"rewards/margins": 1.031243085861206,
"rewards/rejected": -2.2523136138916016,
"step": 2850
},
{
"epoch": 0.6861804222648752,
"grad_norm": 38.934538649501114,
"learning_rate": 1.355984871510511e-07,
"logits/chosen": 0.24185729026794434,
"logits/rejected": 0.16981028020381927,
"logps/chosen": -488.8545837402344,
"logps/rejected": -586.666015625,
"loss": 0.4586,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2682818174362183,
"rewards/margins": 0.9824774861335754,
"rewards/rejected": -2.2507593631744385,
"step": 2860
},
{
"epoch": 0.6885796545105566,
"grad_norm": 45.29750933331666,
"learning_rate": 1.3374077580749783e-07,
"logits/chosen": 0.29279276728630066,
"logits/rejected": 0.1869848519563675,
"logps/chosen": -351.5196838378906,
"logps/rejected": -475.97308349609375,
"loss": 0.5016,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.038088321685791,
"rewards/margins": 1.0932366847991943,
"rewards/rejected": -2.1313250064849854,
"step": 2870
},
{
"epoch": 0.690978886756238,
"grad_norm": 48.530711516673115,
"learning_rate": 1.3189121958663024e-07,
"logits/chosen": 0.1910950392484665,
"logits/rejected": 0.2789291739463806,
"logps/chosen": -532.8553466796875,
"logps/rejected": -549.0731201171875,
"loss": 0.502,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.6616315841674805,
"rewards/margins": 0.5708137154579163,
"rewards/rejected": -2.232445240020752,
"step": 2880
},
{
"epoch": 0.6933781190019194,
"grad_norm": 49.658508332103274,
"learning_rate": 1.3004994822746895e-07,
"logits/chosen": 0.08187554031610489,
"logits/rejected": 0.053650178015232086,
"logps/chosen": -428.33087158203125,
"logps/rejected": -530.8117065429688,
"loss": 0.5162,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.238884687423706,
"rewards/margins": 0.9637983441352844,
"rewards/rejected": -2.2026829719543457,
"step": 2890
},
{
"epoch": 0.6957773512476008,
"grad_norm": 49.423150125943955,
"learning_rate": 1.2821709088788434e-07,
"logits/chosen": 0.2585221827030182,
"logits/rejected": 0.17918451130390167,
"logps/chosen": -400.4766845703125,
"logps/rejected": -514.9324951171875,
"loss": 0.5073,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2878179550170898,
"rewards/margins": 1.140520691871643,
"rewards/rejected": -2.4283385276794434,
"step": 2900
},
{
"epoch": 0.6981765834932822,
"grad_norm": 59.81963849634542,
"learning_rate": 1.2639277613553736e-07,
"logits/chosen": 0.39327603578567505,
"logits/rejected": 0.3338584899902344,
"logps/chosen": -380.1231994628906,
"logps/rejected": -466.969482421875,
"loss": 0.4731,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2669470310211182,
"rewards/margins": 0.8520339727401733,
"rewards/rejected": -2.11898136138916,
"step": 2910
},
{
"epoch": 0.7005758157389635,
"grad_norm": 47.91120083091996,
"learning_rate": 1.2457713193885975e-07,
"logits/chosen": 0.23712964355945587,
"logits/rejected": 0.10274624824523926,
"logps/chosen": -363.88116455078125,
"logps/rejected": -501.58477783203125,
"loss": 0.4744,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3259981870651245,
"rewards/margins": 1.0507750511169434,
"rewards/rejected": -2.3767733573913574,
"step": 2920
},
{
"epoch": 0.7029750479846449,
"grad_norm": 56.163566511516365,
"learning_rate": 1.2277028565807838e-07,
"logits/chosen": 0.2799941599369049,
"logits/rejected": 0.2706086039543152,
"logps/chosen": -432.5113220214844,
"logps/rejected": -512.887451171875,
"loss": 0.5009,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1611391305923462,
"rewards/margins": 0.9016637802124023,
"rewards/rejected": -2.062802791595459,
"step": 2930
},
{
"epoch": 0.7053742802303263,
"grad_norm": 62.43277712323061,
"learning_rate": 1.209723640362815e-07,
"logits/chosen": 0.16554930806159973,
"logits/rejected": 0.1359563171863556,
"logps/chosen": -462.38568115234375,
"logps/rejected": -574.197998046875,
"loss": 0.5577,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3521636724472046,
"rewards/margins": 1.263946294784546,
"rewards/rejected": -2.616110324859619,
"step": 2940
},
{
"epoch": 0.7077735124760077,
"grad_norm": 33.88826120125574,
"learning_rate": 1.191834931905277e-07,
"logits/chosen": 0.20565947890281677,
"logits/rejected": 0.13917942345142365,
"logps/chosen": -520.4049072265625,
"logps/rejected": -632.361328125,
"loss": 0.4599,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5277377367019653,
"rewards/margins": 1.1449778079986572,
"rewards/rejected": -2.672715425491333,
"step": 2950
},
{
"epoch": 0.710172744721689,
"grad_norm": 45.009587506259074,
"learning_rate": 1.1740379860299988e-07,
"logits/chosen": 0.2947765588760376,
"logits/rejected": 0.23910513520240784,
"logps/chosen": -472.980712890625,
"logps/rejected": -579.03125,
"loss": 0.497,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3068562746047974,
"rewards/margins": 0.9038209915161133,
"rewards/rejected": -2.210677146911621,
"step": 2960
},
{
"epoch": 0.7125719769673704,
"grad_norm": 47.44511342924861,
"learning_rate": 1.1563340511220254e-07,
"logits/chosen": 0.2019500434398651,
"logits/rejected": 0.2147335559129715,
"logps/chosen": -510.0350646972656,
"logps/rejected": -596.2499389648438,
"loss": 0.5062,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.336971402168274,
"rewards/margins": 1.1666083335876465,
"rewards/rejected": -2.503579616546631,
"step": 2970
},
{
"epoch": 0.7149712092130518,
"grad_norm": 42.57941151152834,
"learning_rate": 1.1387243690420556e-07,
"logits/chosen": 0.23384490609169006,
"logits/rejected": 0.20733702182769775,
"logps/chosen": -481.803955078125,
"logps/rejected": -632.8770751953125,
"loss": 0.4655,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2207635641098022,
"rewards/margins": 1.6195507049560547,
"rewards/rejected": -2.8403146266937256,
"step": 2980
},
{
"epoch": 0.7173704414587332,
"grad_norm": 64.95455682456064,
"learning_rate": 1.1212101750393235e-07,
"logits/chosen": 0.3023291528224945,
"logits/rejected": 0.30834710597991943,
"logps/chosen": -450.0244140625,
"logps/rejected": -551.0206298828125,
"loss": 0.4357,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.409967303276062,
"rewards/margins": 1.2566007375717163,
"rewards/rejected": -2.666567802429199,
"step": 2990
},
{
"epoch": 0.7197696737044146,
"grad_norm": 46.170115289110555,
"learning_rate": 1.1037926976649562e-07,
"logits/chosen": 0.22152157127857208,
"logits/rejected": 0.16806095838546753,
"logps/chosen": -476.97320556640625,
"logps/rejected": -616.3040161132812,
"loss": 0.5408,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4931066036224365,
"rewards/margins": 1.1700246334075928,
"rewards/rejected": -2.6631312370300293,
"step": 3000
},
{
"epoch": 0.722168905950096,
"grad_norm": 57.89732258915135,
"learning_rate": 1.0864731586857936e-07,
"logits/chosen": 0.3043791949748993,
"logits/rejected": 0.36210864782333374,
"logps/chosen": -495.12164306640625,
"logps/rejected": -574.9592895507812,
"loss": 0.46,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4336802959442139,
"rewards/margins": 1.215421199798584,
"rewards/rejected": -2.649101495742798,
"step": 3010
},
{
"epoch": 0.7245681381957774,
"grad_norm": 49.111012867250984,
"learning_rate": 1.0692527729986839e-07,
"logits/chosen": 0.11315940320491791,
"logits/rejected": 0.11848314106464386,
"logps/chosen": -460.1648864746094,
"logps/rejected": -543.1566162109375,
"loss": 0.4285,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2527508735656738,
"rewards/margins": 1.067068338394165,
"rewards/rejected": -2.3198189735412598,
"step": 3020
},
{
"epoch": 0.7269673704414588,
"grad_norm": 57.35606077595889,
"learning_rate": 1.0521327485452692e-07,
"logits/chosen": 0.347392201423645,
"logits/rejected": 0.3210673928260803,
"logps/chosen": -450.1835021972656,
"logps/rejected": -524.6959228515625,
"loss": 0.4912,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2702230215072632,
"rewards/margins": 1.0865848064422607,
"rewards/rejected": -2.3568077087402344,
"step": 3030
},
{
"epoch": 0.7293666026871402,
"grad_norm": 56.3165468310005,
"learning_rate": 1.0351142862272468e-07,
"logits/chosen": 0.209666445851326,
"logits/rejected": 0.20282307267189026,
"logps/chosen": -423.931396484375,
"logps/rejected": -572.3831176757812,
"loss": 0.4984,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4169180393218994,
"rewards/margins": 1.6322886943817139,
"rewards/rejected": -3.049206256866455,
"step": 3040
},
{
"epoch": 0.7317658349328215,
"grad_norm": 47.57484794011745,
"learning_rate": 1.0181985798221343e-07,
"logits/chosen": 0.3013080060482025,
"logits/rejected": 0.2218068540096283,
"logps/chosen": -470.23480224609375,
"logps/rejected": -589.8034057617188,
"loss": 0.5126,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3861111402511597,
"rewards/margins": 1.0853662490844727,
"rewards/rejected": -2.4714770317077637,
"step": 3050
},
{
"epoch": 0.7341650671785028,
"grad_norm": 48.06894623911944,
"learning_rate": 1.0013868158995329e-07,
"logits/chosen": 0.3860154449939728,
"logits/rejected": 0.3630084991455078,
"logps/chosen": -442.2177734375,
"logps/rejected": -527.9283447265625,
"loss": 0.4642,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.2381409406661987,
"rewards/margins": 1.22637939453125,
"rewards/rejected": -2.4645204544067383,
"step": 3060
},
{
"epoch": 0.7365642994241842,
"grad_norm": 51.169486765513234,
"learning_rate": 9.84680173737887e-08,
"logits/chosen": 0.2769750952720642,
"logits/rejected": 0.2646028995513916,
"logps/chosen": -475.0011291503906,
"logps/rejected": -548.9890747070312,
"loss": 0.4765,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4719974994659424,
"rewards/margins": 1.0918984413146973,
"rewards/rejected": -2.5638959407806396,
"step": 3070
},
{
"epoch": 0.7389635316698656,
"grad_norm": 58.59714970661162,
"learning_rate": 9.680798252417713e-08,
"logits/chosen": 0.2717548906803131,
"logits/rejected": 0.2305576503276825,
"logps/chosen": -379.20928955078125,
"logps/rejected": -515.2022094726562,
"loss": 0.492,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.101180076599121,
"rewards/margins": 0.944907009601593,
"rewards/rejected": -2.0460872650146484,
"step": 3080
},
{
"epoch": 0.741362763915547,
"grad_norm": 61.65719550385752,
"learning_rate": 9.515869348596808e-08,
"logits/chosen": 0.11913663148880005,
"logits/rejected": 0.09342759847640991,
"logps/chosen": -497.87109375,
"logps/rejected": -598.5771484375,
"loss": 0.4878,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4787302017211914,
"rewards/margins": 1.3738664388656616,
"rewards/rejected": -2.8525967597961426,
"step": 3090
},
{
"epoch": 0.7437619961612284,
"grad_norm": 41.218790972775324,
"learning_rate": 9.352026595023493e-08,
"logits/chosen": 0.10429096221923828,
"logits/rejected": 0.1541799008846283,
"logps/chosen": -517.5309448242188,
"logps/rejected": -543.2216796875,
"loss": 0.4985,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.4588356018066406,
"rewards/margins": 0.6233514547348022,
"rewards/rejected": -2.0821871757507324,
"step": 3100
},
{
"epoch": 0.7461612284069098,
"grad_norm": 64.90615052640716,
"learning_rate": 9.189281484616004e-08,
"logits/chosen": 0.22654108703136444,
"logits/rejected": 0.1651889979839325,
"logps/chosen": -402.1455383300781,
"logps/rejected": -556.6002807617188,
"loss": 0.5169,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.490733027458191,
"rewards/margins": 0.9633838534355164,
"rewards/rejected": -2.4541170597076416,
"step": 3110
},
{
"epoch": 0.7485604606525912,
"grad_norm": 55.8925921234848,
"learning_rate": 9.027645433297249e-08,
"logits/chosen": 0.11542461812496185,
"logits/rejected": 0.17937800288200378,
"logps/chosen": -566.5679321289062,
"logps/rejected": -637.2242431640625,
"loss": 0.5183,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.827355146408081,
"rewards/margins": 1.0894877910614014,
"rewards/rejected": -2.9168429374694824,
"step": 3120
},
{
"epoch": 0.7509596928982726,
"grad_norm": 54.00400306277147,
"learning_rate": 8.867129779194066e-08,
"logits/chosen": 0.16981378197669983,
"logits/rejected": 0.16173888742923737,
"logps/chosen": -371.0014953613281,
"logps/rejected": -522.0635986328125,
"loss": 0.4864,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -0.9543946981430054,
"rewards/margins": 1.564900517463684,
"rewards/rejected": -2.5192952156066895,
"step": 3130
},
{
"epoch": 0.753358925143954,
"grad_norm": 54.599194100775584,
"learning_rate": 8.707745781841866e-08,
"logits/chosen": 0.14470471441745758,
"logits/rejected": 0.1468985676765442,
"logps/chosen": -400.3184509277344,
"logps/rejected": -513.9260864257812,
"loss": 0.5147,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.221286416053772,
"rewards/margins": 1.1234190464019775,
"rewards/rejected": -2.344705581665039,
"step": 3140
},
{
"epoch": 0.7557581573896354,
"grad_norm": 34.75601738944086,
"learning_rate": 8.549504621394831e-08,
"logits/chosen": 0.15695925056934357,
"logits/rejected": 0.14711011946201324,
"logps/chosen": -413.1109313964844,
"logps/rejected": -539.1842041015625,
"loss": 0.427,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.069771409034729,
"rewards/margins": 1.3148638010025024,
"rewards/rejected": -2.3846354484558105,
"step": 3150
},
{
"epoch": 0.7581573896353166,
"grad_norm": 53.105874033112755,
"learning_rate": 8.392417397841703e-08,
"logits/chosen": 0.26591944694519043,
"logits/rejected": 0.26246827840805054,
"logps/chosen": -447.65557861328125,
"logps/rejected": -544.1351318359375,
"loss": 0.4888,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2605499029159546,
"rewards/margins": 0.8398078083992004,
"rewards/rejected": -2.1003577709198,
"step": 3160
},
{
"epoch": 0.760556621880998,
"grad_norm": 47.78743297051488,
"learning_rate": 8.236495130227083e-08,
"logits/chosen": 0.24016205966472626,
"logits/rejected": 0.31962883472442627,
"logps/chosen": -456.42724609375,
"logps/rejected": -570.5203247070312,
"loss": 0.4993,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.1726109981536865,
"rewards/margins": 1.4965537786483765,
"rewards/rejected": -2.6691648960113525,
"step": 3170
},
{
"epoch": 0.7629558541266794,
"grad_norm": 47.980852069933555,
"learning_rate": 8.081748755878612e-08,
"logits/chosen": 0.2495994120836258,
"logits/rejected": 0.2759885787963867,
"logps/chosen": -469.076171875,
"logps/rejected": -524.9566040039062,
"loss": 0.4968,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3896160125732422,
"rewards/margins": 0.9030616879463196,
"rewards/rejected": -2.292677879333496,
"step": 3180
},
{
"epoch": 0.7653550863723608,
"grad_norm": 44.55472031419905,
"learning_rate": 7.928189129639632e-08,
"logits/chosen": 0.2707396149635315,
"logits/rejected": 0.21236738562583923,
"logps/chosen": -433.556884765625,
"logps/rejected": -539.517578125,
"loss": 0.4522,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2524731159210205,
"rewards/margins": 0.9954684972763062,
"rewards/rejected": -2.247941493988037,
"step": 3190
},
{
"epoch": 0.7677543186180422,
"grad_norm": 77.95104725852434,
"learning_rate": 7.775827023107834e-08,
"logits/chosen": 0.18351641297340393,
"logits/rejected": 0.18833932280540466,
"logps/chosen": -446.3948669433594,
"logps/rejected": -545.6039428710938,
"loss": 0.5189,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3583651781082153,
"rewards/margins": 0.7764785289764404,
"rewards/rejected": -2.1348438262939453,
"step": 3200
},
{
"epoch": 0.7701535508637236,
"grad_norm": 60.91165565345474,
"learning_rate": 7.624673123879682e-08,
"logits/chosen": 0.03742004930973053,
"logits/rejected": 0.07750044018030167,
"logps/chosen": -426.2344665527344,
"logps/rejected": -510.4020080566406,
"loss": 0.5048,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3010095357894897,
"rewards/margins": 0.9792767763137817,
"rewards/rejected": -2.2802863121032715,
"step": 3210
},
{
"epoch": 0.772552783109405,
"grad_norm": 43.5674600823645,
"learning_rate": 7.474738034800663e-08,
"logits/chosen": 0.13723036646842957,
"logits/rejected": 0.04767593368887901,
"logps/chosen": -369.3411560058594,
"logps/rejected": -486.69439697265625,
"loss": 0.4954,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0865916013717651,
"rewards/margins": 1.3701813220977783,
"rewards/rejected": -2.456772804260254,
"step": 3220
},
{
"epoch": 0.7749520153550864,
"grad_norm": 65.92599613926842,
"learning_rate": 7.326032273221606e-08,
"logits/chosen": 0.23154711723327637,
"logits/rejected": 0.1886422336101532,
"logps/chosen": -478.3605041503906,
"logps/rejected": -570.8961791992188,
"loss": 0.4831,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2519872188568115,
"rewards/margins": 1.1383593082427979,
"rewards/rejected": -2.3903465270996094,
"step": 3230
},
{
"epoch": 0.7773512476007678,
"grad_norm": 45.83026398609644,
"learning_rate": 7.178566270260872e-08,
"logits/chosen": 0.31105470657348633,
"logits/rejected": 0.22554393112659454,
"logps/chosen": -447.1808166503906,
"logps/rejected": -576.032958984375,
"loss": 0.5129,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2397021055221558,
"rewards/margins": 0.951197624206543,
"rewards/rejected": -2.190899610519409,
"step": 3240
},
{
"epoch": 0.7797504798464492,
"grad_norm": 50.507648401741996,
"learning_rate": 7.032350370072709e-08,
"logits/chosen": 0.19485214352607727,
"logits/rejected": 0.18930187821388245,
"logps/chosen": -456.4967346191406,
"logps/rejected": -569.6973266601562,
"loss": 0.4481,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.2981529235839844,
"rewards/margins": 1.2217051982879639,
"rewards/rejected": -2.5198581218719482,
"step": 3250
},
{
"epoch": 0.7821497120921305,
"grad_norm": 40.68087545077646,
"learning_rate": 6.887394829121596e-08,
"logits/chosen": 0.2527236044406891,
"logits/rejected": 0.20923948287963867,
"logps/chosen": -455.47454833984375,
"logps/rejected": -632.2030029296875,
"loss": 0.4543,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.2574024200439453,
"rewards/margins": 1.9225542545318604,
"rewards/rejected": -3.1799566745758057,
"step": 3260
},
{
"epoch": 0.7845489443378119,
"grad_norm": 37.54734198368332,
"learning_rate": 6.743709815462833e-08,
"logits/chosen": 0.10011599957942963,
"logits/rejected": 0.1131478101015091,
"logps/chosen": -462.3744201660156,
"logps/rejected": -519.977783203125,
"loss": 0.4665,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3748726844787598,
"rewards/margins": 0.9147384762763977,
"rewards/rejected": -2.2896108627319336,
"step": 3270
},
{
"epoch": 0.7869481765834933,
"grad_norm": 51.364098272276145,
"learning_rate": 6.601305408029287e-08,
"logits/chosen": 0.41624197363853455,
"logits/rejected": 0.4190692901611328,
"logps/chosen": -458.64141845703125,
"logps/rejected": -567.7257080078125,
"loss": 0.4664,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5802090167999268,
"rewards/margins": 1.1085751056671143,
"rewards/rejected": -2.688784122467041,
"step": 3280
},
{
"epoch": 0.7893474088291746,
"grad_norm": 48.68347118403701,
"learning_rate": 6.460191595924366e-08,
"logits/chosen": 0.23670163750648499,
"logits/rejected": 0.21305176615715027,
"logps/chosen": -472.20654296875,
"logps/rejected": -575.2575073242188,
"loss": 0.4741,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4002991914749146,
"rewards/margins": 1.0437225103378296,
"rewards/rejected": -2.444021701812744,
"step": 3290
},
{
"epoch": 0.791746641074856,
"grad_norm": 56.29885219772071,
"learning_rate": 6.320378277721342e-08,
"logits/chosen": 0.3236589729785919,
"logits/rejected": 0.2942892014980316,
"logps/chosen": -485.74609375,
"logps/rejected": -548.2264404296875,
"loss": 0.4937,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.748716950416565,
"rewards/margins": 0.7379652261734009,
"rewards/rejected": -2.4866819381713867,
"step": 3300
},
{
"epoch": 0.7941458733205374,
"grad_norm": 47.30513911873481,
"learning_rate": 6.181875260769032e-08,
"logits/chosen": 0.21434447169303894,
"logits/rejected": 0.29501864314079285,
"logps/chosen": -473.3141174316406,
"logps/rejected": -513.0935668945312,
"loss": 0.4824,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.042864441871643,
"rewards/margins": 1.146907925605774,
"rewards/rejected": -2.189772129058838,
"step": 3310
},
{
"epoch": 0.7965451055662188,
"grad_norm": 43.836702306292864,
"learning_rate": 6.044692260503797e-08,
"logits/chosen": 0.2978779673576355,
"logits/rejected": 0.2920413911342621,
"logps/chosen": -517.2478637695312,
"logps/rejected": -626.5977172851562,
"loss": 0.4366,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4270732402801514,
"rewards/margins": 1.378542184829712,
"rewards/rejected": -2.805615186691284,
"step": 3320
},
{
"epoch": 0.7989443378119002,
"grad_norm": 49.783712311366116,
"learning_rate": 5.9088388997680984e-08,
"logits/chosen": 0.15503938496112823,
"logits/rejected": 0.19135913252830505,
"logps/chosen": -540.2918090820312,
"logps/rejected": -589.90185546875,
"loss": 0.4691,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.3969385623931885,
"rewards/margins": 1.2311842441558838,
"rewards/rejected": -2.6281230449676514,
"step": 3330
},
{
"epoch": 0.8013435700575816,
"grad_norm": 52.57420620553349,
"learning_rate": 5.774324708135439e-08,
"logits/chosen": 0.2751420736312866,
"logits/rejected": 0.28755050897598267,
"logps/chosen": -397.3004150390625,
"logps/rejected": -484.384521484375,
"loss": 0.4858,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2007606029510498,
"rewards/margins": 1.0477242469787598,
"rewards/rejected": -2.2484848499298096,
"step": 3340
},
{
"epoch": 0.803742802303263,
"grad_norm": 40.1972577695682,
"learning_rate": 5.641159121241953e-08,
"logits/chosen": 0.32921257615089417,
"logits/rejected": 0.24844393134117126,
"logps/chosen": -387.3114013671875,
"logps/rejected": -536.5883178710938,
"loss": 0.4903,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1569197177886963,
"rewards/margins": 1.0728175640106201,
"rewards/rejected": -2.2297370433807373,
"step": 3350
},
{
"epoch": 0.8061420345489443,
"grad_norm": 41.69598167340838,
"learning_rate": 5.5093514801245106e-08,
"logits/chosen": 0.3076106905937195,
"logits/rejected": 0.2400285303592682,
"logps/chosen": -443.32000732421875,
"logps/rejected": -577.48388671875,
"loss": 0.4817,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3214082717895508,
"rewards/margins": 0.9961814880371094,
"rewards/rejected": -2.317589521408081,
"step": 3360
},
{
"epoch": 0.8085412667946257,
"grad_norm": 38.422027437084395,
"learning_rate": 5.378911030565453e-08,
"logits/chosen": 0.3213488757610321,
"logits/rejected": 0.26428383588790894,
"logps/chosen": -506.6258239746094,
"logps/rejected": -639.1539916992188,
"loss": 0.4834,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5233229398727417,
"rewards/margins": 1.1094070672988892,
"rewards/rejected": -2.6327297687530518,
"step": 3370
},
{
"epoch": 0.8109404990403071,
"grad_norm": 44.33236145563771,
"learning_rate": 5.249846922444101e-08,
"logits/chosen": 0.3445442318916321,
"logits/rejected": 0.2675052285194397,
"logps/chosen": -402.5491638183594,
"logps/rejected": -543.9547729492188,
"loss": 0.462,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3842418193817139,
"rewards/margins": 1.5147311687469482,
"rewards/rejected": -2.898972988128662,
"step": 3380
},
{
"epoch": 0.8133397312859885,
"grad_norm": 58.63982281658398,
"learning_rate": 5.122168209094865e-08,
"logits/chosen": 0.38930395245552063,
"logits/rejected": 0.36614999175071716,
"logps/chosen": -429.3528747558594,
"logps/rejected": -487.43408203125,
"loss": 0.4705,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.5928863286972046,
"rewards/margins": 0.6625052094459534,
"rewards/rejected": -2.2553915977478027,
"step": 3390
},
{
"epoch": 0.8157389635316699,
"grad_norm": 41.613843164350314,
"learning_rate": 4.995883846672222e-08,
"logits/chosen": 0.14363157749176025,
"logits/rejected": 0.2796134054660797,
"logps/chosen": -592.6302490234375,
"logps/rejected": -587.6798706054688,
"loss": 0.4708,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.5286242961883545,
"rewards/margins": 0.759235143661499,
"rewards/rejected": -2.2878596782684326,
"step": 3400
},
{
"epoch": 0.8181381957773513,
"grad_norm": 49.604272632089646,
"learning_rate": 4.871002693522486e-08,
"logits/chosen": 0.2720317244529724,
"logits/rejected": 0.25077277421951294,
"logps/chosen": -463.8601989746094,
"logps/rejected": -517.4568481445312,
"loss": 0.4856,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.3305190801620483,
"rewards/margins": 0.8297308683395386,
"rewards/rejected": -2.160250186920166,
"step": 3410
},
{
"epoch": 0.8205374280230326,
"grad_norm": 40.03296260410171,
"learning_rate": 4.7475335095623956e-08,
"logits/chosen": 0.34070852398872375,
"logits/rejected": 0.2651143968105316,
"logps/chosen": -466.72686767578125,
"logps/rejected": -563.7396240234375,
"loss": 0.4721,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4763226509094238,
"rewards/margins": 1.2001755237579346,
"rewards/rejected": -2.6764981746673584,
"step": 3420
},
{
"epoch": 0.822936660268714,
"grad_norm": 80.18666057349425,
"learning_rate": 4.6254849556646714e-08,
"logits/chosen": 0.22728531062602997,
"logits/rejected": 0.229964017868042,
"logps/chosen": -496.54852294921875,
"logps/rejected": -587.7725830078125,
"loss": 0.4966,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.4686188697814941,
"rewards/margins": 1.2523690462112427,
"rewards/rejected": -2.7209877967834473,
"step": 3430
},
{
"epoch": 0.8253358925143954,
"grad_norm": 52.794660060456266,
"learning_rate": 4.504865593050483e-08,
"logits/chosen": 0.27111780643463135,
"logits/rejected": 0.2475912868976593,
"logps/chosen": -477.52685546875,
"logps/rejected": -583.2651977539062,
"loss": 0.5015,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5320662260055542,
"rewards/margins": 0.9184083938598633,
"rewards/rejected": -2.450474500656128,
"step": 3440
},
{
"epoch": 0.8277351247600768,
"grad_norm": 63.34169787369902,
"learning_rate": 4.385683882688895e-08,
"logits/chosen": 0.15275821089744568,
"logits/rejected": 0.20857541263103485,
"logps/chosen": -512.3521728515625,
"logps/rejected": -510.0169982910156,
"loss": 0.5622,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6563478708267212,
"rewards/margins": 0.5664867162704468,
"rewards/rejected": -2.222834587097168,
"step": 3450
},
{
"epoch": 0.8301343570057581,
"grad_norm": 59.152646437947276,
"learning_rate": 4.2679481847033985e-08,
"logits/chosen": 0.3345550298690796,
"logits/rejected": 0.3184022009372711,
"logps/chosen": -458.943115234375,
"logps/rejected": -580.5496826171875,
"loss": 0.5152,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.367032527923584,
"rewards/margins": 1.1215214729309082,
"rewards/rejected": -2.488554000854492,
"step": 3460
},
{
"epoch": 0.8325335892514395,
"grad_norm": 41.522970046635024,
"learning_rate": 4.151666757785435e-08,
"logits/chosen": 0.25053077936172485,
"logits/rejected": 0.21285638213157654,
"logps/chosen": -415.4532165527344,
"logps/rejected": -565.3435668945312,
"loss": 0.467,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.0187116861343384,
"rewards/margins": 1.5517404079437256,
"rewards/rejected": -2.5704522132873535,
"step": 3470
},
{
"epoch": 0.8349328214971209,
"grad_norm": 50.23472101986963,
"learning_rate": 4.036847758615136e-08,
"logits/chosen": 0.23763033747673035,
"logits/rejected": 0.23918600380420685,
"logps/chosen": -477.56292724609375,
"logps/rejected": -576.02490234375,
"loss": 0.5032,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.8143908977508545,
"rewards/margins": 0.8722183108329773,
"rewards/rejected": -2.6866097450256348,
"step": 3480
},
{
"epoch": 0.8373320537428023,
"grad_norm": 45.74910365878837,
"learning_rate": 3.923499241289113e-08,
"logits/chosen": 0.160926952958107,
"logits/rejected": 0.19261090457439423,
"logps/chosen": -533.9952392578125,
"logps/rejected": -552.327880859375,
"loss": 0.5377,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.6078903675079346,
"rewards/margins": 0.8734768033027649,
"rewards/rejected": -2.4813671112060547,
"step": 3490
},
{
"epoch": 0.8397312859884837,
"grad_norm": 47.87346283993082,
"learning_rate": 3.811629156755541e-08,
"logits/chosen": 0.1999920904636383,
"logits/rejected": 0.14960861206054688,
"logps/chosen": -488.1973571777344,
"logps/rejected": -596.3568115234375,
"loss": 0.4956,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.2728978395462036,
"rewards/margins": 1.2190895080566406,
"rewards/rejected": -2.4919872283935547,
"step": 3500
},
{
"epoch": 0.8421305182341651,
"grad_norm": 41.03513231238894,
"learning_rate": 3.701245352256391e-08,
"logits/chosen": 0.2294701635837555,
"logits/rejected": 0.25733810663223267,
"logps/chosen": -478.146484375,
"logps/rejected": -508.96856689453125,
"loss": 0.4831,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1556288003921509,
"rewards/margins": 0.6834120750427246,
"rewards/rejected": -1.839040756225586,
"step": 3510
},
{
"epoch": 0.8445297504798465,
"grad_norm": 40.5674011892533,
"learning_rate": 3.592355570776984e-08,
"logits/chosen": 0.1878044307231903,
"logits/rejected": 0.14977982640266418,
"logps/chosen": -398.69970703125,
"logps/rejected": -515.438232421875,
"loss": 0.4747,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.043157935142517,
"rewards/margins": 1.0835729837417603,
"rewards/rejected": -2.1267309188842773,
"step": 3520
},
{
"epoch": 0.8469289827255279,
"grad_norm": 42.604163064101506,
"learning_rate": 3.484967450502904e-08,
"logits/chosen": 0.3040066361427307,
"logits/rejected": 0.23765726387500763,
"logps/chosen": -383.44561767578125,
"logps/rejected": -547.45703125,
"loss": 0.4804,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.1220704317092896,
"rewards/margins": 1.1863175630569458,
"rewards/rejected": -2.3083879947662354,
"step": 3530
},
{
"epoch": 0.8493282149712092,
"grad_norm": 59.806153925908724,
"learning_rate": 3.3790885242841296e-08,
"logits/chosen": 0.13462401926517487,
"logits/rejected": 0.1024751216173172,
"logps/chosen": -459.56915283203125,
"logps/rejected": -603.9002685546875,
"loss": 0.4678,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.4029033184051514,
"rewards/margins": 1.477888584136963,
"rewards/rejected": -2.8807921409606934,
"step": 3540
},
{
"epoch": 0.8517274472168906,
"grad_norm": 58.73779989635176,
"learning_rate": 3.274726219106677e-08,
"logits/chosen": 0.09248481690883636,
"logits/rejected": 0.07832972705364227,
"logps/chosen": -512.8543701171875,
"logps/rejected": -601.7901611328125,
"loss": 0.4885,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.3987281322479248,
"rewards/margins": 1.0073614120483398,
"rewards/rejected": -2.4060897827148438,
"step": 3550
},
{
"epoch": 0.8541266794625719,
"grad_norm": 47.36729249212975,
"learning_rate": 3.171887855571642e-08,
"logits/chosen": 0.23542580008506775,
"logits/rejected": 0.21016255021095276,
"logps/chosen": -400.10943603515625,
"logps/rejected": -472.996337890625,
"loss": 0.4859,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.2242614030838013,
"rewards/margins": 0.7899783253669739,
"rewards/rejected": -2.014239549636841,
"step": 3560
},
{
"epoch": 0.8565259117082533,
"grad_norm": 51.17436258863895,
"learning_rate": 3.070580647381643e-08,
"logits/chosen": 0.2268662452697754,
"logits/rejected": 0.17909319698810577,
"logps/chosen": -437.57421875,
"logps/rejected": -550.803466796875,
"loss": 0.4995,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3558123111724854,
"rewards/margins": 1.194657802581787,
"rewards/rejected": -2.5504701137542725,
"step": 3570
},
{
"epoch": 0.8589251439539347,
"grad_norm": 48.13040334135157,
"learning_rate": 2.9708117008348576e-08,
"logits/chosen": 0.31328874826431274,
"logits/rejected": 0.3502875864505768,
"logps/chosen": -517.9609985351562,
"logps/rejected": -542.474365234375,
"loss": 0.4897,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4639475345611572,
"rewards/margins": 0.7750081419944763,
"rewards/rejected": -2.2389559745788574,
"step": 3580
},
{
"epoch": 0.8613243761996161,
"grad_norm": 53.220249607806664,
"learning_rate": 2.8725880143264992e-08,
"logits/chosen": 0.21370474994182587,
"logits/rejected": 0.17975714802742004,
"logps/chosen": -469.7068786621094,
"logps/rejected": -589.1580200195312,
"loss": 0.5243,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.5573484897613525,
"rewards/margins": 0.7886762022972107,
"rewards/rejected": -2.346024513244629,
"step": 3590
},
{
"epoch": 0.8637236084452975,
"grad_norm": 67.90288894206734,
"learning_rate": 2.775916477857948e-08,
"logits/chosen": 0.25214099884033203,
"logits/rejected": 0.19312720000743866,
"logps/chosen": -414.92059326171875,
"logps/rejected": -506.6708984375,
"loss": 0.4783,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.330328345298767,
"rewards/margins": 0.9338465929031372,
"rewards/rejected": -2.2641749382019043,
"step": 3600
},
{
"epoch": 0.8661228406909789,
"grad_norm": 59.11695302836589,
"learning_rate": 2.680803872553408e-08,
"logits/chosen": 0.2528062164783478,
"logits/rejected": 0.17121002078056335,
"logps/chosen": -428.0210876464844,
"logps/rejected": -563.393310546875,
"loss": 0.4888,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.2556774616241455,
"rewards/margins": 1.5917272567749023,
"rewards/rejected": -2.8474044799804688,
"step": 3610
},
{
"epoch": 0.8685220729366603,
"grad_norm": 59.52967993062111,
"learning_rate": 2.5872568701842706e-08,
"logits/chosen": 0.32945194840431213,
"logits/rejected": 0.2652639150619507,
"logps/chosen": -392.31329345703125,
"logps/rejected": -495.3326721191406,
"loss": 0.539,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3325417041778564,
"rewards/margins": 0.8470155000686646,
"rewards/rejected": -2.1795573234558105,
"step": 3620
},
{
"epoch": 0.8709213051823417,
"grad_norm": 53.81677624528546,
"learning_rate": 2.495282032701096e-08,
"logits/chosen": 0.15500156581401825,
"logits/rejected": 0.2495473325252533,
"logps/chosen": -334.62774658203125,
"logps/rejected": -434.7308044433594,
"loss": 0.5105,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1157186031341553,
"rewards/margins": 1.1813395023345947,
"rewards/rejected": -2.29705810546875,
"step": 3630
},
{
"epoch": 0.8733205374280231,
"grad_norm": 70.3049018186209,
"learning_rate": 2.4048858117733133e-08,
"logits/chosen": 0.16910839080810547,
"logits/rejected": 0.169979065656662,
"logps/chosen": -436.8203125,
"logps/rejected": -540.2689819335938,
"loss": 0.4643,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.1331539154052734,
"rewards/margins": 1.6183888912200928,
"rewards/rejected": -2.751542568206787,
"step": 3640
},
{
"epoch": 0.8757197696737045,
"grad_norm": 49.87789467243074,
"learning_rate": 2.3160745483366938e-08,
"logits/chosen": 0.23682577908039093,
"logits/rejected": 0.1723048985004425,
"logps/chosen": -431.7490234375,
"logps/rejected": -562.2601318359375,
"loss": 0.4639,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3692805767059326,
"rewards/margins": 1.0158613920211792,
"rewards/rejected": -2.3851418495178223,
"step": 3650
},
{
"epoch": 0.8781190019193857,
"grad_norm": 47.658615941206975,
"learning_rate": 2.2288544721485197e-08,
"logits/chosen": 0.14381949603557587,
"logits/rejected": 0.03533410280942917,
"logps/chosen": -387.8703918457031,
"logps/rejected": -524.4107666015625,
"loss": 0.4813,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.0450143814086914,
"rewards/margins": 1.2733433246612549,
"rewards/rejected": -2.3183577060699463,
"step": 3660
},
{
"epoch": 0.8805182341650671,
"grad_norm": 45.82488725036134,
"learning_rate": 2.1432317013506117e-08,
"logits/chosen": 0.10933347791433334,
"logits/rejected": 0.12345802783966064,
"logps/chosen": -458.113037109375,
"logps/rejected": -490.5302734375,
"loss": 0.5397,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4664695262908936,
"rewards/margins": 0.8134799003601074,
"rewards/rejected": -2.27994966506958,
"step": 3670
},
{
"epoch": 0.8829174664107485,
"grad_norm": 53.31879445002021,
"learning_rate": 2.0592122420401704e-08,
"logits/chosen": 0.22227077186107635,
"logits/rejected": 0.24705934524536133,
"logps/chosen": -430.19537353515625,
"logps/rejected": -503.33050537109375,
"loss": 0.4986,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5509039163589478,
"rewards/margins": 0.648668646812439,
"rewards/rejected": -2.199572801589966,
"step": 3680
},
{
"epoch": 0.8853166986564299,
"grad_norm": 42.381048234129516,
"learning_rate": 1.976801987848459e-08,
"logits/chosen": 0.2069139927625656,
"logits/rejected": 0.16672655940055847,
"logps/chosen": -472.5472106933594,
"logps/rejected": -602.9320068359375,
"loss": 0.4888,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4228280782699585,
"rewards/margins": 1.2275440692901611,
"rewards/rejected": -2.65037202835083,
"step": 3690
},
{
"epoch": 0.8877159309021113,
"grad_norm": 53.40883833426912,
"learning_rate": 1.8960067195273987e-08,
"logits/chosen": 0.22911398112773895,
"logits/rejected": 0.21664564311504364,
"logps/chosen": -400.25030517578125,
"logps/rejected": -505.628173828125,
"loss": 0.5018,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.1031681299209595,
"rewards/margins": 1.1834853887557983,
"rewards/rejected": -2.286653757095337,
"step": 3700
},
{
"epoch": 0.8901151631477927,
"grad_norm": 41.251861300500764,
"learning_rate": 1.816832104544072e-08,
"logits/chosen": 0.30456072092056274,
"logits/rejected": 0.29536372423171997,
"logps/chosen": -486.6631774902344,
"logps/rejected": -542.3421630859375,
"loss": 0.4891,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.6421973705291748,
"rewards/margins": 0.8107506632804871,
"rewards/rejected": -2.4529478549957275,
"step": 3710
},
{
"epoch": 0.8925143953934741,
"grad_norm": 39.96769073144664,
"learning_rate": 1.7392836966831553e-08,
"logits/chosen": 0.20969875156879425,
"logits/rejected": 0.1758739948272705,
"logps/chosen": -437.6871643066406,
"logps/rejected": -546.804443359375,
"loss": 0.4412,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -1.2321717739105225,
"rewards/margins": 1.4391019344329834,
"rewards/rejected": -2.671273708343506,
"step": 3720
},
{
"epoch": 0.8949136276391555,
"grad_norm": 49.70645527143697,
"learning_rate": 1.663366935657373e-08,
"logits/chosen": 0.2884444296360016,
"logits/rejected": 0.3402741551399231,
"logps/chosen": -414.3851623535156,
"logps/rejected": -516.3787841796875,
"loss": 0.5216,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3819457292556763,
"rewards/margins": 0.948479950428009,
"rewards/rejected": -2.33042573928833,
"step": 3730
},
{
"epoch": 0.8973128598848369,
"grad_norm": 77.94886524477812,
"learning_rate": 1.5890871467258898e-08,
"logits/chosen": 0.19290375709533691,
"logits/rejected": 0.21824567019939423,
"logps/chosen": -533.5081787109375,
"logps/rejected": -581.3971557617188,
"loss": 0.4969,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.4534004926681519,
"rewards/margins": 0.8920512199401855,
"rewards/rejected": -2.345451831817627,
"step": 3740
},
{
"epoch": 0.8997120921305183,
"grad_norm": 41.01203397728858,
"learning_rate": 1.5164495403207967e-08,
"logits/chosen": 0.1695217341184616,
"logits/rejected": 0.035564176738262177,
"logps/chosen": -487.5433654785156,
"logps/rejected": -645.5303344726562,
"loss": 0.4676,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4484670162200928,
"rewards/margins": 1.3183784484863281,
"rewards/rejected": -2.766845464706421,
"step": 3750
},
{
"epoch": 0.9021113243761996,
"grad_norm": 40.21247020861889,
"learning_rate": 1.4454592116815962e-08,
"logits/chosen": 0.2717417776584625,
"logits/rejected": 0.2026948183774948,
"logps/chosen": -468.3108825683594,
"logps/rejected": -587.5950927734375,
"loss": 0.4599,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3242508172988892,
"rewards/margins": 1.0534656047821045,
"rewards/rejected": -2.3777167797088623,
"step": 3760
},
{
"epoch": 0.904510556621881,
"grad_norm": 36.5293014274636,
"learning_rate": 1.3761211404977934e-08,
"logits/chosen": 0.21695688366889954,
"logits/rejected": 0.18997912108898163,
"logps/chosen": -414.2664489746094,
"logps/rejected": -547.6370849609375,
"loss": 0.4415,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.205294132232666,
"rewards/margins": 1.4340205192565918,
"rewards/rejected": -2.639314651489258,
"step": 3770
},
{
"epoch": 0.9069097888675623,
"grad_norm": 57.20162040882379,
"learning_rate": 1.3084401905596177e-08,
"logits/chosen": 0.12880149483680725,
"logits/rejected": 0.14128455519676208,
"logps/chosen": -481.64605712890625,
"logps/rejected": -535.501708984375,
"loss": 0.4888,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1130110025405884,
"rewards/margins": 1.1385244131088257,
"rewards/rejected": -2.251535654067993,
"step": 3780
},
{
"epoch": 0.9093090211132437,
"grad_norm": 45.23521207048333,
"learning_rate": 1.2424211094168053e-08,
"logits/chosen": 0.3405439257621765,
"logits/rejected": 0.3810498118400574,
"logps/chosen": -528.0676879882812,
"logps/rejected": -599.46044921875,
"loss": 0.4764,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4664819240570068,
"rewards/margins": 0.8699405789375305,
"rewards/rejected": -2.3364224433898926,
"step": 3790
},
{
"epoch": 0.9117082533589251,
"grad_norm": 42.750810945395436,
"learning_rate": 1.1780685280456143e-08,
"logits/chosen": 0.22092266380786896,
"logits/rejected": 0.1667570322751999,
"logps/chosen": -535.6447143554688,
"logps/rejected": -663.0042724609375,
"loss": 0.5469,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.7906440496444702,
"rewards/margins": 1.2009087800979614,
"rewards/rejected": -2.9915525913238525,
"step": 3800
},
{
"epoch": 0.9141074856046065,
"grad_norm": 45.02882150214674,
"learning_rate": 1.1153869605239564e-08,
"logits/chosen": 0.3357655704021454,
"logits/rejected": 0.39680781960487366,
"logps/chosen": -468.96661376953125,
"logps/rejected": -499.510009765625,
"loss": 0.4881,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1827433109283447,
"rewards/margins": 0.8699227571487427,
"rewards/rejected": -2.052665948867798,
"step": 3810
},
{
"epoch": 0.9165067178502879,
"grad_norm": 53.835458478805826,
"learning_rate": 1.0543808037147606e-08,
"logits/chosen": 0.19844678044319153,
"logits/rejected": 0.09387796372175217,
"logps/chosen": -430.8998107910156,
"logps/rejected": -596.3431396484375,
"loss": 0.4637,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.182877779006958,
"rewards/margins": 1.630902886390686,
"rewards/rejected": -2.8137805461883545,
"step": 3820
},
{
"epoch": 0.9189059500959693,
"grad_norm": 45.731621829576106,
"learning_rate": 9.95054336957557e-09,
"logits/chosen": 0.20105035603046417,
"logits/rejected": 0.12556061148643494,
"logps/chosen": -441.2509765625,
"logps/rejected": -534.109375,
"loss": 0.4648,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.1896703243255615,
"rewards/margins": 0.8572039604187012,
"rewards/rejected": -2.046874523162842,
"step": 3830
},
{
"epoch": 0.9213051823416507,
"grad_norm": 59.067862218302,
"learning_rate": 9.37411721768286e-09,
"logits/chosen": 0.39653897285461426,
"logits/rejected": 0.27279889583587646,
"logps/chosen": -486.5269470214844,
"logps/rejected": -648.8412475585938,
"loss": 0.46,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.6371396780014038,
"rewards/margins": 1.1990723609924316,
"rewards/rejected": -2.836211919784546,
"step": 3840
},
{
"epoch": 0.9237044145873321,
"grad_norm": 47.43074874048961,
"learning_rate": 8.81457001547392e-09,
"logits/chosen": 0.2673342823982239,
"logits/rejected": 0.2015964239835739,
"logps/chosen": -492.92254638671875,
"logps/rejected": -605.4844970703125,
"loss": 0.4934,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.6292178630828857,
"rewards/margins": 1.0606807470321655,
"rewards/rejected": -2.689898729324341,
"step": 3850
},
{
"epoch": 0.9261036468330134,
"grad_norm": 38.22467054106717,
"learning_rate": 8.271941012961942e-09,
"logits/chosen": 0.35539960861206055,
"logits/rejected": 0.2722089886665344,
"logps/chosen": -419.780517578125,
"logps/rejected": -596.482666015625,
"loss": 0.4606,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.4746735095977783,
"rewards/margins": 1.2210102081298828,
"rewards/rejected": -2.695683717727661,
"step": 3860
},
{
"epoch": 0.9285028790786948,
"grad_norm": 50.37493253511501,
"learning_rate": 7.746268273415568e-09,
"logits/chosen": 0.3808482885360718,
"logits/rejected": 0.2647871673107147,
"logps/chosen": -485.0458984375,
"logps/rejected": -578.0924072265625,
"loss": 0.4923,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.5579640865325928,
"rewards/margins": 0.5145239233970642,
"rewards/rejected": -2.0724880695343018,
"step": 3870
},
{
"epoch": 0.9309021113243762,
"grad_norm": 48.99088949948664,
"learning_rate": 7.237588670689076e-09,
"logits/chosen": 0.08190400898456573,
"logits/rejected": 0.12344332039356232,
"logps/chosen": -428.4112243652344,
"logps/rejected": -517.3125610351562,
"loss": 0.4641,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.215342402458191,
"rewards/margins": 1.1562786102294922,
"rewards/rejected": -2.3716208934783936,
"step": 3880
},
{
"epoch": 0.9333013435700576,
"grad_norm": 44.444359990708264,
"learning_rate": 6.745937886635606e-09,
"logits/chosen": 0.22676298022270203,
"logits/rejected": 0.14976339042186737,
"logps/chosen": -487.2351989746094,
"logps/rejected": -613.9521484375,
"loss": 0.465,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.407539963722229,
"rewards/margins": 1.2937225103378296,
"rewards/rejected": -2.7012624740600586,
"step": 3890
},
{
"epoch": 0.935700575815739,
"grad_norm": 40.06142700499872,
"learning_rate": 6.271350408604409e-09,
"logits/chosen": 0.2837770879268646,
"logits/rejected": 0.2296113520860672,
"logps/chosen": -382.27227783203125,
"logps/rejected": -569.7482299804688,
"loss": 0.4645,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -1.0186858177185059,
"rewards/margins": 1.615103006362915,
"rewards/rejected": -2.633788585662842,
"step": 3900
},
{
"epoch": 0.9380998080614203,
"grad_norm": 73.29538152762231,
"learning_rate": 5.813859527021487e-09,
"logits/chosen": 0.35343560576438904,
"logits/rejected": 0.2977786660194397,
"logps/chosen": -445.3648376464844,
"logps/rejected": -555.0151977539062,
"loss": 0.4829,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2847732305526733,
"rewards/margins": 1.3373976945877075,
"rewards/rejected": -2.62217116355896,
"step": 3910
},
{
"epoch": 0.9404990403071017,
"grad_norm": 55.32984913756992,
"learning_rate": 5.373497333054616e-09,
"logits/chosen": 0.2757224440574646,
"logits/rejected": 0.27316632866859436,
"logps/chosen": -503.87371826171875,
"logps/rejected": -564.5245361328125,
"loss": 0.515,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.4553347826004028,
"rewards/margins": 0.799089252948761,
"rewards/rejected": -2.2544240951538086,
"step": 3920
},
{
"epoch": 0.9428982725527831,
"grad_norm": 45.13547537051501,
"learning_rate": 4.950294716362213e-09,
"logits/chosen": 0.2402069866657257,
"logits/rejected": 0.2745649814605713,
"logps/chosen": -531.6744995117188,
"logps/rejected": -642.5697021484375,
"loss": 0.4806,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.5781127214431763,
"rewards/margins": 1.1812816858291626,
"rewards/rejected": -2.7593941688537598,
"step": 3930
},
{
"epoch": 0.9452975047984645,
"grad_norm": 41.45524037338652,
"learning_rate": 4.544281362926422e-09,
"logits/chosen": 0.1885417103767395,
"logits/rejected": 0.1404399871826172,
"logps/chosen": -493.8916931152344,
"logps/rejected": -607.7820434570312,
"loss": 0.4847,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.2237544059753418,
"rewards/margins": 1.2443654537200928,
"rewards/rejected": -2.4681198596954346,
"step": 3940
},
{
"epoch": 0.9476967370441459,
"grad_norm": 41.32543731890712,
"learning_rate": 4.15548575297095e-09,
"logits/chosen": 0.13838523626327515,
"logits/rejected": 0.12014584243297577,
"logps/chosen": -423.9912109375,
"logps/rejected": -555.3060302734375,
"loss": 0.4492,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3068205118179321,
"rewards/margins": 1.4025375843048096,
"rewards/rejected": -2.709357976913452,
"step": 3950
},
{
"epoch": 0.9500959692898272,
"grad_norm": 38.663387459727744,
"learning_rate": 3.7839351589631366e-09,
"logits/chosen": 0.20229902863502502,
"logits/rejected": 0.06122536584734917,
"logps/chosen": -423.82379150390625,
"logps/rejected": -579.1092529296875,
"loss": 0.4703,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3485114574432373,
"rewards/margins": 0.9370707273483276,
"rewards/rejected": -2.2855823040008545,
"step": 3960
},
{
"epoch": 0.9524952015355086,
"grad_norm": 55.20292972374471,
"learning_rate": 3.4296556437010405e-09,
"logits/chosen": 0.20910441875457764,
"logits/rejected": 0.18343612551689148,
"logps/chosen": -397.52239990234375,
"logps/rejected": -482.45513916015625,
"loss": 0.4914,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3418649435043335,
"rewards/margins": 0.8949347734451294,
"rewards/rejected": -2.236799716949463,
"step": 3970
},
{
"epoch": 0.95489443378119,
"grad_norm": 52.42663168427878,
"learning_rate": 3.092672058485124e-09,
"logits/chosen": 0.2784040868282318,
"logits/rejected": 0.22552700340747833,
"logps/chosen": -437.07122802734375,
"logps/rejected": -581.2984008789062,
"loss": 0.5278,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.39797043800354,
"rewards/margins": 1.417875051498413,
"rewards/rejected": -2.815845489501953,
"step": 3980
},
{
"epoch": 0.9572936660268714,
"grad_norm": 51.730419941201816,
"learning_rate": 2.7730080413750356e-09,
"logits/chosen": 0.3203295171260834,
"logits/rejected": 0.33414626121520996,
"logps/chosen": -470.68963623046875,
"logps/rejected": -590.4835815429688,
"loss": 0.4926,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.3279026746749878,
"rewards/margins": 1.245241641998291,
"rewards/rejected": -2.5731444358825684,
"step": 3990
},
{
"epoch": 0.9596928982725528,
"grad_norm": 52.110269896703294,
"learning_rate": 2.4706860155316033e-09,
"logits/chosen": 0.2101161777973175,
"logits/rejected": 0.23008927702903748,
"logps/chosen": -545.7728271484375,
"logps/rejected": -637.5755615234375,
"loss": 0.4895,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4393521547317505,
"rewards/margins": 0.8712779879570007,
"rewards/rejected": -2.3106300830841064,
"step": 4000
},
{
"epoch": 0.9596928982725528,
"eval_logits/chosen": 0.38506969809532166,
"eval_logits/rejected": 0.3408171534538269,
"eval_logps/chosen": -459.0677185058594,
"eval_logps/rejected": -584.910400390625,
"eval_loss": 0.4760858714580536,
"eval_rewards/accuracies": 0.7982142567634583,
"eval_rewards/chosen": -1.4039554595947266,
"eval_rewards/margins": 1.1972852945327759,
"eval_rewards/rejected": -2.601240873336792,
"eval_runtime": 172.2382,
"eval_samples_per_second": 25.9,
"eval_steps_per_second": 0.406,
"step": 4000
},
{
"epoch": 0.9620921305182342,
"grad_norm": 48.027804731217394,
"learning_rate": 2.185727187643843e-09,
"logits/chosen": 0.17230884730815887,
"logits/rejected": 0.11436843872070312,
"logps/chosen": -407.50506591796875,
"logps/rejected": -561.4027709960938,
"loss": 0.5232,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.3693194389343262,
"rewards/margins": 1.4761823415756226,
"rewards/rejected": -2.8455021381378174,
"step": 4010
},
{
"epoch": 0.9644913627639156,
"grad_norm": 57.935705040777876,
"learning_rate": 1.9181515464413434e-09,
"logits/chosen": 0.15140806138515472,
"logits/rejected": 0.09838312864303589,
"logps/chosen": -575.939208984375,
"logps/rejected": -703.3623657226562,
"loss": 0.4954,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -1.2815978527069092,
"rewards/margins": 1.3098185062408447,
"rewards/rejected": -2.591416597366333,
"step": 4020
},
{
"epoch": 0.966890595009597,
"grad_norm": 40.72962697033489,
"learning_rate": 1.6679778612923302e-09,
"logits/chosen": 0.21621087193489075,
"logits/rejected": 0.2728949785232544,
"logps/chosen": -515.5426635742188,
"logps/rejected": -592.4903564453125,
"loss": 0.4629,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.5008609294891357,
"rewards/margins": 0.7948905229568481,
"rewards/rejected": -2.2957513332366943,
"step": 4030
},
{
"epoch": 0.9692898272552783,
"grad_norm": 54.67990587779175,
"learning_rate": 1.43522368088686e-09,
"logits/chosen": 0.29817652702331543,
"logits/rejected": 0.22439947724342346,
"logps/chosen": -469.2783203125,
"logps/rejected": -633.0770263671875,
"loss": 0.5304,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.5123900175094604,
"rewards/margins": 1.635197401046753,
"rewards/rejected": -3.147587299346924,
"step": 4040
},
{
"epoch": 0.9716890595009597,
"grad_norm": 70.3894278582445,
"learning_rate": 1.2199053320059993e-09,
"logits/chosen": 0.3103833794593811,
"logits/rejected": 0.2175188809633255,
"logps/chosen": -478.85443115234375,
"logps/rejected": -599.4166259765625,
"loss": 0.4973,
"rewards/accuracies": 0.7749999761581421,
"rewards/chosen": -1.4971873760223389,
"rewards/margins": 1.086474061012268,
"rewards/rejected": -2.5836615562438965,
"step": 4050
},
{
"epoch": 0.974088291746641,
"grad_norm": 45.89117778001179,
"learning_rate": 1.0220379183764338e-09,
"logits/chosen": 0.1872117817401886,
"logits/rejected": 0.14962831139564514,
"logps/chosen": -379.21612548828125,
"logps/rejected": -526.4472045898438,
"loss": 0.4727,
"rewards/accuracies": 0.800000011920929,
"rewards/chosen": -1.1573994159698486,
"rewards/margins": 1.4220101833343506,
"rewards/rejected": -2.5794098377227783,
"step": 4060
},
{
"epoch": 0.9764875239923224,
"grad_norm": 42.58928630808853,
"learning_rate": 8.416353196111503e-10,
"logits/chosen": 0.4299827218055725,
"logits/rejected": 0.3653213679790497,
"logps/chosen": -455.606689453125,
"logps/rejected": -537.0999755859375,
"loss": 0.5395,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.646712064743042,
"rewards/margins": 0.9391171336174011,
"rewards/rejected": -2.585829257965088,
"step": 4070
},
{
"epoch": 0.9788867562380038,
"grad_norm": 53.46584271337103,
"learning_rate": 6.787101902356873e-10,
"logits/chosen": 0.3689078986644745,
"logits/rejected": 0.34390968084335327,
"logps/chosen": -460.3603515625,
"logps/rejected": -581.0008544921875,
"loss": 0.461,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.3799049854278564,
"rewards/margins": 0.9097992181777954,
"rewards/rejected": -2.2897043228149414,
"step": 4080
},
{
"epoch": 0.9812859884836852,
"grad_norm": 61.26328575901746,
"learning_rate": 5.332739588005953e-10,
"logits/chosen": 0.1865283101797104,
"logits/rejected": 0.08630210161209106,
"logps/chosen": -390.46929931640625,
"logps/rejected": -543.6526489257812,
"loss": 0.4761,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2067457437515259,
"rewards/margins": 1.2890859842300415,
"rewards/rejected": -2.4958317279815674,
"step": 4090
},
{
"epoch": 0.9836852207293666,
"grad_norm": 49.7010925580212,
"learning_rate": 4.053368270797164e-10,
"logits/chosen": 0.34013232588768005,
"logits/rejected": 0.23841337859630585,
"logps/chosen": -435.8818359375,
"logps/rejected": -555.6300659179688,
"loss": 0.452,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.4993178844451904,
"rewards/margins": 1.1681034564971924,
"rewards/rejected": -2.6674208641052246,
"step": 4100
},
{
"epoch": 0.986084452975048,
"grad_norm": 39.60831889767418,
"learning_rate": 2.949077693545354e-10,
"logits/chosen": 0.3429808020591736,
"logits/rejected": 0.28340935707092285,
"logps/chosen": -493.12799072265625,
"logps/rejected": -603.4589233398438,
"loss": 0.5203,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5291836261749268,
"rewards/margins": 0.7797685861587524,
"rewards/rejected": -2.3089520931243896,
"step": 4110
},
{
"epoch": 0.9884836852207294,
"grad_norm": 48.86243638343189,
"learning_rate": 2.0199453178471047e-10,
"logits/chosen": 0.2578023374080658,
"logits/rejected": 0.28469234704971313,
"logps/chosen": -521.9082641601562,
"logps/rejected": -584.6770629882812,
"loss": 0.4724,
"rewards/accuracies": 0.925000011920929,
"rewards/chosen": -1.4713417291641235,
"rewards/margins": 1.0106664896011353,
"rewards/rejected": -2.482008457183838,
"step": 4120
},
{
"epoch": 0.9908829174664108,
"grad_norm": 40.74851597282627,
"learning_rate": 1.266036318647301e-10,
"logits/chosen": 0.24952539801597595,
"logits/rejected": 0.21447113156318665,
"logps/chosen": -515.4271240234375,
"logps/rejected": -623.4884033203125,
"loss": 0.4614,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3878891468048096,
"rewards/margins": 1.3714239597320557,
"rewards/rejected": -2.7593131065368652,
"step": 4130
},
{
"epoch": 0.9932821497120922,
"grad_norm": 59.7391917924102,
"learning_rate": 6.874035796672339e-11,
"logits/chosen": 0.20685334503650665,
"logits/rejected": 0.19621731340885162,
"logps/chosen": -468.5065002441406,
"logps/rejected": -590.8399047851562,
"loss": 0.511,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.1164958477020264,
"rewards/margins": 1.485654592514038,
"rewards/rejected": -2.6021504402160645,
"step": 4140
},
{
"epoch": 0.9956813819577736,
"grad_norm": 64.53279604006218,
"learning_rate": 2.8408768969423458e-11,
"logits/chosen": 0.16596756875514984,
"logits/rejected": 0.11380906403064728,
"logps/chosen": -485.75006103515625,
"logps/rejected": -593.44287109375,
"loss": 0.4651,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3494365215301514,
"rewards/margins": 1.0383247137069702,
"rewards/rejected": -2.387761354446411,
"step": 4150
},
{
"epoch": 0.9980806142034548,
"grad_norm": 64.97328454417662,
"learning_rate": 5.611693973617271e-12,
"logits/chosen": 0.3674852252006531,
"logits/rejected": 0.3302612006664276,
"logps/chosen": -414.0726623535156,
"logps/rejected": -535.21337890625,
"loss": 0.5188,
"rewards/accuracies": 0.824999988079071,
"rewards/chosen": -1.3726593255996704,
"rewards/margins": 0.9844037890434265,
"rewards/rejected": -2.357063055038452,
"step": 4160
},
{
"epoch": 1.0,
"step": 4168,
"total_flos": 0.0,
"train_loss": 0.5273771832863338,
"train_runtime": 14157.4064,
"train_samples_per_second": 9.42,
"train_steps_per_second": 0.294
}
],
"logging_steps": 10,
"max_steps": 4168,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}