|
{ |
|
"best_metric": 1.305156946182251, |
|
"best_model_checkpoint": "./results/models/checkpoint-230688", |
|
"epoch": 24.0, |
|
"eval_steps": 500, |
|
"global_step": 230688, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.05201831044527674, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.001997919267582189, |
|
"loss": 2.3383, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.10403662089055347, |
|
"grad_norm": 0.93359375, |
|
"learning_rate": 0.001995838535164378, |
|
"loss": 1.9394, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.1560549313358302, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.001993757802746567, |
|
"loss": 1.8509, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.20807324178110695, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0019916770703287557, |
|
"loss": 1.8119, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.2600915522263837, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0019895963379109446, |
|
"loss": 1.746, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.3121098626716604, |
|
"grad_norm": 0.4921875, |
|
"learning_rate": 0.0019875156054931335, |
|
"loss": 1.7113, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.3641281731169372, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0019854348730753224, |
|
"loss": 1.6861, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.4161464835622139, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0019833541406575114, |
|
"loss": 1.6518, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.4681647940074906, |
|
"grad_norm": 1.2578125, |
|
"learning_rate": 0.0019812734082397003, |
|
"loss": 1.6257, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.5201831044527674, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0019791926758218896, |
|
"loss": 1.6184, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.5722014148980441, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 0.001977111943404078, |
|
"loss": 1.6034, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.6242197253433208, |
|
"grad_norm": 0.46875, |
|
"learning_rate": 0.001975031210986267, |
|
"loss": 1.5798, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.6762380357885975, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0019729504785684564, |
|
"loss": 1.6023, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.7282563462338744, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0019708697461506453, |
|
"loss": 1.6354, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.7802746566791511, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0019687890137328337, |
|
"loss": 1.6039, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.8322929671244278, |
|
"grad_norm": 0.48046875, |
|
"learning_rate": 0.001966708281315023, |
|
"loss": 1.5863, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.8843112775697045, |
|
"grad_norm": 0.99609375, |
|
"learning_rate": 0.001964627548897212, |
|
"loss": 1.5758, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.9363295880149812, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.0019625468164794005, |
|
"loss": 1.5658, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.9883478984602581, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00196046608406159, |
|
"loss": 1.5664, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 1.6215704679489136, |
|
"eval_runtime": 1.5075, |
|
"eval_samples_per_second": 663.37, |
|
"eval_steps_per_second": 0.663, |
|
"step": 9612 |
|
}, |
|
{ |
|
"epoch": 1.0403662089055348, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0019583853516437788, |
|
"loss": 1.5684, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 1.0923845193508115, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0019563046192259677, |
|
"loss": 1.5536, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 1.1444028297960882, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0019542238868081566, |
|
"loss": 1.5495, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 1.196421140241365, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0019521431543903455, |
|
"loss": 1.529, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 1.2484394506866416, |
|
"grad_norm": 0.38671875, |
|
"learning_rate": 0.0019500624219725344, |
|
"loss": 1.5307, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 1.3004577611319184, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.0019479816895547233, |
|
"loss": 1.5422, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 1.352476071577195, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.0019459009571369122, |
|
"loss": 1.5281, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 1.404494382022472, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0019438202247191011, |
|
"loss": 1.5232, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 1.4565126924677487, |
|
"grad_norm": 0.98828125, |
|
"learning_rate": 0.00194173949230129, |
|
"loss": 1.5286, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 1.5085310029130254, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 0.001939658759883479, |
|
"loss": 1.5286, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 1.5605493133583022, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.001937578027465668, |
|
"loss": 1.5173, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 1.6125676238035789, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0019354972950478568, |
|
"loss": 1.5055, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 1.6645859342488556, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.0019334165626300457, |
|
"loss": 1.5071, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 1.7166042446941323, |
|
"grad_norm": 0.37890625, |
|
"learning_rate": 0.0019313358302122348, |
|
"loss": 1.5071, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 1.768622555139409, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.0019292550977944235, |
|
"loss": 1.5211, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 1.8206408655846857, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0019271743653766125, |
|
"loss": 1.5211, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 1.8726591760299627, |
|
"grad_norm": 0.41796875, |
|
"learning_rate": 0.0019250936329588016, |
|
"loss": 1.5159, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 1.9246774864752392, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0019230129005409905, |
|
"loss": 1.5006, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 1.9766957969205161, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.0019209321681231794, |
|
"loss": 1.5082, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.5491766929626465, |
|
"eval_runtime": 1.6608, |
|
"eval_samples_per_second": 602.119, |
|
"eval_steps_per_second": 0.602, |
|
"step": 19224 |
|
}, |
|
{ |
|
"epoch": 2.0287141073657926, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0019188514357053683, |
|
"loss": 1.5182, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 2.0807324178110695, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0019167707032875572, |
|
"loss": 1.5172, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 2.132750728256346, |
|
"grad_norm": 0.37109375, |
|
"learning_rate": 0.0019146899708697464, |
|
"loss": 1.5101, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 2.184769038701623, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.001912609238451935, |
|
"loss": 1.5086, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 2.2367873491468995, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.001910528506034124, |
|
"loss": 1.4943, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 2.2888056595921764, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0019084477736163131, |
|
"loss": 1.4848, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 2.3408239700374533, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0019063670411985018, |
|
"loss": 1.4823, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 2.39284228048273, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0019042863087806907, |
|
"loss": 1.4702, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 2.444860590928007, |
|
"grad_norm": 1.296875, |
|
"learning_rate": 0.0019022055763628799, |
|
"loss": 1.4673, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 2.4968789013732833, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0019001248439450688, |
|
"loss": 1.4706, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 2.54889721181856, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.0018980441115272575, |
|
"loss": 1.4635, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 2.6009155222638367, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.0018959633791094466, |
|
"loss": 1.4499, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 2.6529338327091136, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0018938826466916355, |
|
"loss": 1.4453, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 2.70495214315439, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0018918019142738244, |
|
"loss": 1.4463, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 2.756970453599667, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0018897211818560133, |
|
"loss": 1.452, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 2.808988764044944, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.0018876404494382023, |
|
"loss": 1.448, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 2.8610070744902205, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.0018855597170203914, |
|
"loss": 1.4525, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 2.9130253849354975, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.00188347898460258, |
|
"loss": 1.4457, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 2.965043695380774, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.001881398252184769, |
|
"loss": 1.4468, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 1.4870332479476929, |
|
"eval_runtime": 1.4668, |
|
"eval_samples_per_second": 681.76, |
|
"eval_steps_per_second": 0.682, |
|
"step": 28836 |
|
}, |
|
{ |
|
"epoch": 3.017062005826051, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0018793175197669581, |
|
"loss": 1.4453, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 3.0690803162713274, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.001877236787349147, |
|
"loss": 1.4455, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 3.1210986267166043, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.0018751560549313357, |
|
"loss": 1.4378, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 3.173116937161881, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0018730753225135249, |
|
"loss": 1.4342, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 3.2251352476071578, |
|
"grad_norm": 5.09375, |
|
"learning_rate": 0.0018709945900957138, |
|
"loss": 1.4401, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 3.2771535580524347, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0018689138576779025, |
|
"loss": 1.4317, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 3.329171868497711, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0018668331252600916, |
|
"loss": 1.4252, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 3.381190178942988, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.0018647523928422805, |
|
"loss": 1.427, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 3.4332084893882646, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.0018626716604244697, |
|
"loss": 1.4207, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 3.4852267998335416, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0018605909280066584, |
|
"loss": 1.4209, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 3.537245110278818, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0018585101955888473, |
|
"loss": 1.418, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 3.589263420724095, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0018564294631710364, |
|
"loss": 1.4153, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 3.6412817311693715, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.001854348730753225, |
|
"loss": 1.4171, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 3.6933000416146484, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 0.001852267998335414, |
|
"loss": 1.4203, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 3.7453183520599254, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0018501872659176031, |
|
"loss": 1.4189, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 3.797336662505202, |
|
"grad_norm": 0.53125, |
|
"learning_rate": 0.001848106533499792, |
|
"loss": 1.4212, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 3.8493549729504783, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.0018460258010819808, |
|
"loss": 1.4151, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 3.9013732833957553, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0018439450686641699, |
|
"loss": 1.4087, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 3.9533915938410322, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0018418643362463588, |
|
"loss": 1.4038, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.4296818971633911, |
|
"eval_runtime": 1.3293, |
|
"eval_samples_per_second": 752.251, |
|
"eval_steps_per_second": 0.752, |
|
"step": 38448 |
|
}, |
|
{ |
|
"epoch": 4.005409904286309, |
|
"grad_norm": 2.53125, |
|
"learning_rate": 0.0018397836038285475, |
|
"loss": 1.4037, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 4.057428214731585, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.0018377028714107366, |
|
"loss": 1.402, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 4.109446525176862, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0018356221389929255, |
|
"loss": 1.3972, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 4.161464835622139, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 0.0018335414065751145, |
|
"loss": 1.3996, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 4.213483146067416, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0018314606741573034, |
|
"loss": 1.3989, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 4.265501456512692, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 0.0018293799417394923, |
|
"loss": 1.3945, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 4.317519766957969, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0018272992093216814, |
|
"loss": 1.3936, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 4.369538077403246, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.0018252184769038703, |
|
"loss": 1.3906, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 4.421556387848523, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.001823137744486059, |
|
"loss": 1.3946, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 4.473574698293799, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0018210570120682482, |
|
"loss": 1.4069, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 4.525593008739076, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.001818976279650437, |
|
"loss": 1.4049, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 4.577611319184353, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0018168955472326258, |
|
"loss": 1.3995, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 4.62962962962963, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.001814814814814815, |
|
"loss": 1.4053, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 4.681647940074907, |
|
"grad_norm": 0.5078125, |
|
"learning_rate": 0.0018127340823970038, |
|
"loss": 1.4, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 4.733666250520183, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0018106533499791927, |
|
"loss": 1.3946, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 4.78568456096546, |
|
"grad_norm": 0.498046875, |
|
"learning_rate": 0.0018085726175613816, |
|
"loss": 1.3914, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 4.837702871410737, |
|
"grad_norm": 1.0703125, |
|
"learning_rate": 0.0018064918851435705, |
|
"loss": 1.3882, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 4.889721181856014, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.0018044111527257595, |
|
"loss": 1.3905, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 4.94173949230129, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.0018023304203079484, |
|
"loss": 1.3927, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 4.9937578027465666, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0018002496878901373, |
|
"loss": 1.3883, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 1.4223600625991821, |
|
"eval_runtime": 1.6852, |
|
"eval_samples_per_second": 593.419, |
|
"eval_steps_per_second": 0.593, |
|
"step": 48060 |
|
}, |
|
{ |
|
"epoch": 5.0457761131918435, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0017981689554723264, |
|
"loss": 1.3873, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 5.09779442363712, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.0017960882230545153, |
|
"loss": 1.3831, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 5.149812734082397, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.001794007490636704, |
|
"loss": 1.3784, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 5.201831044527673, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0017919267582188932, |
|
"loss": 1.3821, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 5.25384935497295, |
|
"grad_norm": 0.451171875, |
|
"learning_rate": 0.001789846025801082, |
|
"loss": 1.3781, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 5.305867665418227, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0017877652933832708, |
|
"loss": 1.381, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 5.357885975863504, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00178568456096546, |
|
"loss": 1.381, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 5.40990428630878, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0017836038285476488, |
|
"loss": 1.375, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 5.461922596754057, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.0017815230961298377, |
|
"loss": 1.3776, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 5.513940907199334, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.0017794423637120266, |
|
"loss": 1.3773, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 5.565959217644611, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.0017773616312942156, |
|
"loss": 1.3809, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 5.617977528089888, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 0.0017752808988764045, |
|
"loss": 1.3786, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 5.669995838535164, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.0017732001664585936, |
|
"loss": 1.3762, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 5.722014148980441, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0017711194340407823, |
|
"loss": 1.3741, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 5.774032459425718, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.0017690387016229714, |
|
"loss": 1.3719, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 5.826050769870995, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.0017669579692051603, |
|
"loss": 1.3712, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 5.878069080316271, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.001764877236787349, |
|
"loss": 1.3716, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 5.930087390761548, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.0017627965043695382, |
|
"loss": 1.3739, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 5.982105701206825, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.001760715771951727, |
|
"loss": 1.3744, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 1.4039781093597412, |
|
"eval_runtime": 1.6711, |
|
"eval_samples_per_second": 598.397, |
|
"eval_steps_per_second": 0.598, |
|
"step": 57672 |
|
}, |
|
{ |
|
"epoch": 6.034124011652102, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.001758635039533916, |
|
"loss": 1.3701, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 6.086142322097379, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.001756554307116105, |
|
"loss": 1.3629, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 6.138160632542655, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0017544735746982938, |
|
"loss": 1.3656, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 6.190178942987932, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 0.0017523928422804827, |
|
"loss": 1.3673, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 6.242197253433209, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.0017503121098626717, |
|
"loss": 1.363, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 6.294215563878486, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0017482313774448606, |
|
"loss": 1.3632, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 6.346233874323762, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.0017461506450270495, |
|
"loss": 1.36, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 6.398252184769039, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.0017440699126092386, |
|
"loss": 1.3578, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 6.4502704952143155, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0017419891801914273, |
|
"loss": 1.3622, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 6.502288805659592, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.0017399084477736164, |
|
"loss": 1.3607, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 6.554307116104869, |
|
"grad_norm": 0.39453125, |
|
"learning_rate": 0.0017378277153558054, |
|
"loss": 1.3552, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 6.606325426550145, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.001735746982937994, |
|
"loss": 1.3518, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 6.658343736995422, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0017336662505201832, |
|
"loss": 1.3498, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 6.710362047440699, |
|
"grad_norm": 1.1484375, |
|
"learning_rate": 0.001731585518102372, |
|
"loss": 1.3528, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 6.762380357885976, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.001729504785684561, |
|
"loss": 1.3528, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 6.814398668331252, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.00172742405326675, |
|
"loss": 1.3514, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 6.866416978776529, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.0017253433208489388, |
|
"loss": 1.3519, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 6.918435289221806, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.0017232625884311278, |
|
"loss": 1.3475, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 6.970453599667083, |
|
"grad_norm": 0.51171875, |
|
"learning_rate": 0.0017211818560133169, |
|
"loss": 1.3498, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 1.3733755350112915, |
|
"eval_runtime": 1.5013, |
|
"eval_samples_per_second": 666.111, |
|
"eval_steps_per_second": 0.666, |
|
"step": 67284 |
|
}, |
|
{ |
|
"epoch": 7.022471910112359, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.0017191011235955056, |
|
"loss": 1.348, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 7.074490220557636, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.0017170203911776945, |
|
"loss": 1.3467, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 7.126508531002913, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.0017149396587598836, |
|
"loss": 1.3484, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 7.17852684144819, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.0017128589263420723, |
|
"loss": 1.3504, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 7.230545151893467, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0017107781939242615, |
|
"loss": 1.3474, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 7.282563462338743, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0017086974615064504, |
|
"loss": 1.3457, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 7.33458177278402, |
|
"grad_norm": 0.494140625, |
|
"learning_rate": 0.0017066167290886393, |
|
"loss": 1.344, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 7.386600083229297, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0017045359966708282, |
|
"loss": 1.3409, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 7.438618393674574, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.001702455264253017, |
|
"loss": 1.3427, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 7.49063670411985, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.001700374531835206, |
|
"loss": 1.3446, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 7.542655014565127, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.001698293799417395, |
|
"loss": 1.3427, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 7.594673325010404, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0016962130669995838, |
|
"loss": 1.3451, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 7.646691635455681, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.0016941323345817728, |
|
"loss": 1.3473, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 7.698709945900957, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.001692051602163962, |
|
"loss": 1.3487, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 7.750728256346234, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0016899708697461506, |
|
"loss": 1.3565, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 7.802746566791511, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0016878901373283395, |
|
"loss": 1.348, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 7.8547648772367875, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.0016858094049105286, |
|
"loss": 1.3478, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 7.9067831876820645, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0016837286724927173, |
|
"loss": 1.3484, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 7.9588014981273405, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.0016816479400749065, |
|
"loss": 1.3457, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 1.3691484928131104, |
|
"eval_runtime": 1.5204, |
|
"eval_samples_per_second": 657.725, |
|
"eval_steps_per_second": 0.658, |
|
"step": 76896 |
|
}, |
|
{ |
|
"epoch": 8.010819808572618, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.0016795672076570954, |
|
"loss": 1.3419, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 8.062838119017893, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0016774864752392843, |
|
"loss": 1.3375, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 8.11485642946317, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 0.0016754057428214732, |
|
"loss": 1.3368, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 8.166874739908447, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0016733250104036621, |
|
"loss": 1.3385, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 8.218893050353724, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.001671244277985851, |
|
"loss": 1.3329, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 8.270911360799001, |
|
"grad_norm": 0.462890625, |
|
"learning_rate": 0.0016691635455680402, |
|
"loss": 1.3346, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 8.322929671244278, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.0016670828131502289, |
|
"loss": 1.3342, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 8.374947981689555, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.0016650020807324178, |
|
"loss": 1.3313, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 8.426966292134832, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.001662921348314607, |
|
"loss": 1.33, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 8.478984602580109, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.0016608406158967956, |
|
"loss": 1.3321, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 8.531002913025384, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.0016587598834789845, |
|
"loss": 1.3322, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 8.583021223470661, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.0016566791510611736, |
|
"loss": 1.3354, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 8.635039533915938, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0016545984186433626, |
|
"loss": 1.3358, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 8.687057844361215, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0016525176862255513, |
|
"loss": 1.3303, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 8.739076154806492, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.0016504369538077404, |
|
"loss": 1.3332, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 8.791094465251769, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.0016483562213899293, |
|
"loss": 1.3337, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 8.843112775697046, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.0016462754889721182, |
|
"loss": 1.3321, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 8.895131086142323, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 0.0016441947565543071, |
|
"loss": 1.3283, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 8.947149396587598, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.001642114024136496, |
|
"loss": 1.3306, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 8.999167707032875, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.0016400332917186852, |
|
"loss": 1.3315, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 1.3568580150604248, |
|
"eval_runtime": 1.6522, |
|
"eval_samples_per_second": 605.266, |
|
"eval_steps_per_second": 0.605, |
|
"step": 86508 |
|
}, |
|
{ |
|
"epoch": 9.051186017478152, |
|
"grad_norm": 0.77734375, |
|
"learning_rate": 0.0016379525593008739, |
|
"loss": 1.3232, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 9.103204327923429, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.0016358718268830628, |
|
"loss": 1.3251, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 9.155222638368706, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.001633791094465252, |
|
"loss": 1.3295, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 9.207240948813983, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.0016317103620474408, |
|
"loss": 1.3261, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 9.25925925925926, |
|
"grad_norm": 0.171875, |
|
"learning_rate": 0.0016296296296296295, |
|
"loss": 1.3288, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 9.311277569704536, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.0016275488972118187, |
|
"loss": 1.3276, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 9.363295880149813, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.0016254681647940076, |
|
"loss": 1.3257, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 9.41531419059509, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0016233874323761963, |
|
"loss": 1.3219, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 9.467332501040365, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.0016213066999583854, |
|
"loss": 1.3219, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 9.519350811485642, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0016192259675405743, |
|
"loss": 1.3216, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 9.57136912193092, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.0016171452351227634, |
|
"loss": 1.324, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 9.623387432376196, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.0016150645027049521, |
|
"loss": 1.3212, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 9.675405742821473, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 0.001612983770287141, |
|
"loss": 1.3217, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 9.72742405326675, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.0016109030378693302, |
|
"loss": 1.3219, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 9.779442363712027, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.0016088223054515189, |
|
"loss": 1.3219, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 9.831460674157304, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.0016067415730337078, |
|
"loss": 1.3188, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 9.88347898460258, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.001604660840615897, |
|
"loss": 1.3205, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 9.935497295047856, |
|
"grad_norm": 1.828125, |
|
"learning_rate": 0.0016025801081980858, |
|
"loss": 1.3238, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 9.987515605493133, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0016004993757802745, |
|
"loss": 1.3224, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 1.3528562784194946, |
|
"eval_runtime": 1.936, |
|
"eval_samples_per_second": 516.533, |
|
"eval_steps_per_second": 0.517, |
|
"step": 96120 |
|
}, |
|
{ |
|
"epoch": 10.03953391593841, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.0015984186433624637, |
|
"loss": 1.3174, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 10.091552226383687, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.0015963379109446526, |
|
"loss": 1.3199, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 10.143570536828964, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.0015942571785268413, |
|
"loss": 1.3187, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 10.19558884727424, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0015921764461090304, |
|
"loss": 1.3205, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 10.247607157719518, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0015900957136912193, |
|
"loss": 1.3201, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 10.299625468164795, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 0.0015880149812734085, |
|
"loss": 1.3193, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 10.35164377861007, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0015859342488555972, |
|
"loss": 1.3181, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 10.403662089055347, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.001583853516437786, |
|
"loss": 1.3169, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 10.455680399500624, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.0015817727840199752, |
|
"loss": 1.3162, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 10.5076987099459, |
|
"grad_norm": 0.458984375, |
|
"learning_rate": 0.0015796920516021641, |
|
"loss": 1.3217, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 10.559717020391178, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.0015776113191843528, |
|
"loss": 1.3245, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 10.611735330836455, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.001575530586766542, |
|
"loss": 1.3233, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 10.663753641281732, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0015734498543487309, |
|
"loss": 1.3182, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 10.715771951727008, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0015713691219309195, |
|
"loss": 1.3163, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 10.767790262172285, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0015692883895131087, |
|
"loss": 1.3181, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 10.81980857261756, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0015672076570952976, |
|
"loss": 1.3171, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 10.871826883062838, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0015651269246774865, |
|
"loss": 1.3177, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 10.923845193508114, |
|
"grad_norm": 0.306640625, |
|
"learning_rate": 0.0015630461922596754, |
|
"loss": 1.3211, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 10.975863503953391, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.0015609654598418643, |
|
"loss": 1.3183, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 1.347601056098938, |
|
"eval_runtime": 1.5374, |
|
"eval_samples_per_second": 650.453, |
|
"eval_steps_per_second": 0.65, |
|
"step": 105732 |
|
}, |
|
{ |
|
"epoch": 11.027881814398668, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.0015588847274240535, |
|
"loss": 1.315, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 11.079900124843945, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0015568039950062422, |
|
"loss": 1.3125, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 11.131918435289222, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.001554723262588431, |
|
"loss": 1.3114, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 11.1839367457345, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.0015526425301706202, |
|
"loss": 1.3104, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 11.235955056179776, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.0015505617977528091, |
|
"loss": 1.3109, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 11.287973366625051, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.0015484810653349978, |
|
"loss": 1.3119, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 11.339991677070328, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.001546400332917187, |
|
"loss": 1.31, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 11.392009987515605, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0015443196004993759, |
|
"loss": 1.3114, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 11.444028297960882, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0015422388680815646, |
|
"loss": 1.3133, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 11.496046608406159, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0015401581356637537, |
|
"loss": 1.3129, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 11.548064918851436, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0015380774032459426, |
|
"loss": 1.3125, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 11.600083229296713, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.0015359966708281315, |
|
"loss": 1.312, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 11.65210153974199, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.0015339159384103204, |
|
"loss": 1.3107, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 11.704119850187267, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0015318352059925093, |
|
"loss": 1.3116, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 11.756138160632542, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.0015297544735746985, |
|
"loss": 1.3104, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 11.808156471077819, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0015276737411568874, |
|
"loss": 1.3102, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 11.860174781523096, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.001525593008739076, |
|
"loss": 1.309, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 11.912193091968373, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.0015235122763212652, |
|
"loss": 1.3096, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 11.96421140241365, |
|
"grad_norm": 0.9296875, |
|
"learning_rate": 0.0015214315439034541, |
|
"loss": 1.307, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 1.3371446132659912, |
|
"eval_runtime": 1.4263, |
|
"eval_samples_per_second": 701.11, |
|
"eval_steps_per_second": 0.701, |
|
"step": 115344 |
|
}, |
|
{ |
|
"epoch": 12.016229712858927, |
|
"grad_norm": 0.185546875, |
|
"learning_rate": 0.0015193508114856428, |
|
"loss": 1.3041, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 12.068248023304204, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.001517270079067832, |
|
"loss": 1.3025, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 12.12026633374948, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 0.0015151893466500209, |
|
"loss": 1.3041, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 12.172284644194757, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.0015131086142322098, |
|
"loss": 1.3063, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 12.224302954640033, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.0015110278818143987, |
|
"loss": 1.3037, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 12.27632126508531, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.0015089471493965876, |
|
"loss": 1.3049, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 12.328339575530586, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.0015068664169787765, |
|
"loss": 1.3033, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 12.380357885975863, |
|
"grad_norm": 0.1884765625, |
|
"learning_rate": 0.0015047856845609654, |
|
"loss": 1.3032, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 12.43237619642114, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0015027049521431544, |
|
"loss": 1.3031, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 12.484394506866417, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0015006242197253433, |
|
"loss": 1.3031, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 12.536412817311694, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 0.0014985434873075324, |
|
"loss": 1.304, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 12.588431127756971, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.001496462754889721, |
|
"loss": 1.3042, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 12.640449438202246, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0014943820224719102, |
|
"loss": 1.3055, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 12.692467748647523, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.0014923012900540991, |
|
"loss": 1.3061, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 12.7444860590928, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.0014902205576362878, |
|
"loss": 1.304, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 12.796504369538077, |
|
"grad_norm": 0.408203125, |
|
"learning_rate": 0.001488139825218477, |
|
"loss": 1.3048, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 12.848522679983354, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.0014860590928006659, |
|
"loss": 1.3038, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 12.900540990428631, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.0014839783603828548, |
|
"loss": 1.3029, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 12.952559300873908, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.0014818976279650437, |
|
"loss": 1.302, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 1.336362600326538, |
|
"eval_runtime": 1.5551, |
|
"eval_samples_per_second": 643.05, |
|
"eval_steps_per_second": 0.643, |
|
"step": 124956 |
|
}, |
|
{ |
|
"epoch": 13.004577611319185, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0014798168955472326, |
|
"loss": 1.3067, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 13.056595921764462, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.0014777361631294215, |
|
"loss": 1.3017, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 13.108614232209737, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0014756554307116107, |
|
"loss": 1.3037, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 13.160632542655014, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0014735746982937994, |
|
"loss": 1.3047, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 13.21265085310029, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.0014714939658759883, |
|
"loss": 1.3034, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 13.264669163545568, |
|
"grad_norm": 7.0, |
|
"learning_rate": 0.0014694132334581774, |
|
"loss": 1.303, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 13.316687473990845, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.0014673325010403661, |
|
"loss": 1.304, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 13.368705784436122, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.0014652517686225552, |
|
"loss": 1.3033, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 13.420724094881399, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0014631710362047442, |
|
"loss": 1.3014, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 13.472742405326676, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.001461090303786933, |
|
"loss": 1.3004, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 13.524760715771952, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.001459009571369122, |
|
"loss": 1.3011, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 13.576779026217228, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.001456928838951311, |
|
"loss": 1.3005, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 13.628797336662505, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.0014548481065334998, |
|
"loss": 1.3022, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 13.680815647107782, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.0014527673741156887, |
|
"loss": 1.3013, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 13.732833957553058, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.0014506866416978776, |
|
"loss": 1.3005, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 13.784852267998335, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0014486059092800666, |
|
"loss": 1.2997, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 13.836870578443612, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.0014465251768622557, |
|
"loss": 1.2998, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 13.88888888888889, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.0014444444444444444, |
|
"loss": 1.3001, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 13.940907199334166, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.0014423637120266333, |
|
"loss": 1.2998, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 13.992925509779443, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.0014402829796088224, |
|
"loss": 1.3013, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 14.0, |
|
"eval_loss": 1.3362102508544922, |
|
"eval_runtime": 1.3748, |
|
"eval_samples_per_second": 727.372, |
|
"eval_steps_per_second": 0.727, |
|
"step": 134568 |
|
}, |
|
{ |
|
"epoch": 14.044943820224718, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0014382022471910111, |
|
"loss": 1.2953, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 14.096962130669995, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.0014361215147732003, |
|
"loss": 1.2956, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 14.148980441115272, |
|
"grad_norm": 0.3359375, |
|
"learning_rate": 0.0014340407823553892, |
|
"loss": 1.2953, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 14.20099875156055, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.001431960049937578, |
|
"loss": 1.2959, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 14.253017062005826, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.001429879317519767, |
|
"loss": 1.2951, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 14.305035372451103, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 0.001427798585101956, |
|
"loss": 1.2948, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 14.35705368289638, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.0014257178526841448, |
|
"loss": 1.2962, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 14.409071993341657, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.001423637120266334, |
|
"loss": 1.2941, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 14.461090303786934, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 0.0014215563878485226, |
|
"loss": 1.2958, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 14.513108614232209, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 0.0014194756554307116, |
|
"loss": 1.2949, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 14.565126924677486, |
|
"grad_norm": 0.177734375, |
|
"learning_rate": 0.0014173949230129007, |
|
"loss": 1.2933, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 14.617145235122763, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0014153141905950894, |
|
"loss": 1.295, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 14.66916354556804, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0014132334581772783, |
|
"loss": 1.2944, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 14.721181856013317, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.0014111527257594674, |
|
"loss": 1.2923, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 14.773200166458594, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.0014090719933416563, |
|
"loss": 1.2925, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 14.82521847690387, |
|
"grad_norm": 0.3828125, |
|
"learning_rate": 0.0014069912609238453, |
|
"loss": 1.2907, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 14.877236787349148, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.0014049105285060342, |
|
"loss": 1.2936, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 14.929255097794425, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.001402829796088223, |
|
"loss": 1.2927, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 14.9812734082397, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.001400749063670412, |
|
"loss": 1.2911, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 15.0, |
|
"eval_loss": 1.3259565830230713, |
|
"eval_runtime": 1.5089, |
|
"eval_samples_per_second": 662.754, |
|
"eval_steps_per_second": 0.663, |
|
"step": 144180 |
|
}, |
|
{ |
|
"epoch": 15.033291718684977, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.001398668331252601, |
|
"loss": 1.2892, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 15.085310029130254, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.0013965875988347898, |
|
"loss": 1.2898, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 15.13732833957553, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.001394506866416979, |
|
"loss": 1.2916, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 15.189346650020807, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.0013924261339991677, |
|
"loss": 1.2906, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 15.241364960466084, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0013903454015813566, |
|
"loss": 1.2905, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 15.293383270911361, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0013882646691635457, |
|
"loss": 1.2898, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 15.345401581356638, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0013861839367457346, |
|
"loss": 1.2893, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 15.397419891801913, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.0013841032043279233, |
|
"loss": 1.2905, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 15.44943820224719, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.0013820224719101124, |
|
"loss": 1.2899, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 15.501456512692467, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0013799417394923014, |
|
"loss": 1.2913, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 15.553474823137744, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0013778610070744903, |
|
"loss": 1.29, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 15.605493133583021, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0013757802746566792, |
|
"loss": 1.2906, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 15.657511444028298, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.001373699542238868, |
|
"loss": 1.289, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 15.709529754473575, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0013716188098210572, |
|
"loss": 1.2894, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 15.761548064918852, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.001369538077403246, |
|
"loss": 1.2876, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 15.813566375364129, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 0.0013674573449854348, |
|
"loss": 1.2886, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 15.865584685809406, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.001365376612567624, |
|
"loss": 1.2875, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 15.917602996254681, |
|
"grad_norm": 0.1943359375, |
|
"learning_rate": 0.0013632958801498127, |
|
"loss": 1.2874, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 15.969621306699958, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0013612151477320016, |
|
"loss": 1.2876, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 16.0, |
|
"eval_loss": 1.3189575672149658, |
|
"eval_runtime": 1.5607, |
|
"eval_samples_per_second": 640.73, |
|
"eval_steps_per_second": 0.641, |
|
"step": 153792 |
|
}, |
|
{ |
|
"epoch": 16.021639617145237, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.0013591344153141907, |
|
"loss": 1.285, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 16.073657927590514, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.0013570536828963796, |
|
"loss": 1.2834, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 16.125676238035787, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.0013549729504785683, |
|
"loss": 1.2828, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 16.177694548481064, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0013528922180607575, |
|
"loss": 1.2826, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 16.22971285892634, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.0013508114856429464, |
|
"loss": 1.2825, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 16.281731169371618, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.001348730753225135, |
|
"loss": 1.2836, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 16.333749479816895, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.0013466500208073242, |
|
"loss": 1.2853, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 16.38576779026217, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.0013445692883895131, |
|
"loss": 1.2859, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 16.43778610070745, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.0013424885559717022, |
|
"loss": 1.2841, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 16.489804411152726, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.001340407823553891, |
|
"loss": 1.2834, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 16.541822721598002, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0013383270911360799, |
|
"loss": 1.2834, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 16.59384103204328, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.001336246358718269, |
|
"loss": 1.2834, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 16.645859342488556, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.001334165626300458, |
|
"loss": 1.2856, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 16.697877652933833, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.0013320848938826466, |
|
"loss": 1.2829, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 16.74989596337911, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.0013300041614648357, |
|
"loss": 1.283, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 16.801914273824387, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.0013279234290470246, |
|
"loss": 1.2848, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 16.853932584269664, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0013258426966292133, |
|
"loss": 1.2837, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 16.90595089471494, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0013237619642114025, |
|
"loss": 1.2824, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 16.957969205160218, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.0013216812317935914, |
|
"loss": 1.284, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 17.0, |
|
"eval_loss": 1.3203132152557373, |
|
"eval_runtime": 1.432, |
|
"eval_samples_per_second": 698.327, |
|
"eval_steps_per_second": 0.698, |
|
"step": 163404 |
|
}, |
|
{ |
|
"epoch": 17.00998751560549, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0013196004993757803, |
|
"loss": 1.2829, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 17.06200582605077, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.0013175197669579692, |
|
"loss": 1.2788, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 17.114024136496045, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0013154390345401581, |
|
"loss": 1.2805, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 17.166042446941322, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.0013133583021223473, |
|
"loss": 1.2815, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 17.2180607573866, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.001311277569704536, |
|
"loss": 1.2817, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 17.270079067831876, |
|
"grad_norm": 0.345703125, |
|
"learning_rate": 0.0013091968372867249, |
|
"loss": 1.2835, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 17.322097378277153, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.001307116104868914, |
|
"loss": 1.2823, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 17.37411568872243, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.001305035372451103, |
|
"loss": 1.2831, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 17.426133999167707, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0013029546400332916, |
|
"loss": 1.283, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 17.478152309612984, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.0013008739076154807, |
|
"loss": 1.2823, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 17.53017062005826, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.0012987931751976696, |
|
"loss": 1.2826, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 17.582188930503538, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0012967124427798583, |
|
"loss": 1.2823, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 17.634207240948815, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.0012946317103620475, |
|
"loss": 1.2851, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 17.68622555139409, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0012925509779442364, |
|
"loss": 1.2828, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 17.73824386183937, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.0012904702455264253, |
|
"loss": 1.2842, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 17.790262172284645, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.0012883895131086142, |
|
"loss": 1.2839, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 17.842280482729922, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0012863087806908031, |
|
"loss": 1.2848, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 17.8942987931752, |
|
"grad_norm": 1.0, |
|
"learning_rate": 0.0012842280482729923, |
|
"loss": 1.2834, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 17.946317103620473, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.0012821473158551812, |
|
"loss": 1.2839, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 17.99833541406575, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0012800665834373699, |
|
"loss": 1.2837, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 18.0, |
|
"eval_loss": 1.3176885843276978, |
|
"eval_runtime": 1.6332, |
|
"eval_samples_per_second": 612.278, |
|
"eval_steps_per_second": 0.612, |
|
"step": 173016 |
|
}, |
|
{ |
|
"epoch": 18.050353724511027, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 0.001277985851019559, |
|
"loss": 1.2788, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 18.102372034956304, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 0.001275905118601748, |
|
"loss": 1.28, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 18.15439034540158, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0012738243861839366, |
|
"loss": 1.2797, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 18.206408655846857, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 0.0012717436537661257, |
|
"loss": 1.2813, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 18.258426966292134, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 0.0012696629213483147, |
|
"loss": 1.2816, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 18.31044527673741, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.0012675821889305036, |
|
"loss": 1.282, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 18.36246358718269, |
|
"grad_norm": 0.62109375, |
|
"learning_rate": 0.0012655014565126925, |
|
"loss": 1.2821, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 18.414481897627965, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.0012634207240948814, |
|
"loss": 1.2827, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 18.466500208073242, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.0012613399916770703, |
|
"loss": 1.2802, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 18.51851851851852, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 0.0012592592592592592, |
|
"loss": 1.2803, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 18.570536828963796, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0012571785268414481, |
|
"loss": 1.2808, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 18.622555139409073, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.0012550977944236373, |
|
"loss": 1.2795, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 18.67457344985435, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0012530170620058262, |
|
"loss": 1.2789, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 18.726591760299627, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 0.0012509363295880149, |
|
"loss": 1.2805, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 18.778610070744904, |
|
"grad_norm": 1.640625, |
|
"learning_rate": 0.001248855597170204, |
|
"loss": 1.2808, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 18.83062838119018, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.001246774864752393, |
|
"loss": 1.2798, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 18.882646691635454, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.0012446941323345816, |
|
"loss": 1.2794, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 18.93466500208073, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.0012426133999167708, |
|
"loss": 1.2801, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 18.986683312526008, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 0.0012405326674989597, |
|
"loss": 1.2823, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 19.0, |
|
"eval_loss": 1.3176276683807373, |
|
"eval_runtime": 1.3968, |
|
"eval_samples_per_second": 715.946, |
|
"eval_steps_per_second": 0.716, |
|
"step": 182628 |
|
}, |
|
{ |
|
"epoch": 19.038701622971285, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.0012384519350811486, |
|
"loss": 1.2802, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 19.090719933416562, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.0012363712026633375, |
|
"loss": 1.2782, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 19.14273824386184, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.0012342904702455264, |
|
"loss": 1.2769, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 19.194756554307116, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0012322097378277153, |
|
"loss": 1.2789, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 19.246774864752393, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.0012301290054099045, |
|
"loss": 1.279, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 19.29879317519767, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 0.0012280482729920932, |
|
"loss": 1.2813, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 19.350811485642947, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 0.0012259675405742823, |
|
"loss": 1.2811, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 19.402829796088223, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0012238868081564712, |
|
"loss": 1.2826, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 19.4548481065335, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00122180607573866, |
|
"loss": 1.2811, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 19.506866416978777, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.001219725343320849, |
|
"loss": 1.2814, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 19.558884727424054, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.001217644610903038, |
|
"loss": 1.2794, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 19.61090303786933, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 0.0012155638784852269, |
|
"loss": 1.2801, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 19.662921348314608, |
|
"grad_norm": 0.435546875, |
|
"learning_rate": 0.0012134831460674158, |
|
"loss": 1.2796, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 19.714939658759885, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.0012114024136496047, |
|
"loss": 1.2785, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 19.76695796920516, |
|
"grad_norm": 5.6875, |
|
"learning_rate": 0.0012093216812317936, |
|
"loss": 1.2783, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 19.818976279650435, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 0.0012072409488139825, |
|
"loss": 1.2782, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 19.870994590095712, |
|
"grad_norm": 3.21875, |
|
"learning_rate": 0.0012051602163961714, |
|
"loss": 1.2792, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 19.92301290054099, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 0.0012030794839783603, |
|
"loss": 1.2803, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 19.975031210986266, |
|
"grad_norm": 0.419921875, |
|
"learning_rate": 0.0012009987515605495, |
|
"loss": 1.2778, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 20.0, |
|
"eval_loss": 1.3075087070465088, |
|
"eval_runtime": 1.6065, |
|
"eval_samples_per_second": 622.457, |
|
"eval_steps_per_second": 0.622, |
|
"step": 192240 |
|
}, |
|
{ |
|
"epoch": 20.027049521431543, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0011989180191427382, |
|
"loss": 1.2763, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 20.07906783187682, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0011968372867249273, |
|
"loss": 1.2755, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 20.131086142322097, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.0011947565543071162, |
|
"loss": 1.2754, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 20.183104452767374, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 0.001192675821889305, |
|
"loss": 1.2756, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 20.23512276321265, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.001190595089471494, |
|
"loss": 1.278, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 20.287141073657928, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.001188514357053683, |
|
"loss": 1.2766, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 20.339159384103205, |
|
"grad_norm": 0.466796875, |
|
"learning_rate": 0.0011864336246358719, |
|
"loss": 1.2769, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 20.39117769454848, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.0011843528922180608, |
|
"loss": 1.2788, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 20.44319600499376, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 0.0011822721598002497, |
|
"loss": 1.279, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 20.495214315439036, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 0.0011801914273824386, |
|
"loss": 1.2782, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 20.547232625884313, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0011781106949646277, |
|
"loss": 1.2773, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 20.59925093632959, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.0011760299625468164, |
|
"loss": 1.279, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 20.651269246774866, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0011739492301290053, |
|
"loss": 1.2786, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 20.70328755722014, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.0011718684977111945, |
|
"loss": 1.2788, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 20.755305867665417, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.0011697877652933832, |
|
"loss": 1.2781, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 20.807324178110694, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.001167707032875572, |
|
"loss": 1.2789, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 20.85934248855597, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 0.0011656263004577612, |
|
"loss": 1.2809, |
|
"step": 200500 |
|
}, |
|
{ |
|
"epoch": 20.911360799001248, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 0.0011635455680399501, |
|
"loss": 1.2778, |
|
"step": 201000 |
|
}, |
|
{ |
|
"epoch": 20.963379109446524, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.001161464835622139, |
|
"loss": 1.2777, |
|
"step": 201500 |
|
}, |
|
{ |
|
"epoch": 21.0, |
|
"eval_loss": 1.3104900121688843, |
|
"eval_runtime": 1.3896, |
|
"eval_samples_per_second": 719.624, |
|
"eval_steps_per_second": 0.72, |
|
"step": 201852 |
|
}, |
|
{ |
|
"epoch": 21.0153974198918, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.001159384103204328, |
|
"loss": 1.2777, |
|
"step": 202000 |
|
}, |
|
{ |
|
"epoch": 21.06741573033708, |
|
"grad_norm": 0.173828125, |
|
"learning_rate": 0.0011573033707865169, |
|
"loss": 1.2763, |
|
"step": 202500 |
|
}, |
|
{ |
|
"epoch": 21.119434040782355, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.0011552226383687058, |
|
"loss": 1.2753, |
|
"step": 203000 |
|
}, |
|
{ |
|
"epoch": 21.171452351227632, |
|
"grad_norm": 0.1875, |
|
"learning_rate": 0.0011531419059508947, |
|
"loss": 1.2752, |
|
"step": 203500 |
|
}, |
|
{ |
|
"epoch": 21.22347066167291, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0011510611735330836, |
|
"loss": 1.2754, |
|
"step": 204000 |
|
}, |
|
{ |
|
"epoch": 21.275488972118186, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0011489804411152727, |
|
"loss": 1.2756, |
|
"step": 204500 |
|
}, |
|
{ |
|
"epoch": 21.327507282563463, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.0011468997086974614, |
|
"loss": 1.2754, |
|
"step": 205000 |
|
}, |
|
{ |
|
"epoch": 21.37952559300874, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 0.0011448189762796504, |
|
"loss": 1.2753, |
|
"step": 205500 |
|
}, |
|
{ |
|
"epoch": 21.431543903454017, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.0011427382438618395, |
|
"loss": 1.2757, |
|
"step": 206000 |
|
}, |
|
{ |
|
"epoch": 21.483562213899294, |
|
"grad_norm": 0.19140625, |
|
"learning_rate": 0.0011406575114440284, |
|
"loss": 1.2755, |
|
"step": 206500 |
|
}, |
|
{ |
|
"epoch": 21.53558052434457, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 0.001138576779026217, |
|
"loss": 1.2764, |
|
"step": 207000 |
|
}, |
|
{ |
|
"epoch": 21.587598834789844, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 0.0011364960466084062, |
|
"loss": 1.2765, |
|
"step": 207500 |
|
}, |
|
{ |
|
"epoch": 21.63961714523512, |
|
"grad_norm": 0.48828125, |
|
"learning_rate": 0.0011344153141905951, |
|
"loss": 1.2756, |
|
"step": 208000 |
|
}, |
|
{ |
|
"epoch": 21.691635455680398, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 0.001132334581772784, |
|
"loss": 1.2772, |
|
"step": 208500 |
|
}, |
|
{ |
|
"epoch": 21.743653766125675, |
|
"grad_norm": 0.30859375, |
|
"learning_rate": 0.001130253849354973, |
|
"loss": 1.2757, |
|
"step": 209000 |
|
}, |
|
{ |
|
"epoch": 21.795672076570952, |
|
"grad_norm": 2.46875, |
|
"learning_rate": 0.0011281731169371619, |
|
"loss": 1.2767, |
|
"step": 209500 |
|
}, |
|
{ |
|
"epoch": 21.84769038701623, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.001126092384519351, |
|
"loss": 1.2741, |
|
"step": 210000 |
|
}, |
|
{ |
|
"epoch": 21.899708697461506, |
|
"grad_norm": 0.94921875, |
|
"learning_rate": 0.0011240116521015397, |
|
"loss": 1.277, |
|
"step": 210500 |
|
}, |
|
{ |
|
"epoch": 21.951727007906783, |
|
"grad_norm": 0.1923828125, |
|
"learning_rate": 0.0011219309196837286, |
|
"loss": 1.2761, |
|
"step": 211000 |
|
}, |
|
{ |
|
"epoch": 22.0, |
|
"eval_loss": 1.309814453125, |
|
"eval_runtime": 1.7271, |
|
"eval_samples_per_second": 578.995, |
|
"eval_steps_per_second": 0.579, |
|
"step": 211464 |
|
}, |
|
{ |
|
"epoch": 22.00374531835206, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 0.0011198501872659178, |
|
"loss": 1.2756, |
|
"step": 211500 |
|
}, |
|
{ |
|
"epoch": 22.055763628797337, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0011177694548481065, |
|
"loss": 1.2751, |
|
"step": 212000 |
|
}, |
|
{ |
|
"epoch": 22.107781939242614, |
|
"grad_norm": 0.181640625, |
|
"learning_rate": 0.0011156887224302954, |
|
"loss": 1.2746, |
|
"step": 212500 |
|
}, |
|
{ |
|
"epoch": 22.15980024968789, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0011136079900124845, |
|
"loss": 1.2754, |
|
"step": 213000 |
|
}, |
|
{ |
|
"epoch": 22.211818560133167, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.0011115272575946734, |
|
"loss": 1.2767, |
|
"step": 213500 |
|
}, |
|
{ |
|
"epoch": 22.263836870578444, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 0.0011094465251768621, |
|
"loss": 1.2737, |
|
"step": 214000 |
|
}, |
|
{ |
|
"epoch": 22.31585518102372, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.0011073657927590512, |
|
"loss": 1.2737, |
|
"step": 214500 |
|
}, |
|
{ |
|
"epoch": 22.367873491469, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0011052850603412402, |
|
"loss": 1.2735, |
|
"step": 215000 |
|
}, |
|
{ |
|
"epoch": 22.419891801914275, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.001103204327923429, |
|
"loss": 1.2733, |
|
"step": 215500 |
|
}, |
|
{ |
|
"epoch": 22.471910112359552, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.001101123595505618, |
|
"loss": 1.2723, |
|
"step": 216000 |
|
}, |
|
{ |
|
"epoch": 22.52392842280483, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.001099042863087807, |
|
"loss": 1.274, |
|
"step": 216500 |
|
}, |
|
{ |
|
"epoch": 22.575946733250102, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 0.001096962130669996, |
|
"loss": 1.2744, |
|
"step": 217000 |
|
}, |
|
{ |
|
"epoch": 22.62796504369538, |
|
"grad_norm": 0.1953125, |
|
"learning_rate": 0.0010948813982521847, |
|
"loss": 1.2758, |
|
"step": 217500 |
|
}, |
|
{ |
|
"epoch": 22.679983354140656, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.0010928006658343736, |
|
"loss": 1.2755, |
|
"step": 218000 |
|
}, |
|
{ |
|
"epoch": 22.732001664585933, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 0.0010907199334165628, |
|
"loss": 1.2742, |
|
"step": 218500 |
|
}, |
|
{ |
|
"epoch": 22.78401997503121, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 0.0010886392009987517, |
|
"loss": 1.2741, |
|
"step": 219000 |
|
}, |
|
{ |
|
"epoch": 22.836038285476487, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0010865584685809404, |
|
"loss": 1.2739, |
|
"step": 219500 |
|
}, |
|
{ |
|
"epoch": 22.888056595921764, |
|
"grad_norm": 0.166015625, |
|
"learning_rate": 0.0010844777361631295, |
|
"loss": 1.2747, |
|
"step": 220000 |
|
}, |
|
{ |
|
"epoch": 22.94007490636704, |
|
"grad_norm": 0.216796875, |
|
"learning_rate": 0.0010823970037453184, |
|
"loss": 1.2733, |
|
"step": 220500 |
|
}, |
|
{ |
|
"epoch": 22.992093216812318, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 0.0010803162713275071, |
|
"loss": 1.2734, |
|
"step": 221000 |
|
}, |
|
{ |
|
"epoch": 23.0, |
|
"eval_loss": 1.3059455156326294, |
|
"eval_runtime": 1.6804, |
|
"eval_samples_per_second": 595.106, |
|
"eval_steps_per_second": 0.595, |
|
"step": 221076 |
|
}, |
|
{ |
|
"epoch": 23.044111527257595, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 0.0010782355389096963, |
|
"loss": 1.2717, |
|
"step": 221500 |
|
}, |
|
{ |
|
"epoch": 23.096129837702872, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 0.0010761548064918852, |
|
"loss": 1.2719, |
|
"step": 222000 |
|
}, |
|
{ |
|
"epoch": 23.14814814814815, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 0.0010740740740740743, |
|
"loss": 1.2732, |
|
"step": 222500 |
|
}, |
|
{ |
|
"epoch": 23.200166458593426, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.001071993341656263, |
|
"loss": 1.2743, |
|
"step": 223000 |
|
}, |
|
{ |
|
"epoch": 23.252184769038703, |
|
"grad_norm": 0.87890625, |
|
"learning_rate": 0.001069912609238452, |
|
"loss": 1.2724, |
|
"step": 223500 |
|
}, |
|
{ |
|
"epoch": 23.30420307948398, |
|
"grad_norm": 0.423828125, |
|
"learning_rate": 0.001067831876820641, |
|
"loss": 1.271, |
|
"step": 224000 |
|
}, |
|
{ |
|
"epoch": 23.356221389929257, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.0010657511444028297, |
|
"loss": 1.2721, |
|
"step": 224500 |
|
}, |
|
{ |
|
"epoch": 23.408239700374533, |
|
"grad_norm": 0.1962890625, |
|
"learning_rate": 0.0010636704119850186, |
|
"loss": 1.2716, |
|
"step": 225000 |
|
}, |
|
{ |
|
"epoch": 23.460258010819807, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.0010615896795672078, |
|
"loss": 1.2714, |
|
"step": 225500 |
|
}, |
|
{ |
|
"epoch": 23.512276321265084, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.0010595089471493967, |
|
"loss": 1.2719, |
|
"step": 226000 |
|
}, |
|
{ |
|
"epoch": 23.56429463171036, |
|
"grad_norm": 0.189453125, |
|
"learning_rate": 0.0010574282147315854, |
|
"loss": 1.2716, |
|
"step": 226500 |
|
}, |
|
{ |
|
"epoch": 23.616312942155638, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 0.0010553474823137745, |
|
"loss": 1.272, |
|
"step": 227000 |
|
}, |
|
{ |
|
"epoch": 23.668331252600915, |
|
"grad_norm": 0.890625, |
|
"learning_rate": 0.0010532667498959634, |
|
"loss": 1.2729, |
|
"step": 227500 |
|
}, |
|
{ |
|
"epoch": 23.72034956304619, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 0.0010511860174781521, |
|
"loss": 1.2728, |
|
"step": 228000 |
|
}, |
|
{ |
|
"epoch": 23.77236787349147, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0010491052850603413, |
|
"loss": 1.2733, |
|
"step": 228500 |
|
}, |
|
{ |
|
"epoch": 23.824386183936745, |
|
"grad_norm": 0.1806640625, |
|
"learning_rate": 0.0010470245526425302, |
|
"loss": 1.2713, |
|
"step": 229000 |
|
}, |
|
{ |
|
"epoch": 23.876404494382022, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.0010449438202247193, |
|
"loss": 1.2714, |
|
"step": 229500 |
|
}, |
|
{ |
|
"epoch": 23.9284228048273, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.001042863087806908, |
|
"loss": 1.2724, |
|
"step": 230000 |
|
}, |
|
{ |
|
"epoch": 23.980441115272576, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 0.001040782355389097, |
|
"loss": 1.2716, |
|
"step": 230500 |
|
}, |
|
{ |
|
"epoch": 24.0, |
|
"eval_loss": 1.305156946182251, |
|
"eval_runtime": 2.3203, |
|
"eval_samples_per_second": 430.983, |
|
"eval_steps_per_second": 0.431, |
|
"step": 230688 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 480600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 5.304114629940989e+19, |
|
"train_batch_size": 1024, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|