diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,15148 +1,3461 @@ { - "best_metric": 1.4233994483947754, - "best_model_checkpoint": "./results/models/checkpoint-922740", - "epoch": 7.0, + "best_metric": 1.305156946182251, + "best_model_checkpoint": "./results/models/checkpoint-230688", + "epoch": 24.0, "eval_steps": 500, - "global_step": 1076530, + "global_step": 230688, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.003251186683139346, - "grad_norm": 1.3671875, - "learning_rate": 0.0019998699525326743, - "loss": 2.8127, + "epoch": 0.05201831044527674, + "grad_norm": 0.34375, + "learning_rate": 0.001997919267582189, + "loss": 2.3383, "step": 500 }, { - "epoch": 0.006502373366278692, - "grad_norm": 67.0, - "learning_rate": 0.001999739905065349, - "loss": 2.4852, + "epoch": 0.10403662089055347, + "grad_norm": 0.93359375, + "learning_rate": 0.001995838535164378, + "loss": 1.9394, "step": 1000 }, { - "epoch": 0.009753560049418037, - "grad_norm": 0.7890625, - "learning_rate": 0.0019996098575980233, - "loss": 2.3924, + "epoch": 0.1560549313358302, + "grad_norm": 0.5078125, + "learning_rate": 0.001993757802746567, + "loss": 1.8509, "step": 1500 }, { - "epoch": 0.013004746732557384, - "grad_norm": 0.7578125, - "learning_rate": 0.0019994798101306975, - "loss": 2.3235, + "epoch": 0.20807324178110695, + "grad_norm": 0.341796875, + "learning_rate": 0.0019916770703287557, + "loss": 1.8119, "step": 2000 }, { - "epoch": 0.01625593341569673, - "grad_norm": 0.85546875, - "learning_rate": 0.0019993497626633722, - "loss": 2.2647, + "epoch": 0.2600915522263837, + "grad_norm": 0.30859375, + "learning_rate": 0.0019895963379109446, + "loss": 1.746, "step": 2500 }, { - "epoch": 0.019507120098836074, - "grad_norm": 0.67578125, - "learning_rate": 0.0019992197151960465, - "loss": 2.2077, + "epoch": 0.3121098626716604, + "grad_norm": 0.4921875, + "learning_rate": 0.0019875156054931335, + "loss": 1.7113, "step": 3000 }, { - "epoch": 0.02275830678197542, - "grad_norm": 0.7890625, - "learning_rate": 0.0019990896677287207, - "loss": 2.1543, + "epoch": 0.3641281731169372, + "grad_norm": 0.37109375, + "learning_rate": 0.0019854348730753224, + "loss": 1.6861, "step": 3500 }, { - "epoch": 0.026009493465114768, - "grad_norm": 0.80078125, - "learning_rate": 0.0019989596202613954, - "loss": 2.1291, + "epoch": 0.4161464835622139, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019833541406575114, + "loss": 1.6518, "step": 4000 }, { - "epoch": 0.02926068014825411, - "grad_norm": 0.92578125, - "learning_rate": 0.00199882957279407, - "loss": 2.1703, + "epoch": 0.4681647940074906, + "grad_norm": 1.2578125, + "learning_rate": 0.0019812734082397003, + "loss": 1.6257, "step": 4500 }, { - "epoch": 0.03251186683139346, - "grad_norm": 0.859375, - "learning_rate": 0.0019986995253267444, - "loss": 2.1796, + "epoch": 0.5201831044527674, + "grad_norm": 0.322265625, + "learning_rate": 0.0019791926758218896, + "loss": 1.6184, "step": 5000 }, { - "epoch": 0.0357630535145328, - "grad_norm": 0.96875, - "learning_rate": 0.0019985694778594187, - "loss": 2.1274, + "epoch": 0.5722014148980441, + "grad_norm": 0.2177734375, + "learning_rate": 0.001977111943404078, + "loss": 1.6034, "step": 5500 }, { - "epoch": 0.03901424019767215, - "grad_norm": 0.890625, - "learning_rate": 0.0019984394303920934, - "loss": 2.0801, + "epoch": 0.6242197253433208, + "grad_norm": 0.46875, + "learning_rate": 
0.001975031210986267, + "loss": 1.5798, "step": 6000 }, { - "epoch": 0.042265426880811495, - "grad_norm": 0.9296875, - "learning_rate": 0.0019983093829247676, - "loss": 2.0708, + "epoch": 0.6762380357885975, + "grad_norm": 0.349609375, + "learning_rate": 0.0019729504785684564, + "loss": 1.6023, "step": 6500 }, { - "epoch": 0.04551661356395084, - "grad_norm": 1.0, - "learning_rate": 0.001998179335457442, - "loss": 2.0459, + "epoch": 0.7282563462338744, + "grad_norm": 0.314453125, + "learning_rate": 0.0019708697461506453, + "loss": 1.6354, "step": 7000 }, { - "epoch": 0.04876780024709019, - "grad_norm": 0.67578125, - "learning_rate": 0.0019980492879901166, - "loss": 2.0342, + "epoch": 0.7802746566791511, + "grad_norm": 0.259765625, + "learning_rate": 0.0019687890137328337, + "loss": 1.6039, "step": 7500 }, { - "epoch": 0.052018986930229535, - "grad_norm": 0.90234375, - "learning_rate": 0.001997919240522791, - "loss": 2.0241, + "epoch": 0.8322929671244278, + "grad_norm": 0.48046875, + "learning_rate": 0.001966708281315023, + "loss": 1.5863, "step": 8000 }, { - "epoch": 0.05527017361336888, - "grad_norm": 1.0, - "learning_rate": 0.001997789193055465, - "loss": 2.0065, + "epoch": 0.8843112775697045, + "grad_norm": 0.99609375, + "learning_rate": 0.001964627548897212, + "loss": 1.5758, "step": 8500 }, { - "epoch": 0.05852136029650822, - "grad_norm": 0.671875, - "learning_rate": 0.00199765914558814, - "loss": 1.992, + "epoch": 0.9363295880149812, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019625468164794005, + "loss": 1.5658, "step": 9000 }, { - "epoch": 0.06177254697964757, - "grad_norm": 1.1328125, - "learning_rate": 0.001997529098120814, - "loss": 1.9743, + "epoch": 0.9883478984602581, + "grad_norm": 0.296875, + "learning_rate": 0.00196046608406159, + "loss": 1.5664, "step": 9500 }, { - "epoch": 0.06502373366278692, - "grad_norm": 0.8671875, - "learning_rate": 0.0019973990506534883, - "loss": 1.9478, + "epoch": 1.0, + "eval_loss": 1.6215704679489136, + "eval_runtime": 1.5075, + "eval_samples_per_second": 663.37, + "eval_steps_per_second": 0.663, + "step": 9612 + }, + { + "epoch": 1.0403662089055348, + "grad_norm": 0.28515625, + "learning_rate": 0.0019583853516437788, + "loss": 1.5684, "step": 10000 }, { - "epoch": 0.06827492034592626, - "grad_norm": 0.6796875, - "learning_rate": 0.001997269003186163, - "loss": 1.943, + "epoch": 1.0923845193508115, + "grad_norm": 0.75, + "learning_rate": 0.0019563046192259677, + "loss": 1.5536, "step": 10500 }, { - "epoch": 0.0715261070290656, - "grad_norm": 0.90625, - "learning_rate": 0.0019971389557188377, - "loss": 1.92, + "epoch": 1.1444028297960882, + "grad_norm": 0.29296875, + "learning_rate": 0.0019542238868081566, + "loss": 1.5495, "step": 11000 }, { - "epoch": 0.07477729371220496, - "grad_norm": 0.71484375, - "learning_rate": 0.001997008908251512, - "loss": 1.9189, + "epoch": 1.196421140241365, + "grad_norm": 0.25, + "learning_rate": 0.0019521431543903455, + "loss": 1.529, "step": 11500 }, { - "epoch": 0.0780284803953443, - "grad_norm": 1.015625, - "learning_rate": 0.0019968788607841862, - "loss": 1.9228, + "epoch": 1.2484394506866416, + "grad_norm": 0.38671875, + "learning_rate": 0.0019500624219725344, + "loss": 1.5307, "step": 12000 }, { - "epoch": 0.08127966707848365, - "grad_norm": 0.65234375, - "learning_rate": 0.001996748813316861, - "loss": 1.9151, + "epoch": 1.3004577611319184, + "grad_norm": 0.21875, + "learning_rate": 0.0019479816895547233, + "loss": 1.5422, "step": 12500 }, { - "epoch": 0.08453085376162299, - "grad_norm": 0.9453125, - 
"learning_rate": 0.001996618765849535, - "loss": 1.9197, + "epoch": 1.352476071577195, + "grad_norm": 0.34375, + "learning_rate": 0.0019459009571369122, + "loss": 1.5281, "step": 13000 }, { - "epoch": 0.08778204044476234, - "grad_norm": 0.94140625, - "learning_rate": 0.0019964887183822094, - "loss": 1.9055, + "epoch": 1.404494382022472, + "grad_norm": 0.2421875, + "learning_rate": 0.0019438202247191011, + "loss": 1.5232, "step": 13500 }, { - "epoch": 0.09103322712790168, - "grad_norm": 0.64453125, - "learning_rate": 0.001996358670914884, - "loss": 1.8883, + "epoch": 1.4565126924677487, + "grad_norm": 0.98828125, + "learning_rate": 0.00194173949230129, + "loss": 1.5286, "step": 14000 }, { - "epoch": 0.09428441381104102, - "grad_norm": 0.9453125, - "learning_rate": 0.0019962286234475584, - "loss": 1.89, + "epoch": 1.5085310029130254, + "grad_norm": 0.72265625, + "learning_rate": 0.001939658759883479, + "loss": 1.5286, "step": 14500 }, { - "epoch": 0.09753560049418038, - "grad_norm": 0.71875, - "learning_rate": 0.0019960985759802327, - "loss": 1.8763, + "epoch": 1.5605493133583022, + "grad_norm": 0.341796875, + "learning_rate": 0.001937578027465668, + "loss": 1.5173, "step": 15000 }, { - "epoch": 0.10078678717731972, - "grad_norm": 0.7265625, - "learning_rate": 0.0019959685285129074, - "loss": 1.8751, + "epoch": 1.6125676238035789, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019354972950478568, + "loss": 1.5055, "step": 15500 }, { - "epoch": 0.10403797386045907, - "grad_norm": 0.9453125, - "learning_rate": 0.0019958384810455816, - "loss": 1.8804, + "epoch": 1.6645859342488556, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019334165626300457, + "loss": 1.5071, "step": 16000 }, { - "epoch": 0.10728916054359841, - "grad_norm": 0.6953125, - "learning_rate": 0.001995708433578256, - "loss": 1.8864, + "epoch": 1.7166042446941323, + "grad_norm": 0.37890625, + "learning_rate": 0.0019313358302122348, + "loss": 1.5071, "step": 16500 }, { - "epoch": 0.11054034722673776, - "grad_norm": 0.70703125, - "learning_rate": 0.0019955783861109306, - "loss": 1.8726, + "epoch": 1.768622555139409, + "grad_norm": 0.451171875, + "learning_rate": 0.0019292550977944235, + "loss": 1.5211, "step": 17000 }, { - "epoch": 0.1137915339098771, - "grad_norm": 0.88671875, - "learning_rate": 0.001995448338643605, - "loss": 1.8611, + "epoch": 1.8206408655846857, + "grad_norm": 0.25, + "learning_rate": 0.0019271743653766125, + "loss": 1.5211, "step": 17500 }, { - "epoch": 0.11704272059301644, - "grad_norm": 0.7734375, - "learning_rate": 0.001995318291176279, - "loss": 1.8544, + "epoch": 1.8726591760299627, + "grad_norm": 0.41796875, + "learning_rate": 0.0019250936329588016, + "loss": 1.5159, "step": 18000 }, { - "epoch": 0.1202939072761558, - "grad_norm": 2.34375, - "learning_rate": 0.001995188243708954, - "loss": 1.8469, + "epoch": 1.9246774864752392, + "grad_norm": 0.466796875, + "learning_rate": 0.0019230129005409905, + "loss": 1.5006, "step": 18500 }, { - "epoch": 0.12354509395929514, - "grad_norm": 0.671875, - "learning_rate": 0.0019950581962416285, - "loss": 1.8493, + "epoch": 1.9766957969205161, + "grad_norm": 0.451171875, + "learning_rate": 0.0019209321681231794, + "loss": 1.5082, "step": 19000 }, { - "epoch": 0.12679628064243448, - "grad_norm": 2.40625, - "learning_rate": 0.0019949281487743028, - "loss": 1.8449, + "epoch": 2.0, + "eval_loss": 1.5491766929626465, + "eval_runtime": 1.6608, + "eval_samples_per_second": 602.119, + "eval_steps_per_second": 0.602, + "step": 19224 + }, + { + "epoch": 
2.0287141073657926, + "grad_norm": 0.296875, + "learning_rate": 0.0019188514357053683, + "loss": 1.5182, "step": 19500 }, { - "epoch": 0.13004746732557385, - "grad_norm": 0.921875, - "learning_rate": 0.001994798101306977, - "loss": 1.8341, + "epoch": 2.0807324178110695, + "grad_norm": 0.28125, + "learning_rate": 0.0019167707032875572, + "loss": 1.5172, "step": 20000 }, { - "epoch": 0.13329865400871319, - "grad_norm": 1.0546875, - "learning_rate": 0.0019946680538396517, - "loss": 1.8258, + "epoch": 2.132750728256346, + "grad_norm": 0.37109375, + "learning_rate": 0.0019146899708697464, + "loss": 1.5101, "step": 20500 }, { - "epoch": 0.13654984069185253, - "grad_norm": 0.66015625, - "learning_rate": 0.001994538006372326, - "loss": 1.8294, + "epoch": 2.184769038701623, + "grad_norm": 0.2392578125, + "learning_rate": 0.001912609238451935, + "loss": 1.5086, "step": 21000 }, { - "epoch": 0.13980102737499187, - "grad_norm": 0.890625, - "learning_rate": 0.0019944079589050002, - "loss": 1.8346, + "epoch": 2.2367873491468995, + "grad_norm": 0.2421875, + "learning_rate": 0.001910528506034124, + "loss": 1.4943, "step": 21500 }, { - "epoch": 0.1430522140581312, - "grad_norm": 0.828125, - "learning_rate": 0.001994277911437675, - "loss": 1.8136, + "epoch": 2.2888056595921764, + "grad_norm": 0.296875, + "learning_rate": 0.0019084477736163131, + "loss": 1.4848, "step": 22000 }, { - "epoch": 0.14630340074127057, - "grad_norm": 0.94921875, - "learning_rate": 0.001994147863970349, - "loss": 1.8161, + "epoch": 2.3408239700374533, + "grad_norm": 0.306640625, + "learning_rate": 0.0019063670411985018, + "loss": 1.4823, "step": 22500 }, { - "epoch": 0.1495545874244099, - "grad_norm": 0.97265625, - "learning_rate": 0.0019940178165030235, - "loss": 1.8165, + "epoch": 2.39284228048273, + "grad_norm": 0.26953125, + "learning_rate": 0.0019042863087806907, + "loss": 1.4702, "step": 23000 }, { - "epoch": 0.15280577410754925, - "grad_norm": 0.65625, - "learning_rate": 0.001993887769035698, - "loss": 1.8134, + "epoch": 2.444860590928007, + "grad_norm": 1.296875, + "learning_rate": 0.0019022055763628799, + "loss": 1.4673, "step": 23500 }, { - "epoch": 0.1560569607906886, - "grad_norm": 0.7421875, - "learning_rate": 0.0019937577215683724, - "loss": 1.8177, + "epoch": 2.4968789013732833, + "grad_norm": 0.26953125, + "learning_rate": 0.0019001248439450688, + "loss": 1.4706, "step": 24000 }, { - "epoch": 0.15930814747382796, - "grad_norm": 0.62109375, - "learning_rate": 0.0019936276741010467, - "loss": 1.8106, + "epoch": 2.54889721181856, + "grad_norm": 0.75, + "learning_rate": 0.0018980441115272575, + "loss": 1.4635, "step": 24500 }, { - "epoch": 0.1625593341569673, - "grad_norm": 0.91796875, - "learning_rate": 0.0019934976266337214, - "loss": 1.808, + "epoch": 2.6009155222638367, + "grad_norm": 0.201171875, + "learning_rate": 0.0018959633791094466, + "loss": 1.4499, "step": 25000 }, { - "epoch": 0.16581052084010664, - "grad_norm": 0.74609375, - "learning_rate": 0.001993367579166396, - "loss": 1.804, + "epoch": 2.6529338327091136, + "grad_norm": 0.3125, + "learning_rate": 0.0018938826466916355, + "loss": 1.4453, "step": 25500 }, { - "epoch": 0.16906170752324598, - "grad_norm": 8.6875, - "learning_rate": 0.0019932375316990703, - "loss": 1.7985, + "epoch": 2.70495214315439, + "grad_norm": 0.27734375, + "learning_rate": 0.0018918019142738244, + "loss": 1.4463, "step": 26000 }, { - "epoch": 0.17231289420638532, - "grad_norm": 0.64453125, - "learning_rate": 0.0019931074842317446, - "loss": 1.7876, + "epoch": 2.756970453599667, + 
"grad_norm": 0.283203125, + "learning_rate": 0.0018897211818560133, + "loss": 1.452, "step": 26500 }, { - "epoch": 0.1755640808895247, - "grad_norm": 0.82421875, - "learning_rate": 0.0019929774367644193, - "loss": 1.787, + "epoch": 2.808988764044944, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018876404494382023, + "loss": 1.448, "step": 27000 }, { - "epoch": 0.17881526757266403, - "grad_norm": 0.9140625, - "learning_rate": 0.0019928473892970935, - "loss": 1.7812, + "epoch": 2.8610070744902205, + "grad_norm": 0.244140625, + "learning_rate": 0.0018855597170203914, + "loss": 1.4525, "step": 27500 }, { - "epoch": 0.18206645425580337, - "grad_norm": 0.7578125, - "learning_rate": 0.001992717341829768, - "loss": 1.785, + "epoch": 2.9130253849354975, + "grad_norm": 0.2412109375, + "learning_rate": 0.00188347898460258, + "loss": 1.4457, "step": 28000 }, { - "epoch": 0.1853176409389427, - "grad_norm": 0.640625, - "learning_rate": 0.0019925872943624425, - "loss": 1.7763, + "epoch": 2.965043695380774, + "grad_norm": 0.60546875, + "learning_rate": 0.001881398252184769, + "loss": 1.4468, "step": 28500 }, { - "epoch": 0.18856882762208205, - "grad_norm": 0.8125, - "learning_rate": 0.0019924572468951168, - "loss": 1.8051, + "epoch": 3.0, + "eval_loss": 1.4870332479476929, + "eval_runtime": 1.4668, + "eval_samples_per_second": 681.76, + "eval_steps_per_second": 0.682, + "step": 28836 + }, + { + "epoch": 3.017062005826051, + "grad_norm": 0.349609375, + "learning_rate": 0.0018793175197669581, + "loss": 1.4453, "step": 29000 }, { - "epoch": 0.19182001430522141, - "grad_norm": 0.765625, - "learning_rate": 0.001992327199427791, - "loss": 1.7849, + "epoch": 3.0690803162713274, + "grad_norm": 1.4453125, + "learning_rate": 0.001877236787349147, + "loss": 1.4455, "step": 29500 }, { - "epoch": 0.19507120098836075, - "grad_norm": 0.69140625, - "learning_rate": 0.0019921971519604657, - "loss": 1.7797, + "epoch": 3.1210986267166043, + "grad_norm": 0.2314453125, + "learning_rate": 0.0018751560549313357, + "loss": 1.4378, "step": 30000 }, { - "epoch": 0.1983223876715001, - "grad_norm": 0.7890625, - "learning_rate": 0.00199206710449314, - "loss": 1.7877, + "epoch": 3.173116937161881, + "grad_norm": 0.326171875, + "learning_rate": 0.0018730753225135249, + "loss": 1.4342, "step": 30500 }, { - "epoch": 0.20157357435463943, - "grad_norm": 0.71484375, - "learning_rate": 0.0019919370570258142, - "loss": 1.78, + "epoch": 3.2251352476071578, + "grad_norm": 5.09375, + "learning_rate": 0.0018709945900957138, + "loss": 1.4401, "step": 31000 }, { - "epoch": 0.2048247610377788, - "grad_norm": 0.7734375, - "learning_rate": 0.001991807009558489, - "loss": 1.7848, + "epoch": 3.2771535580524347, + "grad_norm": 0.31640625, + "learning_rate": 0.0018689138576779025, + "loss": 1.4317, "step": 31500 }, { - "epoch": 0.20807594772091814, - "grad_norm": 0.7734375, - "learning_rate": 0.001991676962091163, - "loss": 1.7725, + "epoch": 3.329171868497711, + "grad_norm": 0.291015625, + "learning_rate": 0.0018668331252600916, + "loss": 1.4252, "step": 32000 }, { - "epoch": 0.21132713440405748, - "grad_norm": 0.796875, - "learning_rate": 0.0019915469146238375, - "loss": 1.7705, + "epoch": 3.381190178942988, + "grad_norm": 0.435546875, + "learning_rate": 0.0018647523928422805, + "loss": 1.427, "step": 32500 }, { - "epoch": 0.21457832108719682, - "grad_norm": 0.69921875, - "learning_rate": 0.001991416867156512, - "loss": 1.7645, + "epoch": 3.4332084893882646, + "grad_norm": 0.56640625, + "learning_rate": 0.0018626716604244697, + "loss": 1.4207, 
"step": 33000 }, { - "epoch": 0.21782950777033616, - "grad_norm": 1.59375, - "learning_rate": 0.001991286819689187, - "loss": 1.7673, + "epoch": 3.4852267998335416, + "grad_norm": 0.31640625, + "learning_rate": 0.0018605909280066584, + "loss": 1.4209, "step": 33500 }, { - "epoch": 0.22108069445347553, - "grad_norm": 0.77734375, - "learning_rate": 0.001991156772221861, - "loss": 1.7758, + "epoch": 3.537245110278818, + "grad_norm": 0.255859375, + "learning_rate": 0.0018585101955888473, + "loss": 1.418, "step": 34000 }, { - "epoch": 0.22433188113661487, - "grad_norm": 2.09375, - "learning_rate": 0.0019910267247545354, - "loss": 1.7593, + "epoch": 3.589263420724095, + "grad_norm": 0.33984375, + "learning_rate": 0.0018564294631710364, + "loss": 1.4153, "step": 34500 }, { - "epoch": 0.2275830678197542, - "grad_norm": 0.9140625, - "learning_rate": 0.00199089667728721, - "loss": 1.756, + "epoch": 3.6412817311693715, + "grad_norm": 0.404296875, + "learning_rate": 0.001854348730753225, + "loss": 1.4171, "step": 35000 }, { - "epoch": 0.23083425450289355, - "grad_norm": 1.28125, - "learning_rate": 0.0019907666298198843, - "loss": 1.7467, + "epoch": 3.6933000416146484, + "grad_norm": 0.7421875, + "learning_rate": 0.001852267998335414, + "loss": 1.4203, "step": 35500 }, { - "epoch": 0.2340854411860329, - "grad_norm": 0.83203125, - "learning_rate": 0.0019906365823525586, - "loss": 1.7501, + "epoch": 3.7453183520599254, + "grad_norm": 0.265625, + "learning_rate": 0.0018501872659176031, + "loss": 1.4189, "step": 36000 }, { - "epoch": 0.23733662786917226, - "grad_norm": 0.6328125, - "learning_rate": 0.0019905065348852333, - "loss": 1.7459, + "epoch": 3.797336662505202, + "grad_norm": 0.53125, + "learning_rate": 0.001848106533499792, + "loss": 1.4212, "step": 36500 }, { - "epoch": 0.2405878145523116, - "grad_norm": 0.78515625, - "learning_rate": 0.0019903764874179075, - "loss": 1.7398, + "epoch": 3.8493549729504783, + "grad_norm": 0.2158203125, + "learning_rate": 0.0018460258010819808, + "loss": 1.4151, "step": 37000 }, { - "epoch": 0.24383900123545094, - "grad_norm": 0.69921875, - "learning_rate": 0.001990246439950582, - "loss": 1.7416, + "epoch": 3.9013732833957553, + "grad_norm": 0.349609375, + "learning_rate": 0.0018439450686641699, + "loss": 1.4087, "step": 37500 }, { - "epoch": 0.24709018791859028, - "grad_norm": 0.7578125, - "learning_rate": 0.0019901163924832565, - "loss": 1.7476, + "epoch": 3.9533915938410322, + "grad_norm": 0.228515625, + "learning_rate": 0.0018418643362463588, + "loss": 1.4038, "step": 38000 }, { - "epoch": 0.2503413746017296, - "grad_norm": 0.78515625, - "learning_rate": 0.0019899863450159308, - "loss": 1.7383, + "epoch": 4.0, + "eval_loss": 1.4296818971633911, + "eval_runtime": 1.3293, + "eval_samples_per_second": 752.251, + "eval_steps_per_second": 0.752, + "step": 38448 + }, + { + "epoch": 4.005409904286309, + "grad_norm": 2.53125, + "learning_rate": 0.0018397836038285475, + "loss": 1.4037, "step": 38500 }, { - "epoch": 0.25359256128486896, - "grad_norm": 0.578125, - "learning_rate": 0.001989856297548605, - "loss": 1.7379, + "epoch": 4.057428214731585, + "grad_norm": 1.703125, + "learning_rate": 0.0018377028714107366, + "loss": 1.402, "step": 39000 }, { - "epoch": 0.2568437479680083, - "grad_norm": 0.65625, - "learning_rate": 0.0019897262500812797, - "loss": 1.7357, + "epoch": 4.109446525176862, + "grad_norm": 0.2138671875, + "learning_rate": 0.0018356221389929255, + "loss": 1.3972, "step": 39500 }, { - "epoch": 0.2600949346511477, - "grad_norm": 0.7421875, - "learning_rate": 
0.0019895962026139544, - "loss": 1.7301, + "epoch": 4.161464835622139, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018335414065751145, + "loss": 1.3996, "step": 40000 }, { - "epoch": 0.26334612133428703, - "grad_norm": 0.62890625, - "learning_rate": 0.0019894661551466287, - "loss": 1.7312, + "epoch": 4.213483146067416, + "grad_norm": 0.455078125, + "learning_rate": 0.0018314606741573034, + "loss": 1.3989, "step": 40500 }, { - "epoch": 0.26659730801742637, - "grad_norm": 1.703125, - "learning_rate": 0.001989336107679303, - "loss": 1.7451, + "epoch": 4.265501456512692, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018293799417394923, + "loss": 1.3945, "step": 41000 }, { - "epoch": 0.2698484947005657, - "grad_norm": 4.5, - "learning_rate": 0.0019892060602119776, - "loss": 1.7367, + "epoch": 4.317519766957969, + "grad_norm": 0.267578125, + "learning_rate": 0.0018272992093216814, + "loss": 1.3936, "step": 41500 }, { - "epoch": 0.27309968138370505, - "grad_norm": 0.78125, - "learning_rate": 0.001989076012744652, - "loss": 1.7231, + "epoch": 4.369538077403246, + "grad_norm": 0.384765625, + "learning_rate": 0.0018252184769038703, + "loss": 1.3906, "step": 42000 }, { - "epoch": 0.2763508680668444, - "grad_norm": 0.82421875, - "learning_rate": 0.001988945965277326, - "loss": 1.7225, + "epoch": 4.421556387848523, + "grad_norm": 0.380859375, + "learning_rate": 0.001823137744486059, + "loss": 1.3946, "step": 42500 }, { - "epoch": 0.27960205474998373, - "grad_norm": 0.73046875, - "learning_rate": 0.001988815917810001, - "loss": 1.7198, + "epoch": 4.473574698293799, + "grad_norm": 0.353515625, + "learning_rate": 0.0018210570120682482, + "loss": 1.4069, "step": 43000 }, { - "epoch": 0.28285324143312307, - "grad_norm": 0.765625, - "learning_rate": 0.001988685870342675, - "loss": 1.7255, + "epoch": 4.525593008739076, + "grad_norm": 0.373046875, + "learning_rate": 0.001818976279650437, + "loss": 1.4049, "step": 43500 }, { - "epoch": 0.2861044281162624, - "grad_norm": 0.7890625, - "learning_rate": 0.0019885558228753494, - "loss": 1.7231, + "epoch": 4.577611319184353, + "grad_norm": 0.263671875, + "learning_rate": 0.0018168955472326258, + "loss": 1.3995, "step": 44000 }, { - "epoch": 0.2893556147994018, - "grad_norm": 0.82421875, - "learning_rate": 0.001988425775408024, - "loss": 1.7365, + "epoch": 4.62962962962963, + "grad_norm": 0.392578125, + "learning_rate": 0.001814814814814815, + "loss": 1.4053, "step": 44500 }, { - "epoch": 0.29260680148254115, - "grad_norm": 0.67578125, - "learning_rate": 0.0019882957279406983, - "loss": 1.7237, + "epoch": 4.681647940074907, + "grad_norm": 0.5078125, + "learning_rate": 0.0018127340823970038, + "loss": 1.4, "step": 45000 }, { - "epoch": 0.2958579881656805, - "grad_norm": 0.859375, - "learning_rate": 0.0019881656804733726, - "loss": 1.7336, + "epoch": 4.733666250520183, + "grad_norm": 0.255859375, + "learning_rate": 0.0018106533499791927, + "loss": 1.3946, "step": 45500 }, { - "epoch": 0.2991091748488198, - "grad_norm": 0.984375, - "learning_rate": 0.0019880356330060473, - "loss": 1.7488, + "epoch": 4.78568456096546, + "grad_norm": 0.498046875, + "learning_rate": 0.0018085726175613816, + "loss": 1.3914, "step": 46000 }, { - "epoch": 0.30236036153195917, - "grad_norm": 1.0, - "learning_rate": 0.0019879055855387216, - "loss": 1.7362, + "epoch": 4.837702871410737, + "grad_norm": 1.0703125, + "learning_rate": 0.0018064918851435705, + "loss": 1.3882, "step": 46500 }, { - "epoch": 0.3056115482150985, - "grad_norm": 0.58984375, - "learning_rate": 
0.001987775538071396, - "loss": 1.7358, + "epoch": 4.889721181856014, + "grad_norm": 0.30859375, + "learning_rate": 0.0018044111527257595, + "loss": 1.3905, "step": 47000 }, { - "epoch": 0.30886273489823785, - "grad_norm": 0.7265625, - "learning_rate": 0.0019876454906040705, - "loss": 1.7274, + "epoch": 4.94173949230129, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018023304203079484, + "loss": 1.3927, "step": 47500 }, { - "epoch": 0.3121139215813772, - "grad_norm": 0.87890625, - "learning_rate": 0.001987515443136745, - "loss": 1.7265, + "epoch": 4.9937578027465666, + "grad_norm": 0.23828125, + "learning_rate": 0.0018002496878901373, + "loss": 1.3883, "step": 48000 }, { - "epoch": 0.3153651082645165, - "grad_norm": 0.70703125, - "learning_rate": 0.0019873853956694195, - "loss": 1.7121, + "epoch": 5.0, + "eval_loss": 1.4223600625991821, + "eval_runtime": 1.6852, + "eval_samples_per_second": 593.419, + "eval_steps_per_second": 0.593, + "step": 48060 + }, + { + "epoch": 5.0457761131918435, + "grad_norm": 0.29296875, + "learning_rate": 0.0017981689554723264, + "loss": 1.3873, "step": 48500 }, { - "epoch": 0.3186162949476559, - "grad_norm": 1.1328125, - "learning_rate": 0.0019872553482020937, - "loss": 1.7082, + "epoch": 5.09779442363712, + "grad_norm": 0.203125, + "learning_rate": 0.0017960882230545153, + "loss": 1.3831, "step": 49000 }, { - "epoch": 0.32186748163079526, - "grad_norm": 0.71875, - "learning_rate": 0.0019871253007347684, - "loss": 1.7064, + "epoch": 5.149812734082397, + "grad_norm": 0.251953125, + "learning_rate": 0.001794007490636704, + "loss": 1.3784, "step": 49500 }, { - "epoch": 0.3251186683139346, - "grad_norm": 0.79296875, - "learning_rate": 0.0019869952532674427, - "loss": 1.7093, + "epoch": 5.201831044527673, + "grad_norm": 0.271484375, + "learning_rate": 0.0017919267582188932, + "loss": 1.3821, "step": 50000 }, { - "epoch": 0.32836985499707394, - "grad_norm": 0.92578125, - "learning_rate": 0.001986865205800117, - "loss": 1.7092, + "epoch": 5.25384935497295, + "grad_norm": 0.451171875, + "learning_rate": 0.001789846025801082, + "loss": 1.3781, "step": 50500 }, { - "epoch": 0.3316210416802133, - "grad_norm": 0.71484375, - "learning_rate": 0.0019867351583327916, - "loss": 1.7031, + "epoch": 5.305867665418227, + "grad_norm": 0.33203125, + "learning_rate": 0.0017877652933832708, + "loss": 1.381, "step": 51000 }, { - "epoch": 0.3348722283633526, - "grad_norm": 0.7890625, - "learning_rate": 0.001986605110865466, - "loss": 1.7065, + "epoch": 5.357885975863504, + "grad_norm": 0.40625, + "learning_rate": 0.00178568456096546, + "loss": 1.381, "step": 51500 }, { - "epoch": 0.33812341504649196, - "grad_norm": 5.84375, - "learning_rate": 0.00198647506339814, - "loss": 1.7007, + "epoch": 5.40990428630878, + "grad_norm": 0.263671875, + "learning_rate": 0.0017836038285476488, + "loss": 1.375, "step": 52000 }, { - "epoch": 0.3413746017296313, - "grad_norm": 0.62890625, - "learning_rate": 0.001986345015930815, - "loss": 1.7033, + "epoch": 5.461922596754057, + "grad_norm": 0.314453125, + "learning_rate": 0.0017815230961298377, + "loss": 1.3776, "step": 52500 }, { - "epoch": 0.34462578841277064, - "grad_norm": 0.56640625, - "learning_rate": 0.001986214968463489, - "loss": 1.6962, + "epoch": 5.513940907199334, + "grad_norm": 0.30078125, + "learning_rate": 0.0017794423637120266, + "loss": 1.3773, "step": 53000 }, { - "epoch": 0.34787697509591003, - "grad_norm": 0.58984375, - "learning_rate": 0.0019860849209961634, - "loss": 1.6886, + "epoch": 5.565959217644611, + "grad_norm": 
0.228515625, + "learning_rate": 0.0017773616312942156, + "loss": 1.3809, "step": 53500 }, { - "epoch": 0.3511281617790494, - "grad_norm": 0.77734375, - "learning_rate": 0.001985954873528838, - "loss": 1.6864, + "epoch": 5.617977528089888, + "grad_norm": 0.8671875, + "learning_rate": 0.0017752808988764045, + "loss": 1.3786, "step": 54000 }, { - "epoch": 0.3543793484621887, - "grad_norm": 0.8203125, - "learning_rate": 0.0019858248260615128, - "loss": 1.6905, + "epoch": 5.669995838535164, + "grad_norm": 0.275390625, + "learning_rate": 0.0017732001664585936, + "loss": 1.3762, "step": 54500 }, { - "epoch": 0.35763053514532805, - "grad_norm": 1.0546875, - "learning_rate": 0.001985694778594187, - "loss": 1.696, + "epoch": 5.722014148980441, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017711194340407823, + "loss": 1.3741, "step": 55000 }, { - "epoch": 0.3608817218284674, - "grad_norm": 0.66015625, - "learning_rate": 0.0019855647311268613, - "loss": 1.6931, + "epoch": 5.774032459425718, + "grad_norm": 0.224609375, + "learning_rate": 0.0017690387016229714, + "loss": 1.3719, "step": 55500 }, { - "epoch": 0.36413290851160673, - "grad_norm": 1.2578125, - "learning_rate": 0.001985434683659536, - "loss": 1.6919, + "epoch": 5.826050769870995, + "grad_norm": 0.208984375, + "learning_rate": 0.0017669579692051603, + "loss": 1.3712, "step": 56000 }, { - "epoch": 0.3673840951947461, - "grad_norm": 0.9453125, - "learning_rate": 0.0019853046361922103, - "loss": 1.7009, + "epoch": 5.878069080316271, + "grad_norm": 0.26171875, + "learning_rate": 0.001764877236787349, + "loss": 1.3716, "step": 56500 }, { - "epoch": 0.3706352818778854, - "grad_norm": 1.1953125, - "learning_rate": 0.0019851745887248845, - "loss": 1.7066, + "epoch": 5.930087390761548, + "grad_norm": 0.2373046875, + "learning_rate": 0.0017627965043695382, + "loss": 1.3739, "step": 57000 }, { - "epoch": 0.37388646856102475, - "grad_norm": 0.828125, - "learning_rate": 0.001985044541257559, - "loss": 1.6974, + "epoch": 5.982105701206825, + "grad_norm": 0.30859375, + "learning_rate": 0.001760715771951727, + "loss": 1.3744, "step": 57500 }, { - "epoch": 0.3771376552441641, - "grad_norm": 1.0546875, - "learning_rate": 0.0019849144937902335, - "loss": 1.695, + "epoch": 6.0, + "eval_loss": 1.4039781093597412, + "eval_runtime": 1.6711, + "eval_samples_per_second": 598.397, + "eval_steps_per_second": 0.598, + "step": 57672 + }, + { + "epoch": 6.034124011652102, + "grad_norm": 0.2451171875, + "learning_rate": 0.001758635039533916, + "loss": 1.3701, "step": 58000 }, { - "epoch": 0.3803888419273035, - "grad_norm": 1.7421875, - "learning_rate": 0.0019847844463229077, - "loss": 1.7002, + "epoch": 6.086142322097379, + "grad_norm": 0.2197265625, + "learning_rate": 0.001756554307116105, + "loss": 1.3629, "step": 58500 }, { - "epoch": 0.38364002861044283, - "grad_norm": 0.59765625, - "learning_rate": 0.0019846543988555824, - "loss": 1.6998, + "epoch": 6.138160632542655, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017544735746982938, + "loss": 1.3656, "step": 59000 }, { - "epoch": 0.38689121529358217, - "grad_norm": 0.640625, - "learning_rate": 0.0019845243513882567, - "loss": 1.7108, + "epoch": 6.190178942987932, + "grad_norm": 0.23046875, + "learning_rate": 0.0017523928422804827, + "loss": 1.3673, "step": 59500 }, { - "epoch": 0.3901424019767215, - "grad_norm": 0.578125, - "learning_rate": 0.001984394303920931, - "loss": 1.6777, + "epoch": 6.242197253433209, + "grad_norm": 0.2080078125, + "learning_rate": 0.0017503121098626717, + "loss": 1.363, "step": 
60000 }, { - "epoch": 0.39339358865986085, - "grad_norm": 0.75390625, - "learning_rate": 0.0019842642564536057, - "loss": 1.6866, + "epoch": 6.294215563878486, + "grad_norm": 1.046875, + "learning_rate": 0.0017482313774448606, + "loss": 1.3632, "step": 60500 }, { - "epoch": 0.3966447753430002, - "grad_norm": 1.890625, - "learning_rate": 0.00198413420898628, - "loss": 1.6838, + "epoch": 6.346233874323762, + "grad_norm": 0.359375, + "learning_rate": 0.0017461506450270495, + "loss": 1.36, "step": 61000 }, { - "epoch": 0.39989596202613953, - "grad_norm": 0.7421875, - "learning_rate": 0.001984004161518954, - "loss": 1.6834, + "epoch": 6.398252184769039, + "grad_norm": 0.58203125, + "learning_rate": 0.0017440699126092386, + "loss": 1.3578, "step": 61500 }, { - "epoch": 0.40314714870927887, - "grad_norm": 0.6875, - "learning_rate": 0.001983874114051629, - "loss": 1.6837, + "epoch": 6.4502704952143155, + "grad_norm": 0.2421875, + "learning_rate": 0.0017419891801914273, + "loss": 1.3622, "step": 62000 }, { - "epoch": 0.4063983353924182, - "grad_norm": 0.80859375, - "learning_rate": 0.0019837440665843036, - "loss": 1.6869, + "epoch": 6.502288805659592, + "grad_norm": 0.21484375, + "learning_rate": 0.0017399084477736164, + "loss": 1.3607, "step": 62500 }, { - "epoch": 0.4096495220755576, - "grad_norm": 0.87890625, - "learning_rate": 0.001983614019116978, - "loss": 1.7305, + "epoch": 6.554307116104869, + "grad_norm": 0.39453125, + "learning_rate": 0.0017378277153558054, + "loss": 1.3552, "step": 63000 }, { - "epoch": 0.41290070875869694, - "grad_norm": 1.0703125, - "learning_rate": 0.001983483971649652, - "loss": 1.746, + "epoch": 6.606325426550145, + "grad_norm": 0.322265625, + "learning_rate": 0.001735746982937994, + "loss": 1.3518, "step": 63500 }, { - "epoch": 0.4161518954418363, - "grad_norm": 0.73046875, - "learning_rate": 0.001983353924182327, - "loss": 1.7387, + "epoch": 6.658343736995422, + "grad_norm": 0.310546875, + "learning_rate": 0.0017336662505201832, + "loss": 1.3498, "step": 64000 }, { - "epoch": 0.4194030821249756, - "grad_norm": 0.90234375, - "learning_rate": 0.001983223876715001, - "loss": 1.7394, + "epoch": 6.710362047440699, + "grad_norm": 1.1484375, + "learning_rate": 0.001731585518102372, + "loss": 1.3528, "step": 64500 }, { - "epoch": 0.42265426880811496, - "grad_norm": 1.203125, - "learning_rate": 0.0019830938292476753, - "loss": 1.7914, + "epoch": 6.762380357885976, + "grad_norm": 0.216796875, + "learning_rate": 0.001729504785684561, + "loss": 1.3528, "step": 65000 }, { - "epoch": 0.4259054554912543, - "grad_norm": 1.1484375, - "learning_rate": 0.00198296378178035, - "loss": 1.7304, + "epoch": 6.814398668331252, + "grad_norm": 0.234375, + "learning_rate": 0.00172742405326675, + "loss": 1.3514, "step": 65500 }, { - "epoch": 0.42915664217439364, - "grad_norm": 0.67578125, - "learning_rate": 0.0019828337343130243, - "loss": 1.7173, + "epoch": 6.866416978776529, + "grad_norm": 0.44921875, + "learning_rate": 0.0017253433208489388, + "loss": 1.3519, "step": 66000 }, { - "epoch": 0.432407828857533, - "grad_norm": 0.69140625, - "learning_rate": 0.0019827036868456985, - "loss": 1.7409, + "epoch": 6.918435289221806, + "grad_norm": 0.2412109375, + "learning_rate": 0.0017232625884311278, + "loss": 1.3475, "step": 66500 }, { - "epoch": 0.4356590155406723, - "grad_norm": 0.734375, - "learning_rate": 0.0019825736393783732, - "loss": 1.7136, + "epoch": 6.970453599667083, + "grad_norm": 0.51171875, + "learning_rate": 0.0017211818560133169, + "loss": 1.3498, "step": 67000 }, { - "epoch": 
0.4389102022238117, - "grad_norm": 0.80078125, - "learning_rate": 0.0019824435919110475, - "loss": 1.7297, + "epoch": 7.0, + "eval_loss": 1.3733755350112915, + "eval_runtime": 1.5013, + "eval_samples_per_second": 666.111, + "eval_steps_per_second": 0.666, + "step": 67284 + }, + { + "epoch": 7.022471910112359, + "grad_norm": 0.2373046875, + "learning_rate": 0.0017191011235955056, + "loss": 1.348, "step": 67500 }, { - "epoch": 0.44216138890695106, - "grad_norm": 0.79296875, - "learning_rate": 0.0019823135444437217, - "loss": 1.7409, + "epoch": 7.074490220557636, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017170203911776945, + "loss": 1.3467, "step": 68000 }, { - "epoch": 0.4454125755900904, - "grad_norm": 1.765625, - "learning_rate": 0.0019821834969763964, - "loss": 1.7818, + "epoch": 7.126508531002913, + "grad_norm": 0.26953125, + "learning_rate": 0.0017149396587598836, + "loss": 1.3484, "step": 68500 }, { - "epoch": 0.44866376227322974, - "grad_norm": 0.70703125, - "learning_rate": 0.001982053449509071, - "loss": 1.7189, + "epoch": 7.17852684144819, + "grad_norm": 0.216796875, + "learning_rate": 0.0017128589263420723, + "loss": 1.3504, "step": 69000 }, { - "epoch": 0.4519149489563691, - "grad_norm": 6.21875, - "learning_rate": 0.0019819234020417454, - "loss": 1.7128, + "epoch": 7.230545151893467, + "grad_norm": 0.27734375, + "learning_rate": 0.0017107781939242615, + "loss": 1.3474, "step": 69500 }, { - "epoch": 0.4551661356395084, - "grad_norm": 0.75, - "learning_rate": 0.0019817933545744197, - "loss": 1.6918, + "epoch": 7.282563462338743, + "grad_norm": 0.4296875, + "learning_rate": 0.0017086974615064504, + "loss": 1.3457, "step": 70000 }, { - "epoch": 0.45841732232264776, - "grad_norm": 0.81640625, - "learning_rate": 0.0019816633071070944, - "loss": 1.6941, + "epoch": 7.33458177278402, + "grad_norm": 0.494140625, + "learning_rate": 0.0017066167290886393, + "loss": 1.344, "step": 70500 }, { - "epoch": 0.4616685090057871, - "grad_norm": 1.0546875, - "learning_rate": 0.0019815332596397686, - "loss": 1.6888, + "epoch": 7.386600083229297, + "grad_norm": 0.259765625, + "learning_rate": 0.0017045359966708282, + "loss": 1.3409, "step": 71000 }, { - "epoch": 0.46491969568892644, - "grad_norm": 0.88671875, - "learning_rate": 0.001981403212172443, - "loss": 1.689, + "epoch": 7.438618393674574, + "grad_norm": 0.267578125, + "learning_rate": 0.001702455264253017, + "loss": 1.3427, "step": 71500 }, { - "epoch": 0.4681708823720658, - "grad_norm": 4.75, - "learning_rate": 0.0019812731647051176, - "loss": 1.69, + "epoch": 7.49063670411985, + "grad_norm": 0.2236328125, + "learning_rate": 0.001700374531835206, + "loss": 1.3446, "step": 72000 }, { - "epoch": 0.4714220690552052, - "grad_norm": 1.0078125, - "learning_rate": 0.001981143117237792, - "loss": 1.6951, + "epoch": 7.542655014565127, + "grad_norm": 0.26953125, + "learning_rate": 0.001698293799417395, + "loss": 1.3427, "step": 72500 }, { - "epoch": 0.4746732557383445, - "grad_norm": 0.86328125, - "learning_rate": 0.001981013069770466, - "loss": 1.6849, + "epoch": 7.594673325010404, + "grad_norm": 0.251953125, + "learning_rate": 0.0016962130669995838, + "loss": 1.3451, "step": 73000 }, { - "epoch": 0.47792444242148385, - "grad_norm": 0.70703125, - "learning_rate": 0.001980883022303141, - "loss": 1.7031, + "epoch": 7.646691635455681, + "grad_norm": 0.236328125, + "learning_rate": 0.0016941323345817728, + "loss": 1.3473, "step": 73500 }, { - "epoch": 0.4811756291046232, - "grad_norm": 0.6328125, - "learning_rate": 0.001980752974835815, - "loss": 
1.6962, + "epoch": 7.698709945900957, + "grad_norm": 0.2490234375, + "learning_rate": 0.001692051602163962, + "loss": 1.3487, "step": 74000 }, { - "epoch": 0.48442681578776253, - "grad_norm": 0.76171875, - "learning_rate": 0.0019806229273684893, - "loss": 1.7377, + "epoch": 7.750728256346234, + "grad_norm": 0.349609375, + "learning_rate": 0.0016899708697461506, + "loss": 1.3565, "step": 74500 }, { - "epoch": 0.4876780024709019, - "grad_norm": 0.77734375, - "learning_rate": 0.001980492879901164, - "loss": 1.7273, + "epoch": 7.802746566791511, + "grad_norm": 0.291015625, + "learning_rate": 0.0016878901373283395, + "loss": 1.348, "step": 75000 }, { - "epoch": 0.4909291891540412, - "grad_norm": 0.8515625, - "learning_rate": 0.0019803628324338383, - "loss": 1.703, + "epoch": 7.8547648772367875, + "grad_norm": 0.2021484375, + "learning_rate": 0.0016858094049105286, + "loss": 1.3478, "step": 75500 }, { - "epoch": 0.49418037583718055, - "grad_norm": 0.83984375, - "learning_rate": 0.0019802327849665125, - "loss": 1.7018, + "epoch": 7.9067831876820645, + "grad_norm": 0.259765625, + "learning_rate": 0.0016837286724927173, + "loss": 1.3484, "step": 76000 }, { - "epoch": 0.4974315625203199, - "grad_norm": 0.6015625, - "learning_rate": 0.0019801027374991872, - "loss": 1.7036, + "epoch": 7.9588014981273405, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016816479400749065, + "loss": 1.3457, "step": 76500 }, { - "epoch": 0.5006827492034592, - "grad_norm": 1.25, - "learning_rate": 0.001979972690031862, - "loss": 1.7149, + "epoch": 8.0, + "eval_loss": 1.3691484928131104, + "eval_runtime": 1.5204, + "eval_samples_per_second": 657.725, + "eval_steps_per_second": 0.658, + "step": 76896 + }, + { + "epoch": 8.010819808572618, + "grad_norm": 0.244140625, + "learning_rate": 0.0016795672076570954, + "loss": 1.3419, "step": 77000 }, { - "epoch": 0.5039339358865986, - "grad_norm": 1.109375, - "learning_rate": 0.001979842642564536, - "loss": 1.7483, + "epoch": 8.062838119017893, + "grad_norm": 0.271484375, + "learning_rate": 0.0016774864752392843, + "loss": 1.3375, "step": 77500 }, { - "epoch": 0.5071851225697379, - "grad_norm": 0.9453125, - "learning_rate": 0.0019797125950972105, - "loss": 1.731, + "epoch": 8.11485642946317, + "grad_norm": 0.19921875, + "learning_rate": 0.0016754057428214732, + "loss": 1.3368, "step": 78000 }, { - "epoch": 0.5104363092528773, - "grad_norm": 0.859375, - "learning_rate": 0.001979582547629885, - "loss": 1.7228, + "epoch": 8.166874739908447, + "grad_norm": 0.349609375, + "learning_rate": 0.0016733250104036621, + "loss": 1.3385, "step": 78500 }, { - "epoch": 0.5136874959360166, - "grad_norm": 1.03125, - "learning_rate": 0.0019794525001625594, - "loss": 1.7164, + "epoch": 8.218893050353724, + "grad_norm": 0.28125, + "learning_rate": 0.001671244277985851, + "loss": 1.3329, "step": 79000 }, { - "epoch": 0.516938682619156, - "grad_norm": 6.75, - "learning_rate": 0.0019793224526952337, - "loss": 1.7132, + "epoch": 8.270911360799001, + "grad_norm": 0.462890625, + "learning_rate": 0.0016691635455680402, + "loss": 1.3346, "step": 79500 }, { - "epoch": 0.5201898693022954, - "grad_norm": 1.0078125, - "learning_rate": 0.0019791924052279084, - "loss": 1.7004, + "epoch": 8.322929671244278, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016670828131502289, + "loss": 1.3342, "step": 80000 }, { - "epoch": 0.5234410559854347, - "grad_norm": 0.73828125, - "learning_rate": 0.0019790623577605826, - "loss": 1.7019, + "epoch": 8.374947981689555, + "grad_norm": 0.2099609375, + "learning_rate": 
0.0016650020807324178, + "loss": 1.3313, "step": 80500 }, { - "epoch": 0.5266922426685741, - "grad_norm": 0.5859375, - "learning_rate": 0.001978932310293257, - "loss": 1.7002, + "epoch": 8.426966292134832, + "grad_norm": 0.265625, + "learning_rate": 0.001662921348314607, + "loss": 1.33, "step": 81000 }, { - "epoch": 0.5299434293517133, - "grad_norm": 0.72265625, - "learning_rate": 0.0019788022628259316, - "loss": 1.6903, + "epoch": 8.478984602580109, + "grad_norm": 0.244140625, + "learning_rate": 0.0016608406158967956, + "loss": 1.3321, "step": 81500 }, { - "epoch": 0.5331946160348527, - "grad_norm": 1.2578125, - "learning_rate": 0.001978672215358606, - "loss": 1.6837, + "epoch": 8.531002913025384, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016587598834789845, + "loss": 1.3322, "step": 82000 }, { - "epoch": 0.536445802717992, - "grad_norm": 0.7890625, - "learning_rate": 0.00197854216789128, - "loss": 1.6843, + "epoch": 8.583021223470661, + "grad_norm": 0.2412109375, + "learning_rate": 0.0016566791510611736, + "loss": 1.3354, "step": 82500 }, { - "epoch": 0.5396969894011314, - "grad_norm": 0.9375, - "learning_rate": 0.001978412120423955, - "loss": 1.6899, + "epoch": 8.635039533915938, + "grad_norm": 0.26171875, + "learning_rate": 0.0016545984186433626, + "loss": 1.3358, "step": 83000 }, { - "epoch": 0.5429481760842707, - "grad_norm": 0.61328125, - "learning_rate": 0.0019782820729566295, - "loss": 1.685, + "epoch": 8.687057844361215, + "grad_norm": 0.2109375, + "learning_rate": 0.0016525176862255513, + "loss": 1.3303, "step": 83500 }, { - "epoch": 0.5461993627674101, - "grad_norm": 0.73828125, - "learning_rate": 0.0019781520254893038, - "loss": 1.6823, + "epoch": 8.739076154806492, + "grad_norm": 0.1865234375, + "learning_rate": 0.0016504369538077404, + "loss": 1.3332, "step": 84000 }, { - "epoch": 0.5494505494505495, - "grad_norm": 1.7421875, - "learning_rate": 0.001978021978021978, - "loss": 1.6779, + "epoch": 8.791094465251769, + "grad_norm": 0.201171875, + "learning_rate": 0.0016483562213899293, + "loss": 1.3337, "step": 84500 }, { - "epoch": 0.5527017361336888, - "grad_norm": 0.69921875, - "learning_rate": 0.0019778919305546527, - "loss": 1.6857, + "epoch": 8.843112775697046, + "grad_norm": 0.6015625, + "learning_rate": 0.0016462754889721182, + "loss": 1.3321, "step": 85000 }, { - "epoch": 0.5559529228168282, - "grad_norm": 0.6953125, - "learning_rate": 0.001977761883087327, - "loss": 1.6778, + "epoch": 8.895131086142323, + "grad_norm": 0.205078125, + "learning_rate": 0.0016441947565543071, + "loss": 1.3283, "step": 85500 }, { - "epoch": 0.5592041094999675, - "grad_norm": 0.9765625, - "learning_rate": 0.0019776318356200012, - "loss": 1.6724, + "epoch": 8.947149396587598, + "grad_norm": 0.376953125, + "learning_rate": 0.001642114024136496, + "loss": 1.3306, "step": 86000 }, { - "epoch": 0.5624552961831069, - "grad_norm": 0.94921875, - "learning_rate": 0.001977501788152676, - "loss": 1.6823, + "epoch": 8.999167707032875, + "grad_norm": 0.2099609375, + "learning_rate": 0.0016400332917186852, + "loss": 1.3315, "step": 86500 }, { - "epoch": 0.5657064828662461, - "grad_norm": 0.85546875, - "learning_rate": 0.00197737174068535, - "loss": 1.6812, + "epoch": 9.0, + "eval_loss": 1.3568580150604248, + "eval_runtime": 1.6522, + "eval_samples_per_second": 605.266, + "eval_steps_per_second": 0.605, + "step": 86508 + }, + { + "epoch": 9.051186017478152, + "grad_norm": 0.77734375, + "learning_rate": 0.0016379525593008739, + "loss": 1.3232, "step": 87000 }, { - "epoch": 0.5689576695493855, - 
"grad_norm": 0.6328125, - "learning_rate": 0.0019772416932180245, - "loss": 1.6774, + "epoch": 9.103204327923429, + "grad_norm": 0.2265625, + "learning_rate": 0.0016358718268830628, + "loss": 1.3251, "step": 87500 }, { - "epoch": 0.5722088562325248, - "grad_norm": 0.8515625, - "learning_rate": 0.001977111645750699, - "loss": 1.6614, + "epoch": 9.155222638368706, + "grad_norm": 0.21484375, + "learning_rate": 0.001633791094465252, + "loss": 1.3295, "step": 88000 }, { - "epoch": 0.5754600429156642, - "grad_norm": 1.5625, - "learning_rate": 0.0019769815982833734, - "loss": 1.657, + "epoch": 9.207240948813983, + "grad_norm": 0.25, + "learning_rate": 0.0016317103620474408, + "loss": 1.3261, "step": 88500 }, { - "epoch": 0.5787112295988036, - "grad_norm": 0.625, - "learning_rate": 0.0019768515508160477, - "loss": 1.6616, + "epoch": 9.25925925925926, + "grad_norm": 0.171875, + "learning_rate": 0.0016296296296296295, + "loss": 1.3288, "step": 89000 }, { - "epoch": 0.5819624162819429, - "grad_norm": 1.421875, - "learning_rate": 0.0019767215033487224, - "loss": 1.6561, + "epoch": 9.311277569704536, + "grad_norm": 0.1962890625, + "learning_rate": 0.0016275488972118187, + "loss": 1.3276, "step": 89500 }, { - "epoch": 0.5852136029650823, - "grad_norm": 1.3046875, - "learning_rate": 0.0019765914558813966, - "loss": 1.6612, + "epoch": 9.363295880149813, + "grad_norm": 0.22265625, + "learning_rate": 0.0016254681647940076, + "loss": 1.3257, "step": 90000 }, { - "epoch": 0.5884647896482216, - "grad_norm": 0.65234375, - "learning_rate": 0.001976461408414071, - "loss": 1.6658, + "epoch": 9.41531419059509, + "grad_norm": 0.26171875, + "learning_rate": 0.0016233874323761963, + "loss": 1.3219, "step": 90500 }, { - "epoch": 0.591715976331361, - "grad_norm": 0.71875, - "learning_rate": 0.0019763313609467456, - "loss": 1.6678, + "epoch": 9.467332501040365, + "grad_norm": 0.2470703125, + "learning_rate": 0.0016213066999583854, + "loss": 1.3219, "step": 91000 }, { - "epoch": 0.5949671630145003, - "grad_norm": 2.4375, - "learning_rate": 0.0019762013134794203, - "loss": 1.6657, + "epoch": 9.519350811485642, + "grad_norm": 0.2109375, + "learning_rate": 0.0016192259675405743, + "loss": 1.3216, "step": 91500 }, { - "epoch": 0.5982183496976397, - "grad_norm": 0.67578125, - "learning_rate": 0.0019760712660120945, - "loss": 1.6659, + "epoch": 9.57136912193092, + "grad_norm": 0.189453125, + "learning_rate": 0.0016171452351227634, + "loss": 1.324, "step": 92000 }, { - "epoch": 0.6014695363807789, - "grad_norm": 0.765625, - "learning_rate": 0.001975941218544769, - "loss": 1.6694, + "epoch": 9.623387432376196, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016150645027049521, + "loss": 1.3212, "step": 92500 }, { - "epoch": 0.6047207230639183, - "grad_norm": 1.5859375, - "learning_rate": 0.0019758111710774435, - "loss": 1.6644, + "epoch": 9.675405742821473, + "grad_norm": 0.2041015625, + "learning_rate": 0.001612983770287141, + "loss": 1.3217, "step": 93000 }, { - "epoch": 0.6079719097470577, - "grad_norm": 0.6796875, - "learning_rate": 0.0019756811236101178, - "loss": 1.6526, + "epoch": 9.72742405326675, + "grad_norm": 0.22265625, + "learning_rate": 0.0016109030378693302, + "loss": 1.3219, "step": 93500 }, { - "epoch": 0.611223096430197, - "grad_norm": 0.84375, - "learning_rate": 0.001975551076142792, - "loss": 1.6593, + "epoch": 9.779442363712027, + "grad_norm": 0.2470703125, + "learning_rate": 0.0016088223054515189, + "loss": 1.3219, "step": 94000 }, { - "epoch": 0.6144742831133364, - "grad_norm": 0.83984375, - 
"learning_rate": 0.0019754210286754667, - "loss": 1.6524, + "epoch": 9.831460674157304, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016067415730337078, + "loss": 1.3188, "step": 94500 }, { - "epoch": 0.6177254697964757, - "grad_norm": 1.203125, - "learning_rate": 0.001975290981208141, - "loss": 1.6447, + "epoch": 9.88347898460258, + "grad_norm": 0.2392578125, + "learning_rate": 0.001604660840615897, + "loss": 1.3205, "step": 95000 }, { - "epoch": 0.6209766564796151, - "grad_norm": 0.8125, - "learning_rate": 0.0019751609337408152, - "loss": 1.6478, + "epoch": 9.935497295047856, + "grad_norm": 1.828125, + "learning_rate": 0.0016025801081980858, + "loss": 1.3238, "step": 95500 }, { - "epoch": 0.6242278431627544, - "grad_norm": 0.59765625, - "learning_rate": 0.00197503088627349, - "loss": 1.6488, + "epoch": 9.987515605493133, + "grad_norm": 0.3984375, + "learning_rate": 0.0016004993757802745, + "loss": 1.3224, "step": 96000 }, { - "epoch": 0.6274790298458938, - "grad_norm": 0.70703125, - "learning_rate": 0.001974900838806164, - "loss": 1.6573, + "epoch": 10.0, + "eval_loss": 1.3528562784194946, + "eval_runtime": 1.936, + "eval_samples_per_second": 516.533, + "eval_steps_per_second": 0.517, + "step": 96120 + }, + { + "epoch": 10.03953391593841, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015984186433624637, + "loss": 1.3174, "step": 96500 }, { - "epoch": 0.630730216529033, - "grad_norm": 1.4765625, - "learning_rate": 0.0019747707913388385, - "loss": 1.6593, + "epoch": 10.091552226383687, + "grad_norm": 0.404296875, + "learning_rate": 0.0015963379109446526, + "loss": 1.3199, "step": 97000 }, { - "epoch": 0.6339814032121724, - "grad_norm": 0.73828125, - "learning_rate": 0.001974640743871513, - "loss": 1.6694, + "epoch": 10.143570536828964, + "grad_norm": 0.1982421875, + "learning_rate": 0.0015942571785268413, + "loss": 1.3187, "step": 97500 }, { - "epoch": 0.6372325898953118, - "grad_norm": 0.80078125, - "learning_rate": 0.001974510696404188, - "loss": 1.6712, + "epoch": 10.19558884727424, + "grad_norm": 0.27734375, + "learning_rate": 0.0015921764461090304, + "loss": 1.3205, "step": 98000 }, { - "epoch": 0.6404837765784511, - "grad_norm": 1.890625, - "learning_rate": 0.001974380648936862, - "loss": 1.6678, + "epoch": 10.247607157719518, + "grad_norm": 0.259765625, + "learning_rate": 0.0015900957136912193, + "loss": 1.3201, "step": 98500 }, { - "epoch": 0.6437349632615905, - "grad_norm": 0.8359375, - "learning_rate": 0.0019742506014695364, - "loss": 1.662, + "epoch": 10.299625468164795, + "grad_norm": 0.65625, + "learning_rate": 0.0015880149812734085, + "loss": 1.3193, "step": 99000 }, { - "epoch": 0.6469861499447298, - "grad_norm": 1.1640625, - "learning_rate": 0.001974120554002211, - "loss": 1.6648, + "epoch": 10.35164377861007, + "grad_norm": 0.23828125, + "learning_rate": 0.0015859342488555972, + "loss": 1.3181, "step": 99500 }, { - "epoch": 0.6502373366278692, - "grad_norm": 0.66796875, - "learning_rate": 0.0019739905065348853, - "loss": 1.6518, + "epoch": 10.403662089055347, + "grad_norm": 0.23828125, + "learning_rate": 0.001583853516437786, + "loss": 1.3169, "step": 100000 }, { - "epoch": 0.6534885233110085, - "grad_norm": 1.6484375, - "learning_rate": 0.0019738604590675596, - "loss": 1.6528, + "epoch": 10.455680399500624, + "grad_norm": 0.298828125, + "learning_rate": 0.0015817727840199752, + "loss": 1.3162, "step": 100500 }, { - "epoch": 0.6567397099941479, - "grad_norm": 0.84375, - "learning_rate": 0.0019737304116002343, - "loss": 1.6754, + "epoch": 10.5076987099459, + 
"grad_norm": 0.458984375, + "learning_rate": 0.0015796920516021641, + "loss": 1.3217, "step": 101000 }, { - "epoch": 0.6599908966772872, - "grad_norm": 2.1875, - "learning_rate": 0.0019736003641329086, - "loss": 1.68, + "epoch": 10.559717020391178, + "grad_norm": 0.337890625, + "learning_rate": 0.0015776113191843528, + "loss": 1.3245, "step": 101500 }, { - "epoch": 0.6632420833604266, - "grad_norm": 1.1015625, - "learning_rate": 0.001973470316665583, - "loss": 1.6636, + "epoch": 10.611735330836455, + "grad_norm": 0.369140625, + "learning_rate": 0.001575530586766542, + "loss": 1.3233, "step": 102000 }, { - "epoch": 0.666493270043566, - "grad_norm": 0.76171875, - "learning_rate": 0.0019733402691982575, - "loss": 1.6543, + "epoch": 10.663753641281732, + "grad_norm": 0.31640625, + "learning_rate": 0.0015734498543487309, + "loss": 1.3182, "step": 102500 }, { - "epoch": 0.6697444567267052, - "grad_norm": 0.93359375, - "learning_rate": 0.0019732102217309318, - "loss": 1.6503, + "epoch": 10.715771951727008, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015713691219309195, + "loss": 1.3163, "step": 103000 }, { - "epoch": 0.6729956434098446, - "grad_norm": 1.3203125, - "learning_rate": 0.001973080174263606, - "loss": 1.6522, + "epoch": 10.767790262172285, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015692883895131087, + "loss": 1.3181, "step": 103500 }, { - "epoch": 0.6762468300929839, - "grad_norm": 1.0546875, - "learning_rate": 0.0019729501267962807, - "loss": 1.6545, + "epoch": 10.81980857261756, + "grad_norm": 0.25390625, + "learning_rate": 0.0015672076570952976, + "loss": 1.3171, "step": 104000 }, { - "epoch": 0.6794980167761233, - "grad_norm": 0.90234375, - "learning_rate": 0.001972820079328955, - "loss": 1.651, + "epoch": 10.871826883062838, + "grad_norm": 0.255859375, + "learning_rate": 0.0015651269246774865, + "loss": 1.3177, "step": 104500 }, { - "epoch": 0.6827492034592626, - "grad_norm": 0.75, - "learning_rate": 0.0019726900318616293, - "loss": 1.6601, + "epoch": 10.923845193508114, + "grad_norm": 0.306640625, + "learning_rate": 0.0015630461922596754, + "loss": 1.3211, "step": 105000 }, { - "epoch": 0.686000390142402, - "grad_norm": 0.64453125, - "learning_rate": 0.001972559984394304, - "loss": 1.6394, + "epoch": 10.975863503953391, + "grad_norm": 0.34765625, + "learning_rate": 0.0015609654598418643, + "loss": 1.3183, "step": 105500 }, { - "epoch": 0.6892515768255413, - "grad_norm": 0.68359375, - "learning_rate": 0.0019724299369269786, - "loss": 1.639, + "epoch": 11.0, + "eval_loss": 1.347601056098938, + "eval_runtime": 1.5374, + "eval_samples_per_second": 650.453, + "eval_steps_per_second": 0.65, + "step": 105732 + }, + { + "epoch": 11.027881814398668, + "grad_norm": 0.21484375, + "learning_rate": 0.0015588847274240535, + "loss": 1.315, "step": 106000 }, { - "epoch": 0.6925027635086807, - "grad_norm": 0.6328125, - "learning_rate": 0.001972299889459653, - "loss": 1.629, + "epoch": 11.079900124843945, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015568039950062422, + "loss": 1.3125, "step": 106500 }, { - "epoch": 0.6957539501918201, - "grad_norm": 1.234375, - "learning_rate": 0.001972169841992327, - "loss": 1.645, + "epoch": 11.131918435289222, + "grad_norm": 0.1865234375, + "learning_rate": 0.001554723262588431, + "loss": 1.3114, "step": 107000 }, { - "epoch": 0.6990051368749594, - "grad_norm": 1.0390625, - "learning_rate": 0.001972039794525002, - "loss": 1.6692, + "epoch": 11.1839367457345, + "grad_norm": 0.2412109375, + "learning_rate": 0.0015526425301706202, + 
"loss": 1.3104, "step": 107500 }, { - "epoch": 0.7022563235580987, - "grad_norm": 0.71875, - "learning_rate": 0.001971909747057676, - "loss": 1.6579, + "epoch": 11.235955056179776, + "grad_norm": 0.412109375, + "learning_rate": 0.0015505617977528091, + "loss": 1.3109, "step": 108000 }, { - "epoch": 0.705507510241238, - "grad_norm": 1.359375, - "learning_rate": 0.0019717796995903504, - "loss": 1.6385, + "epoch": 11.287973366625051, + "grad_norm": 0.2265625, + "learning_rate": 0.0015484810653349978, + "loss": 1.3119, "step": 108500 }, { - "epoch": 0.7087586969243774, - "grad_norm": 0.734375, - "learning_rate": 0.001971649652123025, - "loss": 1.648, + "epoch": 11.339991677070328, + "grad_norm": 0.310546875, + "learning_rate": 0.001546400332917187, + "loss": 1.31, "step": 109000 }, { - "epoch": 0.7120098836075167, - "grad_norm": 1.6015625, - "learning_rate": 0.0019715196046556993, - "loss": 1.6436, + "epoch": 11.392009987515605, + "grad_norm": 0.50390625, + "learning_rate": 0.0015443196004993759, + "loss": 1.3114, "step": 109500 }, { - "epoch": 0.7152610702906561, - "grad_norm": 1.0234375, - "learning_rate": 0.0019713895571883736, - "loss": 1.6362, + "epoch": 11.444028297960882, + "grad_norm": 0.40234375, + "learning_rate": 0.0015422388680815646, + "loss": 1.3133, "step": 110000 }, { - "epoch": 0.7185122569737954, - "grad_norm": 0.71484375, - "learning_rate": 0.0019712595097210483, - "loss": 1.645, + "epoch": 11.496046608406159, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015401581356637537, + "loss": 1.3129, "step": 110500 }, { - "epoch": 0.7217634436569348, - "grad_norm": 0.5234375, - "learning_rate": 0.0019711294622537226, - "loss": 1.642, + "epoch": 11.548064918851436, + "grad_norm": 0.267578125, + "learning_rate": 0.0015380774032459426, + "loss": 1.3125, "step": 111000 }, { - "epoch": 0.7250146303400741, - "grad_norm": 0.734375, - "learning_rate": 0.001970999414786397, - "loss": 1.6355, + "epoch": 11.600083229296713, + "grad_norm": 0.212890625, + "learning_rate": 0.0015359966708281315, + "loss": 1.312, "step": 111500 }, { - "epoch": 0.7282658170232135, - "grad_norm": 0.64453125, - "learning_rate": 0.0019708693673190715, - "loss": 1.6339, + "epoch": 11.65210153974199, + "grad_norm": 0.19140625, + "learning_rate": 0.0015339159384103204, + "loss": 1.3107, "step": 112000 }, { - "epoch": 0.7315170037063529, - "grad_norm": 0.98046875, - "learning_rate": 0.001970739319851746, - "loss": 1.6294, + "epoch": 11.704119850187267, + "grad_norm": 0.251953125, + "learning_rate": 0.0015318352059925093, + "loss": 1.3116, "step": 112500 }, { - "epoch": 0.7347681903894921, - "grad_norm": 3.84375, - "learning_rate": 0.0019706092723844205, - "loss": 1.6307, + "epoch": 11.756138160632542, + "grad_norm": 0.189453125, + "learning_rate": 0.0015297544735746985, + "loss": 1.3104, "step": 113000 }, { - "epoch": 0.7380193770726315, - "grad_norm": 0.75, - "learning_rate": 0.0019704792249170947, - "loss": 1.6373, + "epoch": 11.808156471077819, + "grad_norm": 0.353515625, + "learning_rate": 0.0015276737411568874, + "loss": 1.3102, "step": 113500 }, { - "epoch": 0.7412705637557708, - "grad_norm": 0.88671875, - "learning_rate": 0.0019703491774497694, - "loss": 1.6368, + "epoch": 11.860174781523096, + "grad_norm": 0.2314453125, + "learning_rate": 0.001525593008739076, + "loss": 1.309, "step": 114000 }, { - "epoch": 0.7445217504389102, - "grad_norm": 42.75, - "learning_rate": 0.0019702191299824437, - "loss": 1.6278, + "epoch": 11.912193091968373, + "grad_norm": 0.2431640625, + "learning_rate": 0.0015235122763212652, + 
"loss": 1.3096, "step": 114500 }, { - "epoch": 0.7477729371220495, - "grad_norm": 3.53125, - "learning_rate": 0.001970089082515118, - "loss": 1.6316, + "epoch": 11.96421140241365, + "grad_norm": 0.9296875, + "learning_rate": 0.0015214315439034541, + "loss": 1.307, "step": 115000 }, { - "epoch": 0.7510241238051889, - "grad_norm": 1.3203125, - "learning_rate": 0.0019699590350477927, - "loss": 1.6202, + "epoch": 12.0, + "eval_loss": 1.3371446132659912, + "eval_runtime": 1.4263, + "eval_samples_per_second": 701.11, + "eval_steps_per_second": 0.701, + "step": 115344 + }, + { + "epoch": 12.016229712858927, + "grad_norm": 0.185546875, + "learning_rate": 0.0015193508114856428, + "loss": 1.3041, "step": 115500 }, { - "epoch": 0.7542753104883282, - "grad_norm": 0.83984375, - "learning_rate": 0.001969828987580467, - "loss": 1.6194, + "epoch": 12.068248023304204, + "grad_norm": 0.345703125, + "learning_rate": 0.001517270079067832, + "loss": 1.3025, "step": 116000 }, { - "epoch": 0.7575264971714676, - "grad_norm": 0.77734375, - "learning_rate": 0.001969698940113141, - "loss": 1.6219, + "epoch": 12.12026633374948, + "grad_norm": 0.232421875, + "learning_rate": 0.0015151893466500209, + "loss": 1.3041, "step": 116500 }, { - "epoch": 0.760777683854607, - "grad_norm": 1.5703125, - "learning_rate": 0.001969568892645816, - "loss": 1.6229, + "epoch": 12.172284644194757, + "grad_norm": 0.20703125, + "learning_rate": 0.0015131086142322098, + "loss": 1.3063, "step": 117000 }, { - "epoch": 0.7640288705377463, - "grad_norm": 0.7578125, - "learning_rate": 0.00196943884517849, - "loss": 1.6253, + "epoch": 12.224302954640033, + "grad_norm": 0.2119140625, + "learning_rate": 0.0015110278818143987, + "loss": 1.3037, "step": 117500 }, { - "epoch": 0.7672800572208857, - "grad_norm": 0.8984375, - "learning_rate": 0.0019693087977111644, - "loss": 1.6203, + "epoch": 12.27632126508531, + "grad_norm": 0.2099609375, + "learning_rate": 0.0015089471493965876, + "loss": 1.3049, "step": 118000 }, { - "epoch": 0.7705312439040249, - "grad_norm": 2.546875, - "learning_rate": 0.001969178750243839, - "loss": 1.6222, + "epoch": 12.328339575530586, + "grad_norm": 0.32421875, + "learning_rate": 0.0015068664169787765, + "loss": 1.3033, "step": 118500 }, { - "epoch": 0.7737824305871643, - "grad_norm": 0.89453125, - "learning_rate": 0.0019690487027765134, - "loss": 1.6241, + "epoch": 12.380357885975863, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015047856845609654, + "loss": 1.3032, "step": 119000 }, { - "epoch": 0.7770336172703036, - "grad_norm": 0.6484375, - "learning_rate": 0.0019689186553091876, - "loss": 1.6173, + "epoch": 12.43237619642114, + "grad_norm": 0.3125, + "learning_rate": 0.0015027049521431544, + "loss": 1.3031, "step": 119500 }, { - "epoch": 0.780284803953443, - "grad_norm": 1.4921875, - "learning_rate": 0.0019687886078418623, - "loss": 1.6136, + "epoch": 12.484394506866417, + "grad_norm": 0.291015625, + "learning_rate": 0.0015006242197253433, + "loss": 1.3031, "step": 120000 }, { - "epoch": 0.7835359906365823, - "grad_norm": 0.6484375, - "learning_rate": 0.001968658560374537, - "loss": 1.6079, + "epoch": 12.536412817311694, + "grad_norm": 0.90625, + "learning_rate": 0.0014985434873075324, + "loss": 1.304, "step": 120500 }, { - "epoch": 0.7867871773197217, - "grad_norm": 0.9296875, - "learning_rate": 0.0019685285129072113, - "loss": 1.6035, + "epoch": 12.588431127756971, + "grad_norm": 0.2216796875, + "learning_rate": 0.001496462754889721, + "loss": 1.3042, "step": 121000 }, { - "epoch": 0.7900383640028611, - 
"grad_norm": 0.71484375, - "learning_rate": 0.0019683984654398855, - "loss": 1.6098, + "epoch": 12.640449438202246, + "grad_norm": 0.283203125, + "learning_rate": 0.0014943820224719102, + "loss": 1.3055, "step": 121500 }, { - "epoch": 0.7932895506860004, - "grad_norm": 0.74609375, - "learning_rate": 0.0019682684179725602, - "loss": 1.6112, + "epoch": 12.692467748647523, + "grad_norm": 0.20703125, + "learning_rate": 0.0014923012900540991, + "loss": 1.3061, "step": 122000 }, { - "epoch": 0.7965407373691398, - "grad_norm": 0.83984375, - "learning_rate": 0.0019681383705052345, - "loss": 1.6075, + "epoch": 12.7444860590928, + "grad_norm": 0.390625, + "learning_rate": 0.0014902205576362878, + "loss": 1.304, "step": 122500 }, { - "epoch": 0.7997919240522791, - "grad_norm": 0.62109375, - "learning_rate": 0.0019680083230379087, - "loss": 1.6124, + "epoch": 12.796504369538077, + "grad_norm": 0.408203125, + "learning_rate": 0.001488139825218477, + "loss": 1.3048, "step": 123000 }, { - "epoch": 0.8030431107354185, - "grad_norm": 0.73046875, - "learning_rate": 0.0019678782755705834, - "loss": 1.6121, + "epoch": 12.848522679983354, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014860590928006659, + "loss": 1.3038, "step": 123500 }, { - "epoch": 0.8062942974185577, - "grad_norm": 1.046875, - "learning_rate": 0.0019677482281032577, - "loss": 1.5979, + "epoch": 12.900540990428631, + "grad_norm": 0.24609375, + "learning_rate": 0.0014839783603828548, + "loss": 1.3029, "step": 124000 }, { - "epoch": 0.8095454841016971, - "grad_norm": 0.6328125, - "learning_rate": 0.001967618180635932, - "loss": 1.6022, + "epoch": 12.952559300873908, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014818976279650437, + "loss": 1.302, "step": 124500 }, { - "epoch": 0.8127966707848364, - "grad_norm": 0.921875, - "learning_rate": 0.0019674881331686067, - "loss": 1.6035, + "epoch": 13.0, + "eval_loss": 1.336362600326538, + "eval_runtime": 1.5551, + "eval_samples_per_second": 643.05, + "eval_steps_per_second": 0.643, + "step": 124956 + }, + { + "epoch": 13.004577611319185, + "grad_norm": 0.27734375, + "learning_rate": 0.0014798168955472326, + "loss": 1.3067, "step": 125000 }, { - "epoch": 0.8160478574679758, - "grad_norm": 0.8203125, - "learning_rate": 0.001967358085701281, - "loss": 1.5999, + "epoch": 13.056595921764462, + "grad_norm": 0.3125, + "learning_rate": 0.0014777361631294215, + "loss": 1.3017, "step": 125500 }, { - "epoch": 0.8192990441511152, - "grad_norm": 0.5, - "learning_rate": 0.001967228038233955, - "loss": 1.6004, + "epoch": 13.108614232209737, + "grad_norm": 0.28515625, + "learning_rate": 0.0014756554307116107, + "loss": 1.3037, "step": 126000 }, { - "epoch": 0.8225502308342545, - "grad_norm": 0.64453125, - "learning_rate": 0.00196709799076663, - "loss": 1.5918, + "epoch": 13.160632542655014, + "grad_norm": 0.28515625, + "learning_rate": 0.0014735746982937994, + "loss": 1.3047, "step": 126500 }, { - "epoch": 0.8258014175173939, - "grad_norm": 0.75, - "learning_rate": 0.0019669679432993046, - "loss": 1.6031, + "epoch": 13.21265085310029, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014714939658759883, + "loss": 1.3034, "step": 127000 }, { - "epoch": 0.8290526042005332, - "grad_norm": 0.7578125, - "learning_rate": 0.001966837895831979, - "loss": 1.6028, + "epoch": 13.264669163545568, + "grad_norm": 7.0, + "learning_rate": 0.0014694132334581774, + "loss": 1.303, "step": 127500 }, { - "epoch": 0.8323037908836726, - "grad_norm": 0.80859375, - "learning_rate": 0.001966707848364653, - "loss": 1.5984, + 
"epoch": 13.316687473990845, + "grad_norm": 0.2275390625, + "learning_rate": 0.0014673325010403661, + "loss": 1.304, "step": 128000 }, { - "epoch": 0.8355549775668119, - "grad_norm": 0.89453125, - "learning_rate": 0.001966577800897328, - "loss": 1.5988, + "epoch": 13.368705784436122, + "grad_norm": 0.240234375, + "learning_rate": 0.0014652517686225552, + "loss": 1.3033, "step": 128500 }, { - "epoch": 0.8388061642499512, - "grad_norm": 1.1171875, - "learning_rate": 0.001966447753430002, - "loss": 1.6013, + "epoch": 13.420724094881399, + "grad_norm": 0.349609375, + "learning_rate": 0.0014631710362047442, + "loss": 1.3014, "step": 129000 }, { - "epoch": 0.8420573509330905, - "grad_norm": 0.73046875, - "learning_rate": 0.0019663177059626763, - "loss": 1.6111, + "epoch": 13.472742405326676, + "grad_norm": 0.2392578125, + "learning_rate": 0.001461090303786933, + "loss": 1.3004, "step": 129500 }, { - "epoch": 0.8453085376162299, - "grad_norm": 0.76171875, - "learning_rate": 0.001966187658495351, - "loss": 1.6014, + "epoch": 13.524760715771952, + "grad_norm": 0.6875, + "learning_rate": 0.001459009571369122, + "loss": 1.3011, "step": 130000 }, { - "epoch": 0.8485597242993693, - "grad_norm": 0.96484375, - "learning_rate": 0.0019660576110280253, - "loss": 1.5989, + "epoch": 13.576779026217228, + "grad_norm": 0.3359375, + "learning_rate": 0.001456928838951311, + "loss": 1.3005, "step": 130500 }, { - "epoch": 0.8518109109825086, - "grad_norm": 5.5, - "learning_rate": 0.0019659275635606995, - "loss": 1.5957, + "epoch": 13.628797336662505, + "grad_norm": 0.1943359375, + "learning_rate": 0.0014548481065334998, + "loss": 1.3022, "step": 131000 }, { - "epoch": 0.855062097665648, - "grad_norm": 0.85546875, - "learning_rate": 0.0019657975160933742, - "loss": 1.5993, + "epoch": 13.680815647107782, + "grad_norm": 0.2197265625, + "learning_rate": 0.0014527673741156887, + "loss": 1.3013, "step": 131500 }, { - "epoch": 0.8583132843487873, - "grad_norm": 1.5625, - "learning_rate": 0.0019656674686260485, - "loss": 1.6048, + "epoch": 13.732833957553058, + "grad_norm": 0.1904296875, + "learning_rate": 0.0014506866416978776, + "loss": 1.3005, "step": 132000 }, { - "epoch": 0.8615644710319267, - "grad_norm": 0.79296875, - "learning_rate": 0.0019655374211587228, - "loss": 1.6039, + "epoch": 13.784852267998335, + "grad_norm": 0.2734375, + "learning_rate": 0.0014486059092800666, + "loss": 1.2997, "step": 132500 }, { - "epoch": 0.864815657715066, - "grad_norm": 0.640625, - "learning_rate": 0.0019654073736913974, - "loss": 1.6012, + "epoch": 13.836870578443612, + "grad_norm": 0.2412109375, + "learning_rate": 0.0014465251768622557, + "loss": 1.2998, "step": 133000 }, { - "epoch": 0.8680668443982054, - "grad_norm": 0.6328125, - "learning_rate": 0.0019652773262240717, - "loss": 1.5923, + "epoch": 13.88888888888889, + "grad_norm": 0.2421875, + "learning_rate": 0.0014444444444444444, + "loss": 1.3001, "step": 133500 }, { - "epoch": 0.8713180310813446, - "grad_norm": 1.3671875, - "learning_rate": 0.001965147278756746, - "loss": 1.585, + "epoch": 13.940907199334166, + "grad_norm": 0.2265625, + "learning_rate": 0.0014423637120266333, + "loss": 1.2998, "step": 134000 }, { - "epoch": 0.874569217764484, - "grad_norm": 0.69140625, - "learning_rate": 0.0019650172312894207, - "loss": 1.5838, + "epoch": 13.992925509779443, + "grad_norm": 0.455078125, + "learning_rate": 0.0014402829796088224, + "loss": 1.3013, "step": 134500 }, { - "epoch": 0.8778204044476234, - "grad_norm": 0.76953125, - "learning_rate": 0.0019648871838220954, - "loss": 
1.5818, + "epoch": 14.0, + "eval_loss": 1.3362102508544922, + "eval_runtime": 1.3748, + "eval_samples_per_second": 727.372, + "eval_steps_per_second": 0.727, + "step": 134568 + }, + { + "epoch": 14.044943820224718, + "grad_norm": 0.220703125, + "learning_rate": 0.0014382022471910111, + "loss": 1.2953, "step": 135000 }, { - "epoch": 0.8810715911307627, - "grad_norm": 0.8046875, - "learning_rate": 0.0019647571363547696, - "loss": 1.5889, + "epoch": 14.096962130669995, + "grad_norm": 0.2197265625, + "learning_rate": 0.0014361215147732003, + "loss": 1.2956, "step": 135500 }, { - "epoch": 0.8843227778139021, - "grad_norm": 0.69140625, - "learning_rate": 0.001964627088887444, - "loss": 1.587, + "epoch": 14.148980441115272, + "grad_norm": 0.3359375, + "learning_rate": 0.0014340407823553892, + "loss": 1.2953, "step": 136000 }, { - "epoch": 0.8875739644970414, - "grad_norm": 5.25, - "learning_rate": 0.0019644970414201186, - "loss": 1.5835, + "epoch": 14.20099875156055, + "grad_norm": 0.337890625, + "learning_rate": 0.001431960049937578, + "loss": 1.2959, "step": 136500 }, { - "epoch": 0.8908251511801808, - "grad_norm": 0.71875, - "learning_rate": 0.001964366993952793, - "loss": 1.5802, + "epoch": 14.253017062005826, + "grad_norm": 0.2158203125, + "learning_rate": 0.001429879317519767, + "loss": 1.2951, "step": 137000 }, { - "epoch": 0.8940763378633201, - "grad_norm": 0.76171875, - "learning_rate": 0.001964236946485467, - "loss": 1.5836, + "epoch": 14.305035372451103, + "grad_norm": 0.1953125, + "learning_rate": 0.001427798585101956, + "loss": 1.2948, "step": 137500 }, { - "epoch": 0.8973275245464595, - "grad_norm": 0.6484375, - "learning_rate": 0.001964106899018142, - "loss": 1.5861, + "epoch": 14.35705368289638, + "grad_norm": 0.412109375, + "learning_rate": 0.0014257178526841448, + "loss": 1.2962, "step": 138000 }, { - "epoch": 0.9005787112295988, - "grad_norm": 0.828125, - "learning_rate": 0.001963976851550816, - "loss": 1.5777, + "epoch": 14.409071993341657, + "grad_norm": 0.21484375, + "learning_rate": 0.001423637120266334, + "loss": 1.2941, "step": 138500 }, { - "epoch": 0.9038298979127382, - "grad_norm": 0.67578125, - "learning_rate": 0.0019638468040834903, - "loss": 1.5759, + "epoch": 14.461090303786934, + "grad_norm": 0.205078125, + "learning_rate": 0.0014215563878485226, + "loss": 1.2958, "step": 139000 }, { - "epoch": 0.9070810845958774, - "grad_norm": 2.625, - "learning_rate": 0.001963716756616165, - "loss": 1.5863, + "epoch": 14.513108614232209, + "grad_norm": 0.2255859375, + "learning_rate": 0.0014194756554307116, + "loss": 1.2949, "step": 139500 }, { - "epoch": 0.9103322712790168, - "grad_norm": 0.5390625, - "learning_rate": 0.0019635867091488393, - "loss": 1.5756, + "epoch": 14.565126924677486, + "grad_norm": 0.177734375, + "learning_rate": 0.0014173949230129007, + "loss": 1.2933, "step": 140000 }, { - "epoch": 0.9135834579621562, - "grad_norm": 0.7734375, - "learning_rate": 0.0019634566616815135, - "loss": 1.5774, + "epoch": 14.617145235122763, + "grad_norm": 0.291015625, + "learning_rate": 0.0014153141905950894, + "loss": 1.295, "step": 140500 }, { - "epoch": 0.9168346446452955, - "grad_norm": 2.828125, - "learning_rate": 0.0019633266142141882, - "loss": 1.5707, + "epoch": 14.66916354556804, + "grad_norm": 0.33984375, + "learning_rate": 0.0014132334581772783, + "loss": 1.2944, "step": 141000 }, { - "epoch": 0.9200858313284349, - "grad_norm": 0.640625, - "learning_rate": 0.001963196566746863, - "loss": 1.5849, + "epoch": 14.721181856013317, + "grad_norm": 0.19140625, + 
"learning_rate": 0.0014111527257594674, + "loss": 1.2923, "step": 141500 }, { - "epoch": 0.9233370180115742, - "grad_norm": 0.93359375, - "learning_rate": 0.001963066519279537, - "loss": 1.5774, + "epoch": 14.773200166458594, + "grad_norm": 0.271484375, + "learning_rate": 0.0014090719933416563, + "loss": 1.2925, "step": 142000 }, { - "epoch": 0.9265882046947136, - "grad_norm": 0.73828125, - "learning_rate": 0.0019629364718122115, - "loss": 1.5774, + "epoch": 14.82521847690387, + "grad_norm": 0.3828125, + "learning_rate": 0.0014069912609238453, + "loss": 1.2907, "step": 142500 }, { - "epoch": 0.9298393913778529, - "grad_norm": 0.7578125, - "learning_rate": 0.001962806424344886, - "loss": 1.5733, + "epoch": 14.877236787349148, + "grad_norm": 0.21875, + "learning_rate": 0.0014049105285060342, + "loss": 1.2936, "step": 143000 }, { - "epoch": 0.9330905780609923, - "grad_norm": 0.70703125, - "learning_rate": 0.0019626763768775604, - "loss": 1.5819, + "epoch": 14.929255097794425, + "grad_norm": 0.25390625, + "learning_rate": 0.001402829796088223, + "loss": 1.2927, "step": 143500 }, { - "epoch": 0.9363417647441316, - "grad_norm": 0.62890625, - "learning_rate": 0.0019625463294102347, - "loss": 1.5717, + "epoch": 14.9812734082397, + "grad_norm": 0.419921875, + "learning_rate": 0.001400749063670412, + "loss": 1.2911, "step": 144000 }, { - "epoch": 0.939592951427271, - "grad_norm": 0.63671875, - "learning_rate": 0.0019624162819429094, - "loss": 1.5737, + "epoch": 15.0, + "eval_loss": 1.3259565830230713, + "eval_runtime": 1.5089, + "eval_samples_per_second": 662.754, + "eval_steps_per_second": 0.663, + "step": 144180 + }, + { + "epoch": 15.033291718684977, + "grad_norm": 0.2216796875, + "learning_rate": 0.001398668331252601, + "loss": 1.2892, "step": 144500 }, { - "epoch": 0.9428441381104103, - "grad_norm": 1.3359375, - "learning_rate": 0.0019622862344755836, - "loss": 1.577, + "epoch": 15.085310029130254, + "grad_norm": 0.474609375, + "learning_rate": 0.0013965875988347898, + "loss": 1.2898, "step": 145000 }, { - "epoch": 0.9460953247935496, - "grad_norm": 0.96484375, - "learning_rate": 0.001962156187008258, - "loss": 1.5795, + "epoch": 15.13732833957553, + "grad_norm": 0.2119140625, + "learning_rate": 0.001394506866416979, + "loss": 1.2916, "step": 145500 }, { - "epoch": 0.949346511476689, - "grad_norm": 0.7578125, - "learning_rate": 0.0019620261395409326, - "loss": 1.5723, + "epoch": 15.189346650020807, + "grad_norm": 0.2373046875, + "learning_rate": 0.0013924261339991677, + "loss": 1.2906, "step": 146000 }, { - "epoch": 0.9525976981598283, - "grad_norm": 0.89453125, - "learning_rate": 0.001961896092073607, - "loss": 1.5775, + "epoch": 15.241364960466084, + "grad_norm": 0.294921875, + "learning_rate": 0.0013903454015813566, + "loss": 1.2905, "step": 146500 }, { - "epoch": 0.9558488848429677, - "grad_norm": 1.6875, - "learning_rate": 0.001961766044606281, - "loss": 1.5767, + "epoch": 15.293383270911361, + "grad_norm": 0.2734375, + "learning_rate": 0.0013882646691635457, + "loss": 1.2898, "step": 147000 }, { - "epoch": 0.959100071526107, - "grad_norm": 0.9765625, - "learning_rate": 0.001961635997138956, - "loss": 1.5762, + "epoch": 15.345401581356638, + "grad_norm": 0.2060546875, + "learning_rate": 0.0013861839367457346, + "loss": 1.2893, "step": 147500 }, { - "epoch": 0.9623512582092464, - "grad_norm": 0.6875, - "learning_rate": 0.00196150594967163, - "loss": 1.5697, + "epoch": 15.397419891801913, + "grad_norm": 0.2451171875, + "learning_rate": 0.0013841032043279233, + "loss": 1.2905, "step": 148000 
}, { - "epoch": 0.9656024448923857, - "grad_norm": 1.2265625, - "learning_rate": 0.0019613759022043043, - "loss": 1.5608, + "epoch": 15.44943820224719, + "grad_norm": 0.19140625, + "learning_rate": 0.0013820224719101124, + "loss": 1.2899, "step": 148500 }, { - "epoch": 0.9688536315755251, - "grad_norm": 0.7578125, - "learning_rate": 0.001961245854736979, - "loss": 1.5679, + "epoch": 15.501456512692467, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013799417394923014, + "loss": 1.2913, "step": 149000 }, { - "epoch": 0.9721048182586645, - "grad_norm": 0.6328125, - "learning_rate": 0.0019611158072696537, - "loss": 1.5627, + "epoch": 15.553474823137744, + "grad_norm": 0.2109375, + "learning_rate": 0.0013778610070744903, + "loss": 1.29, "step": 149500 }, { - "epoch": 0.9753560049418037, - "grad_norm": 1.046875, - "learning_rate": 0.001960985759802328, - "loss": 1.5678, + "epoch": 15.605493133583021, + "grad_norm": 0.279296875, + "learning_rate": 0.0013757802746566792, + "loss": 1.2906, "step": 150000 }, { - "epoch": 0.9786071916249431, - "grad_norm": 0.69921875, - "learning_rate": 0.0019608557123350022, - "loss": 1.5679, + "epoch": 15.657511444028298, + "grad_norm": 0.224609375, + "learning_rate": 0.001373699542238868, + "loss": 1.289, "step": 150500 }, { - "epoch": 0.9818583783080824, - "grad_norm": 0.69921875, - "learning_rate": 0.001960725664867677, - "loss": 1.5662, + "epoch": 15.709529754473575, + "grad_norm": 0.33984375, + "learning_rate": 0.0013716188098210572, + "loss": 1.2894, "step": 151000 }, { - "epoch": 0.9851095649912218, - "grad_norm": 0.65234375, - "learning_rate": 0.001960595617400351, - "loss": 1.5687, + "epoch": 15.761548064918852, + "grad_norm": 0.271484375, + "learning_rate": 0.001369538077403246, + "loss": 1.2876, "step": 151500 }, { - "epoch": 0.9883607516743611, - "grad_norm": 3.328125, - "learning_rate": 0.0019604655699330255, - "loss": 1.5623, + "epoch": 15.813566375364129, + "grad_norm": 0.201171875, + "learning_rate": 0.0013674573449854348, + "loss": 1.2886, "step": 152000 }, { - "epoch": 0.9916119383575005, - "grad_norm": 1.0, - "learning_rate": 0.0019603355224657, - "loss": 1.5668, + "epoch": 15.865584685809406, + "grad_norm": 0.2158203125, + "learning_rate": 0.001365376612567624, + "loss": 1.2875, "step": 152500 }, { - "epoch": 0.9948631250406398, - "grad_norm": 0.65625, - "learning_rate": 0.0019602054749983744, - "loss": 1.5668, + "epoch": 15.917602996254681, + "grad_norm": 0.1943359375, + "learning_rate": 0.0013632958801498127, + "loss": 1.2874, "step": 153000 }, { - "epoch": 0.9981143117237792, - "grad_norm": 1.03125, - "learning_rate": 0.0019600754275310487, - "loss": 1.5688, + "epoch": 15.969621306699958, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013612151477320016, + "loss": 1.2876, "step": 153500 }, { - "epoch": 1.0, - "eval_loss": 1.5614691972732544, - "eval_runtime": 0.5382, - "eval_samples_per_second": 1857.967, - "eval_steps_per_second": 29.727, - "step": 153790 + "epoch": 16.0, + "eval_loss": 1.3189575672149658, + "eval_runtime": 1.5607, + "eval_samples_per_second": 640.73, + "eval_steps_per_second": 0.641, + "step": 153792 }, { - "epoch": 1.0013654984069185, - "grad_norm": 3.140625, - "learning_rate": 0.0019599453800637234, - "loss": 1.5728, + "epoch": 16.021639617145237, + "grad_norm": 0.20703125, + "learning_rate": 0.0013591344153141907, + "loss": 1.285, "step": 154000 }, { - "epoch": 1.004616685090058, - "grad_norm": 1.1484375, - "learning_rate": 0.0019598153325963976, - "loss": 1.5747, + "epoch": 16.073657927590514, + "grad_norm": 
0.1845703125, + "learning_rate": 0.0013570536828963796, + "loss": 1.2834, "step": 154500 }, { - "epoch": 1.0078678717731973, - "grad_norm": 0.828125, - "learning_rate": 0.001959685285129072, - "loss": 1.572, + "epoch": 16.125676238035787, + "grad_norm": 0.310546875, + "learning_rate": 0.0013549729504785683, + "loss": 1.2828, "step": 155000 }, { - "epoch": 1.0111190584563365, - "grad_norm": 0.7578125, - "learning_rate": 0.0019595552376617466, - "loss": 1.5646, + "epoch": 16.177694548481064, + "grad_norm": 0.291015625, + "learning_rate": 0.0013528922180607575, + "loss": 1.2826, "step": 155500 }, { - "epoch": 1.0143702451394758, - "grad_norm": 1.953125, - "learning_rate": 0.0019594251901944213, - "loss": 1.5695, + "epoch": 16.22971285892634, + "grad_norm": 0.208984375, + "learning_rate": 0.0013508114856429464, + "loss": 1.2825, "step": 156000 }, { - "epoch": 1.0176214318226153, - "grad_norm": 0.9296875, - "learning_rate": 0.0019592951427270956, - "loss": 1.5595, + "epoch": 16.281731169371618, + "grad_norm": 0.2578125, + "learning_rate": 0.001348730753225135, + "loss": 1.2836, "step": 156500 }, { - "epoch": 1.0208726185057546, - "grad_norm": 0.81640625, - "learning_rate": 0.00195916509525977, - "loss": 1.5574, + "epoch": 16.333749479816895, + "grad_norm": 0.291015625, + "learning_rate": 0.0013466500208073242, + "loss": 1.2853, "step": 157000 }, { - "epoch": 1.024123805188894, - "grad_norm": 1.1484375, - "learning_rate": 0.0019590350477924445, - "loss": 1.5624, + "epoch": 16.38576779026217, + "grad_norm": 0.208984375, + "learning_rate": 0.0013445692883895131, + "loss": 1.2859, "step": 157500 }, { - "epoch": 1.0273749918720332, - "grad_norm": 0.6953125, - "learning_rate": 0.0019589050003251188, - "loss": 1.5611, + "epoch": 16.43778610070745, + "grad_norm": 0.267578125, + "learning_rate": 0.0013424885559717022, + "loss": 1.2841, "step": 158000 }, { - "epoch": 1.0306261785551727, - "grad_norm": 2.828125, - "learning_rate": 0.001958774952857793, - "loss": 1.5626, + "epoch": 16.489804411152726, + "grad_norm": 0.259765625, + "learning_rate": 0.001340407823553891, + "loss": 1.2834, "step": 158500 }, { - "epoch": 1.033877365238312, - "grad_norm": 0.859375, - "learning_rate": 0.0019586449053904677, - "loss": 1.5657, + "epoch": 16.541822721598002, + "grad_norm": 0.197265625, + "learning_rate": 0.0013383270911360799, + "loss": 1.2834, "step": 159000 }, { - "epoch": 1.0371285519214513, - "grad_norm": 0.78125, - "learning_rate": 0.001958514857923142, - "loss": 1.5622, + "epoch": 16.59384103204328, + "grad_norm": 0.19140625, + "learning_rate": 0.001336246358718269, + "loss": 1.2834, "step": 159500 }, { - "epoch": 1.0403797386045908, - "grad_norm": 0.671875, - "learning_rate": 0.0019583848104558163, - "loss": 1.5667, + "epoch": 16.645859342488556, + "grad_norm": 0.29296875, + "learning_rate": 0.001334165626300458, + "loss": 1.2856, "step": 160000 }, { - "epoch": 1.04363092528773, - "grad_norm": 7.78125, - "learning_rate": 0.001958254762988491, - "loss": 1.5694, + "epoch": 16.697877652933833, + "grad_norm": 0.50390625, + "learning_rate": 0.0013320848938826466, + "loss": 1.2829, "step": 160500 }, { - "epoch": 1.0468821119708693, - "grad_norm": 1.0546875, - "learning_rate": 0.001958124715521165, - "loss": 1.5726, + "epoch": 16.74989596337911, + "grad_norm": 0.2119140625, + "learning_rate": 0.0013300041614648357, + "loss": 1.283, "step": 161000 }, { - "epoch": 1.0501332986540086, - "grad_norm": 0.96875, - "learning_rate": 0.0019579946680538395, - "loss": 1.5669, + "epoch": 16.801914273824387, + "grad_norm": 
0.208984375, + "learning_rate": 0.0013279234290470246, + "loss": 1.2848, "step": 161500 }, { - "epoch": 1.0533844853371481, - "grad_norm": 0.7265625, - "learning_rate": 0.001957864620586514, - "loss": 1.567, + "epoch": 16.853932584269664, + "grad_norm": 0.2216796875, + "learning_rate": 0.0013258426966292133, + "loss": 1.2837, "step": 162000 }, { - "epoch": 1.0566356720202874, - "grad_norm": 0.51953125, - "learning_rate": 0.0019577345731191884, - "loss": 1.5696, + "epoch": 16.90595089471494, + "grad_norm": 0.2109375, + "learning_rate": 0.0013237619642114025, + "loss": 1.2824, "step": 162500 }, { - "epoch": 1.0598868587034267, - "grad_norm": 0.6796875, - "learning_rate": 0.0019576045256518627, - "loss": 1.5617, + "epoch": 16.957969205160218, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013216812317935914, + "loss": 1.284, "step": 163000 }, { - "epoch": 1.0631380453865662, - "grad_norm": 0.7890625, - "learning_rate": 0.0019574744781845374, - "loss": 1.5626, + "epoch": 17.0, + "eval_loss": 1.3203132152557373, + "eval_runtime": 1.432, + "eval_samples_per_second": 698.327, + "eval_steps_per_second": 0.698, + "step": 163404 + }, + { + "epoch": 17.00998751560549, + "grad_norm": 0.33984375, + "learning_rate": 0.0013196004993757803, + "loss": 1.2829, "step": 163500 }, { - "epoch": 1.0663892320697055, - "grad_norm": 0.62109375, - "learning_rate": 0.001957344430717212, - "loss": 1.5643, + "epoch": 17.06200582605077, + "grad_norm": 0.21875, + "learning_rate": 0.0013175197669579692, + "loss": 1.2788, "step": 164000 }, { - "epoch": 1.0696404187528448, - "grad_norm": 0.8125, - "learning_rate": 0.0019572143832498863, - "loss": 1.5572, + "epoch": 17.114024136496045, + "grad_norm": 0.25390625, + "learning_rate": 0.0013154390345401581, + "loss": 1.2805, "step": 164500 }, { - "epoch": 1.072891605435984, - "grad_norm": 3.140625, - "learning_rate": 0.0019570843357825606, - "loss": 1.5707, + "epoch": 17.166042446941322, + "grad_norm": 0.2275390625, + "learning_rate": 0.0013133583021223473, + "loss": 1.2815, "step": 165000 }, { - "epoch": 1.0761427921191236, - "grad_norm": 0.7421875, - "learning_rate": 0.0019569542883152353, - "loss": 1.5728, + "epoch": 17.2180607573866, + "grad_norm": 0.41015625, + "learning_rate": 0.001311277569704536, + "loss": 1.2817, "step": 165500 }, { - "epoch": 1.0793939788022628, - "grad_norm": 1.0390625, - "learning_rate": 0.0019568242408479096, - "loss": 1.5795, + "epoch": 17.270079067831876, + "grad_norm": 0.345703125, + "learning_rate": 0.0013091968372867249, + "loss": 1.2835, "step": 166000 }, { - "epoch": 1.0826451654854021, - "grad_norm": 0.640625, - "learning_rate": 0.001956694193380584, - "loss": 1.5778, + "epoch": 17.322097378277153, + "grad_norm": 0.2216796875, + "learning_rate": 0.001307116104868914, + "loss": 1.2823, "step": 166500 }, { - "epoch": 1.0858963521685414, - "grad_norm": 0.5625, - "learning_rate": 0.0019565641459132585, - "loss": 1.5702, + "epoch": 17.37411568872243, + "grad_norm": 0.2158203125, + "learning_rate": 0.001305035372451103, + "loss": 1.2831, "step": 167000 }, { - "epoch": 1.089147538851681, - "grad_norm": 0.91015625, - "learning_rate": 0.0019564340984459328, - "loss": 1.5654, + "epoch": 17.426133999167707, + "grad_norm": 0.2216796875, + "learning_rate": 0.0013029546400332916, + "loss": 1.283, "step": 167500 }, { - "epoch": 1.0923987255348202, - "grad_norm": 0.671875, - "learning_rate": 0.001956304050978607, - "loss": 1.5677, + "epoch": 17.478152309612984, + "grad_norm": 0.3984375, + "learning_rate": 0.0013008739076154807, + "loss": 1.2823, "step": 
168000 }, { - "epoch": 1.0956499122179595, - "grad_norm": 0.80078125, - "learning_rate": 0.0019561740035112817, - "loss": 1.5646, + "epoch": 17.53017062005826, + "grad_norm": 0.2333984375, + "learning_rate": 0.0012987931751976696, + "loss": 1.2826, "step": 168500 }, { - "epoch": 1.098901098901099, - "grad_norm": 1.5859375, - "learning_rate": 0.001956043956043956, - "loss": 1.583, + "epoch": 17.582188930503538, + "grad_norm": 0.255859375, + "learning_rate": 0.0012967124427798583, + "loss": 1.2823, "step": 169000 }, { - "epoch": 1.1021522855842383, - "grad_norm": 0.66796875, - "learning_rate": 0.0019559139085766303, - "loss": 1.5675, + "epoch": 17.634207240948815, + "grad_norm": 0.373046875, + "learning_rate": 0.0012946317103620475, + "loss": 1.2851, "step": 169500 }, { - "epoch": 1.1054034722673776, - "grad_norm": 0.7265625, - "learning_rate": 0.001955783861109305, - "loss": 1.5986, + "epoch": 17.68622555139409, + "grad_norm": 0.2138671875, + "learning_rate": 0.0012925509779442364, + "loss": 1.2828, "step": 170000 }, { - "epoch": 1.1086546589505168, - "grad_norm": 0.71484375, - "learning_rate": 0.0019556538136419797, - "loss": 1.6581, + "epoch": 17.73824386183937, + "grad_norm": 0.21875, + "learning_rate": 0.0012904702455264253, + "loss": 1.2842, "step": 170500 }, { - "epoch": 1.1119058456336564, - "grad_norm": 0.671875, - "learning_rate": 0.001955523766174654, - "loss": 1.6557, + "epoch": 17.790262172284645, + "grad_norm": 0.2080078125, + "learning_rate": 0.0012883895131086142, + "loss": 1.2839, "step": 171000 }, { - "epoch": 1.1151570323167956, - "grad_norm": 0.8359375, - "learning_rate": 0.001955393718707328, - "loss": 1.6413, + "epoch": 17.842280482729922, + "grad_norm": 0.25390625, + "learning_rate": 0.0012863087806908031, + "loss": 1.2848, "step": 171500 }, { - "epoch": 1.118408218999935, - "grad_norm": 0.859375, - "learning_rate": 0.001955263671240003, - "loss": 1.6369, + "epoch": 17.8942987931752, + "grad_norm": 1.0, + "learning_rate": 0.0012842280482729923, + "loss": 1.2834, "step": 172000 }, { - "epoch": 1.1216594056830744, - "grad_norm": 0.6328125, - "learning_rate": 0.001955133623772677, - "loss": 1.637, + "epoch": 17.946317103620473, + "grad_norm": 0.2099609375, + "learning_rate": 0.0012821473158551812, + "loss": 1.2839, "step": 172500 }, { - "epoch": 1.1249105923662137, - "grad_norm": 0.82421875, - "learning_rate": 0.0019550035763053514, - "loss": 1.6328, + "epoch": 17.99833541406575, + "grad_norm": 0.283203125, + "learning_rate": 0.0012800665834373699, + "loss": 1.2837, "step": 173000 }, { - "epoch": 1.128161779049353, - "grad_norm": 2.4375, - "learning_rate": 0.001954873528838026, - "loss": 1.6243, + "epoch": 18.0, + "eval_loss": 1.3176885843276978, + "eval_runtime": 1.6332, + "eval_samples_per_second": 612.278, + "eval_steps_per_second": 0.612, + "step": 173016 + }, + { + "epoch": 18.050353724511027, + "grad_norm": 0.7734375, + "learning_rate": 0.001277985851019559, + "loss": 1.2788, "step": 173500 }, { - "epoch": 1.1314129657324923, - "grad_norm": 3.828125, - "learning_rate": 0.0019547434813707004, - "loss": 1.6165, + "epoch": 18.102372034956304, + "grad_norm": 0.224609375, + "learning_rate": 0.001275905118601748, + "loss": 1.28, "step": 174000 }, { - "epoch": 1.1346641524156318, - "grad_norm": 0.83984375, - "learning_rate": 0.0019546134339033746, - "loss": 1.6187, + "epoch": 18.15439034540158, + "grad_norm": 0.251953125, + "learning_rate": 0.0012738243861839366, + "loss": 1.2797, "step": 174500 }, { - "epoch": 1.137915339098771, - "grad_norm": 0.6640625, - 
"learning_rate": 0.0019544833864360493, - "loss": 1.6138, + "epoch": 18.206408655846857, + "grad_norm": 0.2890625, + "learning_rate": 0.0012717436537661257, + "loss": 1.2813, "step": 175000 }, { - "epoch": 1.1411665257819104, - "grad_norm": 0.73046875, - "learning_rate": 0.0019543533389687236, - "loss": 1.6141, + "epoch": 18.258426966292134, + "grad_norm": 0.205078125, + "learning_rate": 0.0012696629213483147, + "loss": 1.2816, "step": 175500 }, { - "epoch": 1.1444177124650499, - "grad_norm": 1.3515625, - "learning_rate": 0.001954223291501398, - "loss": 1.6027, + "epoch": 18.31044527673741, + "grad_norm": 0.21484375, + "learning_rate": 0.0012675821889305036, + "loss": 1.282, "step": 176000 }, { - "epoch": 1.1476688991481891, - "grad_norm": 0.95703125, - "learning_rate": 0.0019540932440340725, - "loss": 1.6105, + "epoch": 18.36246358718269, + "grad_norm": 0.62109375, + "learning_rate": 0.0012655014565126925, + "loss": 1.2821, "step": 176500 }, { - "epoch": 1.1509200858313284, - "grad_norm": 0.61328125, - "learning_rate": 0.001953963196566747, - "loss": 1.6119, + "epoch": 18.414481897627965, + "grad_norm": 0.2353515625, + "learning_rate": 0.0012634207240948814, + "loss": 1.2827, "step": 177000 }, { - "epoch": 1.1541712725144677, - "grad_norm": 0.61328125, - "learning_rate": 0.001953833149099421, - "loss": 1.6046, + "epoch": 18.466500208073242, + "grad_norm": 0.2197265625, + "learning_rate": 0.0012613399916770703, + "loss": 1.2802, "step": 177500 }, { - "epoch": 1.1574224591976072, - "grad_norm": 1.046875, - "learning_rate": 0.0019537031016320957, - "loss": 1.6093, + "epoch": 18.51851851851852, + "grad_norm": 0.181640625, + "learning_rate": 0.0012592592592592592, + "loss": 1.2803, "step": 178000 }, { - "epoch": 1.1606736458807465, - "grad_norm": 0.8046875, - "learning_rate": 0.0019535730541647704, - "loss": 1.6047, + "epoch": 18.570536828963796, + "grad_norm": 0.353515625, + "learning_rate": 0.0012571785268414481, + "loss": 1.2808, "step": 178500 }, { - "epoch": 1.1639248325638858, - "grad_norm": 1.4140625, - "learning_rate": 0.0019534430066974447, - "loss": 1.5999, + "epoch": 18.622555139409073, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012550977944236373, + "loss": 1.2795, "step": 179000 }, { - "epoch": 1.167176019247025, - "grad_norm": 0.8515625, - "learning_rate": 0.001953312959230119, - "loss": 1.5978, + "epoch": 18.67457344985435, + "grad_norm": 0.361328125, + "learning_rate": 0.0012530170620058262, + "loss": 1.2789, "step": 179500 }, { - "epoch": 1.1704272059301646, - "grad_norm": 1.1796875, - "learning_rate": 0.0019531829117627937, - "loss": 1.5949, + "epoch": 18.726591760299627, + "grad_norm": 0.2158203125, + "learning_rate": 0.0012509363295880149, + "loss": 1.2805, "step": 180000 }, { - "epoch": 1.1736783926133039, - "grad_norm": 0.8828125, - "learning_rate": 0.001953052864295468, - "loss": 1.5913, + "epoch": 18.778610070744904, + "grad_norm": 1.640625, + "learning_rate": 0.001248855597170204, + "loss": 1.2808, "step": 180500 }, { - "epoch": 1.1769295792964432, - "grad_norm": 0.69921875, - "learning_rate": 0.0019529228168281424, - "loss": 1.5873, + "epoch": 18.83062838119018, + "grad_norm": 0.25390625, + "learning_rate": 0.001246774864752393, + "loss": 1.2798, "step": 181000 }, { - "epoch": 1.1801807659795824, - "grad_norm": 0.609375, - "learning_rate": 0.0019527927693608167, - "loss": 1.5831, + "epoch": 18.882646691635454, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012446941323345816, + "loss": 1.2794, "step": 181500 }, { - "epoch": 1.183431952662722, - 
"grad_norm": 0.71484375, - "learning_rate": 0.0019526627218934911, - "loss": 1.575, + "epoch": 18.93466500208073, + "grad_norm": 0.19140625, + "learning_rate": 0.0012426133999167708, + "loss": 1.2801, "step": 182000 }, { - "epoch": 1.1866831393458612, - "grad_norm": 0.890625, - "learning_rate": 0.0019525326744261656, - "loss": 1.583, + "epoch": 18.986683312526008, + "grad_norm": 0.181640625, + "learning_rate": 0.0012405326674989597, + "loss": 1.2823, "step": 182500 }, { - "epoch": 1.1899343260290005, - "grad_norm": 0.95703125, - "learning_rate": 0.0019524026269588399, - "loss": 1.5749, + "epoch": 19.0, + "eval_loss": 1.3176276683807373, + "eval_runtime": 1.3968, + "eval_samples_per_second": 715.946, + "eval_steps_per_second": 0.716, + "step": 182628 + }, + { + "epoch": 19.038701622971285, + "grad_norm": 0.203125, + "learning_rate": 0.0012384519350811486, + "loss": 1.2802, "step": 183000 }, { - "epoch": 1.19318551271214, - "grad_norm": 0.640625, - "learning_rate": 0.0019522725794915144, - "loss": 1.5779, + "epoch": 19.090719933416562, + "grad_norm": 0.326171875, + "learning_rate": 0.0012363712026633375, + "loss": 1.2782, "step": 183500 }, { - "epoch": 1.1964366993952793, - "grad_norm": 3.5625, - "learning_rate": 0.0019521425320241888, - "loss": 1.5732, + "epoch": 19.14273824386184, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012342904702455264, + "loss": 1.2769, "step": 184000 }, { - "epoch": 1.1996878860784186, - "grad_norm": 0.578125, - "learning_rate": 0.001952012484556863, - "loss": 1.5771, + "epoch": 19.194756554307116, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012322097378277153, + "loss": 1.2789, "step": 184500 }, { - "epoch": 1.2029390727615579, - "grad_norm": 0.90625, - "learning_rate": 0.0019518824370895378, - "loss": 1.58, + "epoch": 19.246774864752393, + "grad_norm": 0.24609375, + "learning_rate": 0.0012301290054099045, + "loss": 1.279, "step": 185000 }, { - "epoch": 1.2061902594446974, - "grad_norm": 0.68359375, - "learning_rate": 0.0019517523896222123, - "loss": 1.5675, + "epoch": 19.29879317519767, + "grad_norm": 0.2080078125, + "learning_rate": 0.0012280482729920932, + "loss": 1.2813, "step": 185500 }, { - "epoch": 1.2094414461278367, - "grad_norm": 4.25, - "learning_rate": 0.0019516223421548868, - "loss": 1.563, + "epoch": 19.350811485642947, + "grad_norm": 0.1923828125, + "learning_rate": 0.0012259675405742823, + "loss": 1.2811, "step": 186000 }, { - "epoch": 1.212692632810976, - "grad_norm": 1.1015625, - "learning_rate": 0.001951492294687561, - "loss": 1.5731, + "epoch": 19.402829796088223, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012238868081564712, + "loss": 1.2826, "step": 186500 }, { - "epoch": 1.2159438194941155, - "grad_norm": 0.70703125, - "learning_rate": 0.0019513622472202355, - "loss": 1.569, + "epoch": 19.4548481065335, + "grad_norm": 0.27734375, + "learning_rate": 0.00122180607573866, + "loss": 1.2811, "step": 187000 }, { - "epoch": 1.2191950061772547, - "grad_norm": 0.984375, - "learning_rate": 0.00195123219975291, - "loss": 1.5707, + "epoch": 19.506866416978777, + "grad_norm": 0.2236328125, + "learning_rate": 0.001219725343320849, + "loss": 1.2814, "step": 187500 }, { - "epoch": 1.222446192860394, - "grad_norm": 0.92578125, - "learning_rate": 0.0019511021522855842, - "loss": 1.5635, + "epoch": 19.558884727424054, + "grad_norm": 0.41015625, + "learning_rate": 0.001217644610903038, + "loss": 1.2794, "step": 188000 }, { - "epoch": 1.2256973795435333, - "grad_norm": 0.76953125, - "learning_rate": 0.0019509721048182587, - "loss": 1.5603, 
+ "epoch": 19.61090303786933, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012155638784852269, + "loss": 1.2801, "step": 188500 }, { - "epoch": 1.2289485662266728, - "grad_norm": 0.80859375, - "learning_rate": 0.0019508420573509332, - "loss": 1.5645, + "epoch": 19.662921348314608, + "grad_norm": 0.435546875, + "learning_rate": 0.0012134831460674158, + "loss": 1.2796, "step": 189000 }, { - "epoch": 1.232199752909812, - "grad_norm": 0.640625, - "learning_rate": 0.0019507120098836074, - "loss": 1.5594, + "epoch": 19.714939658759885, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012114024136496047, + "loss": 1.2785, "step": 189500 }, { - "epoch": 1.2354509395929514, - "grad_norm": 1.015625, - "learning_rate": 0.001950581962416282, - "loss": 1.5594, + "epoch": 19.76695796920516, + "grad_norm": 5.6875, + "learning_rate": 0.0012093216812317936, + "loss": 1.2783, "step": 190000 }, { - "epoch": 1.2387021262760909, - "grad_norm": 0.94140625, - "learning_rate": 0.0019504519149489564, - "loss": 1.5585, + "epoch": 19.818976279650435, + "grad_norm": 0.205078125, + "learning_rate": 0.0012072409488139825, + "loss": 1.2782, "step": 190500 }, { - "epoch": 1.2419533129592302, - "grad_norm": 0.83984375, - "learning_rate": 0.0019503218674816307, - "loss": 1.5542, + "epoch": 19.870994590095712, + "grad_norm": 3.21875, + "learning_rate": 0.0012051602163961714, + "loss": 1.2792, "step": 191000 }, { - "epoch": 1.2452044996423695, - "grad_norm": 2.671875, - "learning_rate": 0.0019501918200143051, - "loss": 1.554, + "epoch": 19.92301290054099, + "grad_norm": 0.2119140625, + "learning_rate": 0.0012030794839783603, + "loss": 1.2803, "step": 191500 }, { - "epoch": 1.2484556863255087, - "grad_norm": 1.890625, - "learning_rate": 0.0019500617725469796, - "loss": 1.5572, + "epoch": 19.975031210986266, + "grad_norm": 0.419921875, + "learning_rate": 0.0012009987515605495, + "loss": 1.2778, "step": 192000 }, { - "epoch": 1.251706873008648, - "grad_norm": 0.78515625, - "learning_rate": 0.0019499317250796543, - "loss": 1.5587, + "epoch": 20.0, + "eval_loss": 1.3075087070465088, + "eval_runtime": 1.6065, + "eval_samples_per_second": 622.457, + "eval_steps_per_second": 0.622, + "step": 192240 + }, + { + "epoch": 20.027049521431543, + "grad_norm": 0.296875, + "learning_rate": 0.0011989180191427382, + "loss": 1.2763, "step": 192500 }, { - "epoch": 1.2549580596917875, - "grad_norm": 8.4375, - "learning_rate": 0.0019498016776123286, - "loss": 1.5515, + "epoch": 20.07906783187682, + "grad_norm": 0.26171875, + "learning_rate": 0.0011968372867249273, + "loss": 1.2755, "step": 193000 }, { - "epoch": 1.2582092463749268, - "grad_norm": 0.60546875, - "learning_rate": 0.001949671630145003, - "loss": 1.5584, + "epoch": 20.131086142322097, + "grad_norm": 0.220703125, + "learning_rate": 0.0011947565543071162, + "loss": 1.2754, "step": 193500 }, { - "epoch": 1.2614604330580663, - "grad_norm": 0.62890625, - "learning_rate": 0.0019495415826776775, - "loss": 1.5514, + "epoch": 20.183104452767374, + "grad_norm": 0.1875, + "learning_rate": 0.001192675821889305, + "loss": 1.2756, "step": 194000 }, { - "epoch": 1.2647116197412056, - "grad_norm": 0.7890625, - "learning_rate": 0.0019494115352103518, - "loss": 1.5492, + "epoch": 20.23512276321265, + "grad_norm": 0.2734375, + "learning_rate": 0.001190595089471494, + "loss": 1.278, "step": 194500 }, { - "epoch": 1.267962806424345, - "grad_norm": 0.8125, - "learning_rate": 0.0019492814877430263, - "loss": 1.5522, + "epoch": 20.287141073657928, + "grad_norm": 0.1767578125, + "learning_rate": 
0.001188514357053683, + "loss": 1.2766, "step": 195000 }, { - "epoch": 1.2712139931074842, - "grad_norm": 0.65625, - "learning_rate": 0.0019491514402757008, - "loss": 1.5548, + "epoch": 20.339159384103205, + "grad_norm": 0.466796875, + "learning_rate": 0.0011864336246358719, + "loss": 1.2769, "step": 195500 }, { - "epoch": 1.2744651797906235, - "grad_norm": 0.703125, - "learning_rate": 0.001949021392808375, - "loss": 1.55, + "epoch": 20.39117769454848, + "grad_norm": 0.189453125, + "learning_rate": 0.0011843528922180608, + "loss": 1.2788, "step": 196000 }, { - "epoch": 1.277716366473763, - "grad_norm": 1.21875, - "learning_rate": 0.0019488913453410495, - "loss": 1.5467, + "epoch": 20.44319600499376, + "grad_norm": 0.20703125, + "learning_rate": 0.0011822721598002497, + "loss": 1.279, "step": 196500 }, { - "epoch": 1.2809675531569022, - "grad_norm": 0.640625, - "learning_rate": 0.001948761297873724, - "loss": 1.5492, + "epoch": 20.495214315439036, + "grad_norm": 0.17578125, + "learning_rate": 0.0011801914273824386, + "loss": 1.2782, "step": 197000 }, { - "epoch": 1.2842187398400415, - "grad_norm": 0.65625, - "learning_rate": 0.0019486312504063982, - "loss": 1.5494, + "epoch": 20.547232625884313, + "grad_norm": 0.25390625, + "learning_rate": 0.0011781106949646277, + "loss": 1.2773, "step": 197500 }, { - "epoch": 1.287469926523181, - "grad_norm": 0.66796875, - "learning_rate": 0.0019485012029390727, - "loss": 1.5533, + "epoch": 20.59925093632959, + "grad_norm": 0.22265625, + "learning_rate": 0.0011760299625468164, + "loss": 1.279, "step": 198000 }, { - "epoch": 1.2907211132063203, - "grad_norm": 1.09375, - "learning_rate": 0.0019483711554717472, - "loss": 1.5472, + "epoch": 20.651269246774866, + "grad_norm": 0.265625, + "learning_rate": 0.0011739492301290053, + "loss": 1.2786, "step": 198500 }, { - "epoch": 1.2939722998894596, - "grad_norm": 0.74609375, - "learning_rate": 0.0019482411080044215, - "loss": 1.5491, + "epoch": 20.70328755722014, + "grad_norm": 0.296875, + "learning_rate": 0.0011718684977111945, + "loss": 1.2788, "step": 199000 }, { - "epoch": 1.297223486572599, - "grad_norm": 1.109375, - "learning_rate": 0.0019481110605370962, - "loss": 1.5507, + "epoch": 20.755305867665417, + "grad_norm": 0.8046875, + "learning_rate": 0.0011697877652933832, + "loss": 1.2781, "step": 199500 }, { - "epoch": 1.3004746732557384, - "grad_norm": 0.92578125, - "learning_rate": 0.0019479810130697706, - "loss": 1.5485, + "epoch": 20.807324178110694, + "grad_norm": 0.2314453125, + "learning_rate": 0.001167707032875572, + "loss": 1.2789, "step": 200000 }, { - "epoch": 1.3037258599388777, - "grad_norm": 0.78125, - "learning_rate": 0.0019478509656024451, - "loss": 1.5496, + "epoch": 20.85934248855597, + "grad_norm": 0.23046875, + "learning_rate": 0.0011656263004577612, + "loss": 1.2809, "step": 200500 }, { - "epoch": 1.306977046622017, - "grad_norm": 0.61328125, - "learning_rate": 0.0019477209181351194, - "loss": 1.5401, + "epoch": 20.911360799001248, + "grad_norm": 0.21875, + "learning_rate": 0.0011635455680399501, + "loss": 1.2778, "step": 201000 }, { - "epoch": 1.3102282333051565, - "grad_norm": 0.609375, - "learning_rate": 0.0019475908706677938, - "loss": 1.5444, + "epoch": 20.963379109446524, + "grad_norm": 0.265625, + "learning_rate": 0.001161464835622139, + "loss": 1.2777, "step": 201500 }, { - "epoch": 1.3134794199882958, - "grad_norm": 0.8203125, - "learning_rate": 0.0019474608232004683, - "loss": 1.5402, + "epoch": 21.0, + "eval_loss": 1.3104900121688843, + "eval_runtime": 1.3896, + 
"eval_samples_per_second": 719.624, + "eval_steps_per_second": 0.72, + "step": 201852 + }, + { + "epoch": 21.0153974198918, + "grad_norm": 0.2265625, + "learning_rate": 0.001159384103204328, + "loss": 1.2777, "step": 202000 }, { - "epoch": 1.316730606671435, - "grad_norm": 1.484375, - "learning_rate": 0.0019473307757331426, - "loss": 1.5417, + "epoch": 21.06741573033708, + "grad_norm": 0.173828125, + "learning_rate": 0.0011573033707865169, + "loss": 1.2763, "step": 202500 }, { - "epoch": 1.3199817933545743, - "grad_norm": 0.93359375, - "learning_rate": 0.001947200728265817, - "loss": 1.5397, + "epoch": 21.119434040782355, + "grad_norm": 0.2353515625, + "learning_rate": 0.0011552226383687058, + "loss": 1.2753, "step": 203000 }, { - "epoch": 1.3232329800377138, - "grad_norm": 0.70703125, - "learning_rate": 0.0019470706807984915, - "loss": 1.5408, + "epoch": 21.171452351227632, + "grad_norm": 0.1875, + "learning_rate": 0.0011531419059508947, + "loss": 1.2752, "step": 203500 }, { - "epoch": 1.3264841667208531, - "grad_norm": 0.53515625, - "learning_rate": 0.0019469406333311658, - "loss": 1.54, + "epoch": 21.22347066167291, + "grad_norm": 0.251953125, + "learning_rate": 0.0011510611735330836, + "loss": 1.2754, "step": 204000 }, { - "epoch": 1.3297353534039924, - "grad_norm": 0.98046875, - "learning_rate": 0.0019468105858638403, - "loss": 1.5395, + "epoch": 21.275488972118186, + "grad_norm": 0.251953125, + "learning_rate": 0.0011489804411152727, + "loss": 1.2756, "step": 204500 }, { - "epoch": 1.332986540087132, - "grad_norm": 0.9453125, - "learning_rate": 0.0019466805383965148, - "loss": 1.5428, + "epoch": 21.327507282563463, + "grad_norm": 0.2412109375, + "learning_rate": 0.0011468997086974614, + "loss": 1.2754, "step": 205000 }, { - "epoch": 1.3362377267702712, - "grad_norm": 0.91796875, - "learning_rate": 0.001946550490929189, - "loss": 1.533, + "epoch": 21.37952559300874, + "grad_norm": 0.208984375, + "learning_rate": 0.0011448189762796504, + "loss": 1.2753, "step": 205500 }, { - "epoch": 1.3394889134534105, - "grad_norm": 0.7578125, - "learning_rate": 0.0019464204434618635, - "loss": 1.5305, + "epoch": 21.431543903454017, + "grad_norm": 0.361328125, + "learning_rate": 0.0011427382438618395, + "loss": 1.2757, "step": 206000 }, { - "epoch": 1.3427401001365498, - "grad_norm": 0.73046875, - "learning_rate": 0.001946290395994538, - "loss": 1.5351, + "epoch": 21.483562213899294, + "grad_norm": 0.19140625, + "learning_rate": 0.0011406575114440284, + "loss": 1.2755, "step": 206500 }, { - "epoch": 1.345991286819689, - "grad_norm": 0.65625, - "learning_rate": 0.0019461603485272127, - "loss": 1.5327, + "epoch": 21.53558052434457, + "grad_norm": 0.248046875, + "learning_rate": 0.001138576779026217, + "loss": 1.2764, "step": 207000 }, { - "epoch": 1.3492424735028286, - "grad_norm": 0.92578125, - "learning_rate": 0.001946030301059887, - "loss": 1.5305, + "epoch": 21.587598834789844, + "grad_norm": 0.1953125, + "learning_rate": 0.0011364960466084062, + "loss": 1.2765, "step": 207500 }, { - "epoch": 1.3524936601859678, - "grad_norm": 0.81640625, - "learning_rate": 0.0019459002535925614, - "loss": 1.5374, + "epoch": 21.63961714523512, + "grad_norm": 0.48828125, + "learning_rate": 0.0011344153141905951, + "loss": 1.2756, "step": 208000 }, { - "epoch": 1.3557448468691073, - "grad_norm": 0.58984375, - "learning_rate": 0.001945770206125236, - "loss": 1.5349, + "epoch": 21.691635455680398, + "grad_norm": 0.1796875, + "learning_rate": 0.001132334581772784, + "loss": 1.2772, "step": 208500 }, { - "epoch": 
1.3589960335522466, - "grad_norm": 0.77734375, - "learning_rate": 0.0019456401586579102, - "loss": 1.5313, + "epoch": 21.743653766125675, + "grad_norm": 0.30859375, + "learning_rate": 0.001130253849354973, + "loss": 1.2757, "step": 209000 }, { - "epoch": 1.362247220235386, - "grad_norm": 0.67578125, - "learning_rate": 0.0019455101111905846, - "loss": 1.5399, + "epoch": 21.795672076570952, + "grad_norm": 2.46875, + "learning_rate": 0.0011281731169371619, + "loss": 1.2767, "step": 209500 }, { - "epoch": 1.3654984069185252, - "grad_norm": 0.671875, - "learning_rate": 0.0019453800637232591, - "loss": 1.5343, + "epoch": 21.84769038701623, + "grad_norm": 0.2431640625, + "learning_rate": 0.001126092384519351, + "loss": 1.2741, "step": 210000 }, { - "epoch": 1.3687495936016645, - "grad_norm": 1.2421875, - "learning_rate": 0.0019452500162559334, - "loss": 1.5308, + "epoch": 21.899708697461506, + "grad_norm": 0.94921875, + "learning_rate": 0.0011240116521015397, + "loss": 1.277, "step": 210500 }, { - "epoch": 1.372000780284804, - "grad_norm": 0.65625, - "learning_rate": 0.0019451199687886079, - "loss": 1.5316, + "epoch": 21.951727007906783, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011219309196837286, + "loss": 1.2761, "step": 211000 }, { - "epoch": 1.3752519669679433, - "grad_norm": 0.7265625, - "learning_rate": 0.0019449899213212823, - "loss": 1.5338, + "epoch": 22.0, + "eval_loss": 1.309814453125, + "eval_runtime": 1.7271, + "eval_samples_per_second": 578.995, + "eval_steps_per_second": 0.579, + "step": 211464 + }, + { + "epoch": 22.00374531835206, + "grad_norm": 0.212890625, + "learning_rate": 0.0011198501872659178, + "loss": 1.2756, "step": 211500 }, { - "epoch": 1.3785031536510828, - "grad_norm": 0.6953125, - "learning_rate": 0.0019448598738539566, - "loss": 1.5312, + "epoch": 22.055763628797337, + "grad_norm": 0.259765625, + "learning_rate": 0.0011177694548481065, + "loss": 1.2751, "step": 212000 }, { - "epoch": 1.381754340334222, - "grad_norm": 0.75, - "learning_rate": 0.001944729826386631, - "loss": 1.5325, + "epoch": 22.107781939242614, + "grad_norm": 0.181640625, + "learning_rate": 0.0011156887224302954, + "loss": 1.2746, "step": 212500 }, { - "epoch": 1.3850055270173613, - "grad_norm": 0.80078125, - "learning_rate": 0.0019445997789193056, - "loss": 1.5329, + "epoch": 22.15980024968789, + "grad_norm": 0.279296875, + "learning_rate": 0.0011136079900124845, + "loss": 1.2754, "step": 213000 }, { - "epoch": 1.3882567137005006, - "grad_norm": 0.64453125, - "learning_rate": 0.0019444697314519798, - "loss": 1.5323, + "epoch": 22.211818560133167, + "grad_norm": 0.259765625, + "learning_rate": 0.0011115272575946734, + "loss": 1.2767, "step": 213500 }, { - "epoch": 1.39150790038364, - "grad_norm": 0.87890625, - "learning_rate": 0.0019443396839846543, - "loss": 1.5399, + "epoch": 22.263836870578444, + "grad_norm": 0.2392578125, + "learning_rate": 0.0011094465251768621, + "loss": 1.2737, "step": 214000 }, { - "epoch": 1.3947590870667794, - "grad_norm": 0.91015625, - "learning_rate": 0.001944209636517329, - "loss": 1.5351, + "epoch": 22.31585518102372, + "grad_norm": 0.2734375, + "learning_rate": 0.0011073657927590512, + "loss": 1.2737, "step": 214500 }, { - "epoch": 1.3980102737499187, - "grad_norm": 0.734375, - "learning_rate": 0.0019440795890500035, - "loss": 1.5353, + "epoch": 22.367873491469, + "grad_norm": 0.33984375, + "learning_rate": 0.0011052850603412402, + "loss": 1.2735, "step": 215000 }, { - "epoch": 1.401261460433058, - "grad_norm": 0.578125, - "learning_rate": 
0.0019439495415826777, - "loss": 1.5367, + "epoch": 22.419891801914275, + "grad_norm": 0.251953125, + "learning_rate": 0.001103204327923429, + "loss": 1.2733, "step": 215500 }, { - "epoch": 1.4045126471161975, - "grad_norm": 0.74609375, - "learning_rate": 0.0019438194941153522, - "loss": 1.5354, + "epoch": 22.471910112359552, + "grad_norm": 0.373046875, + "learning_rate": 0.001101123595505618, + "loss": 1.2723, "step": 216000 }, { - "epoch": 1.4077638337993368, - "grad_norm": 0.7890625, - "learning_rate": 0.0019436894466480267, - "loss": 1.5261, + "epoch": 22.52392842280483, + "grad_norm": 0.2216796875, + "learning_rate": 0.001099042863087807, + "loss": 1.274, "step": 216500 }, { - "epoch": 1.411015020482476, - "grad_norm": 0.9140625, - "learning_rate": 0.001943559399180701, - "loss": 1.5305, + "epoch": 22.575946733250102, + "grad_norm": 0.267578125, + "learning_rate": 0.001096962130669996, + "loss": 1.2744, "step": 217000 }, { - "epoch": 1.4142662071656154, - "grad_norm": 0.59375, - "learning_rate": 0.0019434293517133754, - "loss": 1.5362, + "epoch": 22.62796504369538, + "grad_norm": 0.1953125, + "learning_rate": 0.0010948813982521847, + "loss": 1.2758, "step": 217500 }, { - "epoch": 1.4175173938487549, - "grad_norm": 0.98828125, - "learning_rate": 0.00194329930424605, - "loss": 1.5272, + "epoch": 22.679983354140656, + "grad_norm": 0.28125, + "learning_rate": 0.0010928006658343736, + "loss": 1.2755, "step": 218000 }, { - "epoch": 1.4207685805318941, - "grad_norm": 0.6328125, - "learning_rate": 0.0019431692567787242, - "loss": 1.5304, + "epoch": 22.732001664585933, + "grad_norm": 0.23828125, + "learning_rate": 0.0010907199334165628, + "loss": 1.2742, "step": 218500 }, { - "epoch": 1.4240197672150334, - "grad_norm": 0.640625, - "learning_rate": 0.0019430392093113986, - "loss": 1.5318, + "epoch": 22.78401997503121, + "grad_norm": 0.2109375, + "learning_rate": 0.0010886392009987517, + "loss": 1.2741, "step": 219000 }, { - "epoch": 1.427270953898173, - "grad_norm": 0.80078125, - "learning_rate": 0.0019429091618440731, - "loss": 1.5312, + "epoch": 22.836038285476487, + "grad_norm": 0.251953125, + "learning_rate": 0.0010865584685809404, + "loss": 1.2739, "step": 219500 }, { - "epoch": 1.4305221405813122, - "grad_norm": 0.91015625, - "learning_rate": 0.0019427791143767474, - "loss": 1.5338, + "epoch": 22.888056595921764, + "grad_norm": 0.166015625, + "learning_rate": 0.0010844777361631295, + "loss": 1.2747, "step": 220000 }, { - "epoch": 1.4337733272644515, - "grad_norm": 0.609375, - "learning_rate": 0.0019426490669094219, - "loss": 1.5308, + "epoch": 22.94007490636704, + "grad_norm": 0.216796875, + "learning_rate": 0.0010823970037453184, + "loss": 1.2733, "step": 220500 }, { - "epoch": 1.4370245139475908, - "grad_norm": 0.76953125, - "learning_rate": 0.0019425190194420963, - "loss": 1.5243, + "epoch": 22.992093216812318, + "grad_norm": 0.197265625, + "learning_rate": 0.0010803162713275071, + "loss": 1.2734, "step": 221000 }, { - "epoch": 1.4402757006307303, - "grad_norm": 0.84765625, - "learning_rate": 0.001942388971974771, - "loss": 1.528, + "epoch": 23.0, + "eval_loss": 1.3059455156326294, + "eval_runtime": 1.6804, + "eval_samples_per_second": 595.106, + "eval_steps_per_second": 0.595, + "step": 221076 + }, + { + "epoch": 23.044111527257595, + "grad_norm": 0.2060546875, + "learning_rate": 0.0010782355389096963, + "loss": 1.2717, "step": 221500 }, { - "epoch": 1.4435268873138696, - "grad_norm": 0.625, - "learning_rate": 0.0019422589245074453, - "loss": 1.5385, + "epoch": 23.096129837702872, + 
"grad_norm": 0.2021484375, + "learning_rate": 0.0010761548064918852, + "loss": 1.2719, "step": 222000 }, { - "epoch": 1.4467780739970089, - "grad_norm": 0.73046875, - "learning_rate": 0.0019421288770401198, - "loss": 1.5363, + "epoch": 23.14814814814815, + "grad_norm": 0.1796875, + "learning_rate": 0.0010740740740740743, + "loss": 1.2732, "step": 222500 }, { - "epoch": 1.4500292606801484, - "grad_norm": 6.3125, - "learning_rate": 0.0019419988295727943, - "loss": 1.528, + "epoch": 23.200166458593426, + "grad_norm": 0.318359375, + "learning_rate": 0.001071993341656263, + "loss": 1.2743, "step": 223000 }, { - "epoch": 1.4532804473632877, - "grad_norm": 0.88671875, - "learning_rate": 0.0019418687821054685, - "loss": 1.5238, + "epoch": 23.252184769038703, + "grad_norm": 0.87890625, + "learning_rate": 0.001069912609238452, + "loss": 1.2724, "step": 223500 }, { - "epoch": 1.456531634046427, - "grad_norm": 1.953125, - "learning_rate": 0.001941738734638143, - "loss": 1.5261, + "epoch": 23.30420307948398, + "grad_norm": 0.423828125, + "learning_rate": 0.001067831876820641, + "loss": 1.271, "step": 224000 }, { - "epoch": 1.4597828207295662, - "grad_norm": 0.68359375, - "learning_rate": 0.0019416086871708175, - "loss": 1.5223, + "epoch": 23.356221389929257, + "grad_norm": 0.25390625, + "learning_rate": 0.0010657511444028297, + "loss": 1.2721, "step": 224500 }, { - "epoch": 1.4630340074127055, - "grad_norm": 1.796875, - "learning_rate": 0.0019414786397034917, - "loss": 1.5345, + "epoch": 23.408239700374533, + "grad_norm": 0.1962890625, + "learning_rate": 0.0010636704119850186, + "loss": 1.2716, "step": 225000 }, { - "epoch": 1.466285194095845, - "grad_norm": 0.640625, - "learning_rate": 0.0019413485922361662, - "loss": 1.5254, + "epoch": 23.460258010819807, + "grad_norm": 0.294921875, + "learning_rate": 0.0010615896795672078, + "loss": 1.2714, "step": 225500 }, { - "epoch": 1.4695363807789843, - "grad_norm": 0.59765625, - "learning_rate": 0.0019412185447688407, - "loss": 1.5274, + "epoch": 23.512276321265084, + "grad_norm": 0.29296875, + "learning_rate": 0.0010595089471493967, + "loss": 1.2719, "step": 226000 }, { - "epoch": 1.4727875674621238, - "grad_norm": 0.734375, - "learning_rate": 0.001941088497301515, - "loss": 1.5266, + "epoch": 23.56429463171036, + "grad_norm": 0.189453125, + "learning_rate": 0.0010574282147315854, + "loss": 1.2716, "step": 226500 }, { - "epoch": 1.476038754145263, - "grad_norm": 0.765625, - "learning_rate": 0.0019409584498341894, - "loss": 1.5234, + "epoch": 23.616312942155638, + "grad_norm": 0.2138671875, + "learning_rate": 0.0010553474823137745, + "loss": 1.272, "step": 227000 }, { - "epoch": 1.4792899408284024, - "grad_norm": 0.8125, - "learning_rate": 0.001940828402366864, - "loss": 1.5274, + "epoch": 23.668331252600915, + "grad_norm": 0.890625, + "learning_rate": 0.0010532667498959634, + "loss": 1.2729, "step": 227500 }, { - "epoch": 1.4825411275115417, - "grad_norm": 0.69140625, - "learning_rate": 0.0019406983548995382, - "loss": 1.5308, + "epoch": 23.72034956304619, + "grad_norm": 0.2470703125, + "learning_rate": 0.0010511860174781521, + "loss": 1.2728, "step": 228000 }, { - "epoch": 1.485792314194681, - "grad_norm": 1.0859375, - "learning_rate": 0.0019405683074322127, - "loss": 1.5266, + "epoch": 23.77236787349147, + "grad_norm": 0.251953125, + "learning_rate": 0.0010491052850603413, + "loss": 1.2733, "step": 228500 }, { - "epoch": 1.4890435008778204, - "grad_norm": 0.62890625, - "learning_rate": 0.0019404382599648873, - "loss": 1.5297, + "epoch": 23.824386183936745, 
+ "grad_norm": 0.1806640625, + "learning_rate": 0.0010470245526425302, + "loss": 1.2713, "step": 229000 }, { - "epoch": 1.4922946875609597, - "grad_norm": 1.078125, - "learning_rate": 0.0019403082124975618, - "loss": 1.5334, + "epoch": 23.876404494382022, + "grad_norm": 0.40234375, + "learning_rate": 0.0010449438202247193, + "loss": 1.2714, "step": 229500 }, { - "epoch": 1.495545874244099, - "grad_norm": 0.58984375, - "learning_rate": 0.001940178165030236, - "loss": 1.5235, + "epoch": 23.9284228048273, + "grad_norm": 0.30078125, + "learning_rate": 0.001042863087806908, + "loss": 1.2724, "step": 230000 }, { - "epoch": 1.4987970609272385, - "grad_norm": 0.8515625, - "learning_rate": 0.0019400481175629106, - "loss": 1.52, + "epoch": 23.980441115272576, + "grad_norm": 0.2197265625, + "learning_rate": 0.001040782355389097, + "loss": 1.2716, "step": 230500 }, { - "epoch": 1.5020482476103778, - "grad_norm": 0.78515625, - "learning_rate": 0.001939918070095585, - "loss": 1.5206, - "step": 231000 - }, - { - "epoch": 1.505299434293517, - "grad_norm": 1.3515625, - "learning_rate": 0.0019397880226282593, - "loss": 1.5168, - "step": 231500 - }, - { - "epoch": 1.5085506209766564, - "grad_norm": 2.796875, - "learning_rate": 0.0019396579751609338, - "loss": 1.5241, - "step": 232000 - }, - { - "epoch": 1.5118018076597957, - "grad_norm": 0.859375, - "learning_rate": 0.0019395279276936083, - "loss": 1.5297, - "step": 232500 - }, - { - "epoch": 1.5150529943429352, - "grad_norm": 0.71875, - "learning_rate": 0.0019393978802262825, - "loss": 1.526, - "step": 233000 - }, - { - "epoch": 1.5183041810260747, - "grad_norm": 0.90234375, - "learning_rate": 0.001939267832758957, - "loss": 1.5229, - "step": 233500 - }, - { - "epoch": 1.521555367709214, - "grad_norm": 1.609375, - "learning_rate": 0.0019391377852916315, - "loss": 1.5213, - "step": 234000 - }, - { - "epoch": 1.5248065543923532, - "grad_norm": 0.81640625, - "learning_rate": 0.0019390077378243057, - "loss": 1.5181, - "step": 234500 - }, - { - "epoch": 1.5280577410754925, - "grad_norm": 0.88671875, - "learning_rate": 0.0019388776903569802, - "loss": 1.5245, - "step": 235000 - }, - { - "epoch": 1.5313089277586318, - "grad_norm": 0.7265625, - "learning_rate": 0.0019387476428896547, - "loss": 1.5236, - "step": 235500 - }, - { - "epoch": 1.534560114441771, - "grad_norm": 0.77734375, - "learning_rate": 0.0019386175954223294, - "loss": 1.5121, - "step": 236000 - }, - { - "epoch": 1.5378113011249106, - "grad_norm": 2.265625, - "learning_rate": 0.0019384875479550037, - "loss": 1.5178, - "step": 236500 - }, - { - "epoch": 1.5410624878080499, - "grad_norm": 1.0859375, - "learning_rate": 0.0019383575004876781, - "loss": 1.5149, - "step": 237000 - }, - { - "epoch": 1.5443136744911894, - "grad_norm": 1.640625, - "learning_rate": 0.0019382274530203526, - "loss": 1.5149, - "step": 237500 - }, - { - "epoch": 1.5475648611743287, - "grad_norm": 1.359375, - "learning_rate": 0.0019380974055530269, - "loss": 1.5182, - "step": 238000 - }, - { - "epoch": 1.550816047857468, - "grad_norm": 1.453125, - "learning_rate": 0.0019379673580857014, - "loss": 1.5145, - "step": 238500 - }, - { - "epoch": 1.5540672345406072, - "grad_norm": 0.79296875, - "learning_rate": 0.0019378373106183758, - "loss": 1.5163, - "step": 239000 - }, - { - "epoch": 1.5573184212237465, - "grad_norm": 1.0078125, - "learning_rate": 0.00193770726315105, - "loss": 1.5128, - "step": 239500 - }, - { - "epoch": 1.560569607906886, - "grad_norm": 0.69140625, - "learning_rate": 0.0019375772156837246, - "loss": 1.5102, - 
"step": 240000 - }, - { - "epoch": 1.5638207945900253, - "grad_norm": 0.6875, - "learning_rate": 0.001937447168216399, - "loss": 1.5091, - "step": 240500 - }, - { - "epoch": 1.5670719812731648, - "grad_norm": 0.90625, - "learning_rate": 0.0019373171207490733, - "loss": 1.5095, - "step": 241000 - }, - { - "epoch": 1.5703231679563041, - "grad_norm": 0.734375, - "learning_rate": 0.0019371870732817478, - "loss": 1.5146, - "step": 241500 - }, - { - "epoch": 1.5735743546394434, - "grad_norm": 0.71875, - "learning_rate": 0.0019370570258144223, - "loss": 1.5162, - "step": 242000 - }, - { - "epoch": 1.5768255413225827, - "grad_norm": 0.96875, - "learning_rate": 0.0019369269783470965, - "loss": 1.5214, - "step": 242500 - }, - { - "epoch": 1.580076728005722, - "grad_norm": 1.078125, - "learning_rate": 0.001936796930879771, - "loss": 1.5162, - "step": 243000 - }, - { - "epoch": 1.5833279146888615, - "grad_norm": 0.6328125, - "learning_rate": 0.0019366668834124457, - "loss": 1.5258, - "step": 243500 - }, - { - "epoch": 1.5865791013720008, - "grad_norm": 0.765625, - "learning_rate": 0.0019365368359451202, - "loss": 1.5242, - "step": 244000 - }, - { - "epoch": 1.5898302880551403, - "grad_norm": 0.97265625, - "learning_rate": 0.0019364067884777944, - "loss": 1.5162, - "step": 244500 - }, - { - "epoch": 1.5930814747382795, - "grad_norm": 0.703125, - "learning_rate": 0.001936276741010469, - "loss": 1.5204, - "step": 245000 - }, - { - "epoch": 1.5963326614214188, - "grad_norm": 0.6875, - "learning_rate": 0.0019361466935431434, - "loss": 1.5206, - "step": 245500 - }, - { - "epoch": 1.5995838481045581, - "grad_norm": 1.0078125, - "learning_rate": 0.0019360166460758177, - "loss": 1.5266, - "step": 246000 - }, - { - "epoch": 1.6028350347876974, - "grad_norm": 1.8828125, - "learning_rate": 0.0019358865986084921, - "loss": 1.5218, - "step": 246500 - }, - { - "epoch": 1.606086221470837, - "grad_norm": 1.0234375, - "learning_rate": 0.0019357565511411666, - "loss": 1.5187, - "step": 247000 - }, - { - "epoch": 1.6093374081539762, - "grad_norm": 1.59375, - "learning_rate": 0.0019356265036738409, - "loss": 1.527, - "step": 247500 - }, - { - "epoch": 1.6125885948371157, - "grad_norm": 0.8984375, - "learning_rate": 0.0019354964562065154, - "loss": 1.519, - "step": 248000 - }, - { - "epoch": 1.615839781520255, - "grad_norm": 0.8125, - "learning_rate": 0.0019353664087391898, - "loss": 1.5178, - "step": 248500 - }, - { - "epoch": 1.6190909682033943, - "grad_norm": 0.953125, - "learning_rate": 0.001935236361271864, - "loss": 1.5131, - "step": 249000 - }, - { - "epoch": 1.6223421548865335, - "grad_norm": 1.0390625, - "learning_rate": 0.0019351063138045386, - "loss": 1.5118, - "step": 249500 - }, - { - "epoch": 1.6255933415696728, - "grad_norm": 0.64453125, - "learning_rate": 0.001934976266337213, - "loss": 1.5144, - "step": 250000 - }, - { - "epoch": 1.6288445282528121, - "grad_norm": 0.7109375, - "learning_rate": 0.0019348462188698878, - "loss": 1.5197, - "step": 250500 - }, - { - "epoch": 1.6320957149359516, - "grad_norm": 0.79296875, - "learning_rate": 0.001934716171402562, - "loss": 1.5172, - "step": 251000 - }, - { - "epoch": 1.6353469016190911, - "grad_norm": 1.984375, - "learning_rate": 0.0019345861239352365, - "loss": 1.5207, - "step": 251500 - }, - { - "epoch": 1.6385980883022304, - "grad_norm": 0.703125, - "learning_rate": 0.001934456076467911, - "loss": 1.5156, - "step": 252000 - }, - { - "epoch": 1.6418492749853697, - "grad_norm": 2.71875, - "learning_rate": 0.0019343260290005852, - "loss": 1.5161, - "step": 252500 
- }, - { - "epoch": 1.645100461668509, - "grad_norm": 1.1015625, - "learning_rate": 0.0019341959815332597, - "loss": 1.5193, - "step": 253000 - }, - { - "epoch": 1.6483516483516483, - "grad_norm": 33.5, - "learning_rate": 0.0019340659340659342, - "loss": 1.5211, - "step": 253500 - }, - { - "epoch": 1.6516028350347876, - "grad_norm": 1.125, - "learning_rate": 0.0019339358865986085, - "loss": 1.5132, - "step": 254000 - }, - { - "epoch": 1.654854021717927, - "grad_norm": 0.953125, - "learning_rate": 0.001933805839131283, - "loss": 1.5106, - "step": 254500 - }, - { - "epoch": 1.6581052084010663, - "grad_norm": 0.640625, - "learning_rate": 0.0019336757916639574, - "loss": 1.5101, - "step": 255000 - }, - { - "epoch": 1.6613563950842059, - "grad_norm": 27.125, - "learning_rate": 0.0019335457441966317, - "loss": 1.5107, - "step": 255500 - }, - { - "epoch": 1.6646075817673451, - "grad_norm": 0.94921875, - "learning_rate": 0.0019334156967293062, - "loss": 1.5099, - "step": 256000 - }, - { - "epoch": 1.6678587684504844, - "grad_norm": 0.60546875, - "learning_rate": 0.0019332856492619806, - "loss": 1.5145, - "step": 256500 - }, - { - "epoch": 1.6711099551336237, - "grad_norm": 0.6015625, - "learning_rate": 0.001933155601794655, - "loss": 1.5125, - "step": 257000 - }, - { - "epoch": 1.674361141816763, - "grad_norm": 0.578125, - "learning_rate": 0.0019330255543273294, - "loss": 1.5107, - "step": 257500 - }, - { - "epoch": 1.6776123284999025, - "grad_norm": 1.4296875, - "learning_rate": 0.001932895506860004, - "loss": 1.5076, - "step": 258000 - }, - { - "epoch": 1.6808635151830418, - "grad_norm": 0.6796875, - "learning_rate": 0.0019327654593926785, - "loss": 1.5095, - "step": 258500 - }, - { - "epoch": 1.6841147018661813, - "grad_norm": 0.73046875, - "learning_rate": 0.0019326354119253528, - "loss": 1.5088, - "step": 259000 - }, - { - "epoch": 1.6873658885493206, - "grad_norm": 0.56640625, - "learning_rate": 0.0019325053644580273, - "loss": 1.5099, - "step": 259500 - }, - { - "epoch": 1.6906170752324599, - "grad_norm": 0.6953125, - "learning_rate": 0.0019323753169907018, - "loss": 1.5095, - "step": 260000 - }, - { - "epoch": 1.6938682619155991, - "grad_norm": 1.640625, - "learning_rate": 0.001932245269523376, - "loss": 1.514, - "step": 260500 - }, - { - "epoch": 1.6971194485987384, - "grad_norm": 1.953125, - "learning_rate": 0.0019321152220560505, - "loss": 1.5228, - "step": 261000 - }, - { - "epoch": 1.700370635281878, - "grad_norm": 0.671875, - "learning_rate": 0.001931985174588725, - "loss": 1.5193, - "step": 261500 - }, - { - "epoch": 1.7036218219650172, - "grad_norm": 1.0703125, - "learning_rate": 0.0019318551271213992, - "loss": 1.5193, - "step": 262000 - }, - { - "epoch": 1.7068730086481567, - "grad_norm": 0.67578125, - "learning_rate": 0.0019317250796540737, - "loss": 1.5236, - "step": 262500 - }, - { - "epoch": 1.710124195331296, - "grad_norm": 1.3671875, - "learning_rate": 0.0019315950321867482, - "loss": 1.51, - "step": 263000 - }, - { - "epoch": 1.7133753820144353, - "grad_norm": 0.88671875, - "learning_rate": 0.0019314649847194225, - "loss": 1.5129, - "step": 263500 - }, - { - "epoch": 1.7166265686975746, - "grad_norm": 0.7109375, - "learning_rate": 0.001931334937252097, - "loss": 1.5101, - "step": 264000 - }, - { - "epoch": 1.7198777553807139, - "grad_norm": 0.63671875, - "learning_rate": 0.0019312048897847714, - "loss": 1.5112, - "step": 264500 - }, - { - "epoch": 1.7231289420638531, - "grad_norm": 0.8046875, - "learning_rate": 0.0019310748423174461, - "loss": 1.5131, - "step": 265000 - }, 
- { - "epoch": 1.7263801287469926, - "grad_norm": 0.6328125, - "learning_rate": 0.0019309447948501204, - "loss": 1.5135, - "step": 265500 - }, - { - "epoch": 1.7296313154301322, - "grad_norm": 0.73828125, - "learning_rate": 0.0019308147473827949, - "loss": 1.5105, - "step": 266000 - }, - { - "epoch": 1.7328825021132714, - "grad_norm": 0.67578125, - "learning_rate": 0.0019306846999154693, - "loss": 1.5133, - "step": 266500 - }, - { - "epoch": 1.7361336887964107, - "grad_norm": 0.89453125, - "learning_rate": 0.0019305546524481436, - "loss": 1.5098, - "step": 267000 - }, - { - "epoch": 1.73938487547955, - "grad_norm": 5.84375, - "learning_rate": 0.001930424604980818, - "loss": 1.5059, - "step": 267500 - }, - { - "epoch": 1.7426360621626893, - "grad_norm": 0.65234375, - "learning_rate": 0.0019302945575134926, - "loss": 1.514, - "step": 268000 - }, - { - "epoch": 1.7458872488458286, - "grad_norm": 0.78515625, - "learning_rate": 0.0019301645100461668, - "loss": 1.4967, - "step": 268500 - }, - { - "epoch": 1.749138435528968, - "grad_norm": 0.83984375, - "learning_rate": 0.0019300344625788413, - "loss": 1.505, - "step": 269000 - }, - { - "epoch": 1.7523896222121074, - "grad_norm": 1.640625, - "learning_rate": 0.0019299044151115158, - "loss": 1.5132, - "step": 269500 - }, - { - "epoch": 1.7556408088952469, - "grad_norm": 0.80859375, - "learning_rate": 0.00192977436764419, - "loss": 1.5124, - "step": 270000 - }, - { - "epoch": 1.7588919955783862, - "grad_norm": 0.6484375, - "learning_rate": 0.0019296443201768645, - "loss": 1.509, - "step": 270500 - }, - { - "epoch": 1.7621431822615254, - "grad_norm": 0.74609375, - "learning_rate": 0.001929514272709539, - "loss": 1.5049, - "step": 271000 - }, - { - "epoch": 1.7653943689446647, - "grad_norm": 2.359375, - "learning_rate": 0.0019293842252422133, - "loss": 1.5067, - "step": 271500 - }, - { - "epoch": 1.768645555627804, - "grad_norm": 0.81640625, - "learning_rate": 0.0019292541777748877, - "loss": 1.5076, - "step": 272000 - }, - { - "epoch": 1.7718967423109435, - "grad_norm": 0.90234375, - "learning_rate": 0.0019291241303075624, - "loss": 1.5, - "step": 272500 - }, - { - "epoch": 1.7751479289940828, - "grad_norm": 1.484375, - "learning_rate": 0.001928994082840237, - "loss": 1.5139, - "step": 273000 - }, - { - "epoch": 1.7783991156772223, - "grad_norm": 0.875, - "learning_rate": 0.0019288640353729112, - "loss": 1.5092, - "step": 273500 - }, - { - "epoch": 1.7816503023603616, - "grad_norm": 4.34375, - "learning_rate": 0.0019287339879055856, - "loss": 1.5087, - "step": 274000 - }, - { - "epoch": 1.7849014890435009, - "grad_norm": 1.2265625, - "learning_rate": 0.0019286039404382601, - "loss": 1.5178, - "step": 274500 - }, - { - "epoch": 1.7881526757266402, - "grad_norm": 0.73828125, - "learning_rate": 0.0019284738929709344, - "loss": 1.5253, - "step": 275000 - }, - { - "epoch": 1.7914038624097794, - "grad_norm": 2.140625, - "learning_rate": 0.0019283438455036089, - "loss": 1.5293, - "step": 275500 - }, - { - "epoch": 1.794655049092919, - "grad_norm": 1.59375, - "learning_rate": 0.0019282137980362833, - "loss": 1.528, - "step": 276000 - }, - { - "epoch": 1.7979062357760582, - "grad_norm": 0.67578125, - "learning_rate": 0.0019280837505689576, - "loss": 1.5178, - "step": 276500 - }, - { - "epoch": 1.8011574224591977, - "grad_norm": 0.62109375, - "learning_rate": 0.001927953703101632, - "loss": 1.5362, - "step": 277000 - }, - { - "epoch": 1.804408609142337, - "grad_norm": 0.671875, - "learning_rate": 0.0019278236556343066, - "loss": 1.5321, - "step": 277500 - }, 
- { - "epoch": 1.8076597958254763, - "grad_norm": 1.53125, - "learning_rate": 0.0019276936081669808, - "loss": 1.5199, - "step": 278000 - }, - { - "epoch": 1.8109109825086156, - "grad_norm": 3.921875, - "learning_rate": 0.0019275635606996553, - "loss": 1.52, - "step": 278500 - }, - { - "epoch": 1.8141621691917549, - "grad_norm": 0.9375, - "learning_rate": 0.0019274335132323298, - "loss": 1.5242, - "step": 279000 - }, - { - "epoch": 1.8174133558748944, - "grad_norm": 2.125, - "learning_rate": 0.0019273034657650045, - "loss": 1.5241, - "step": 279500 - }, - { - "epoch": 1.8206645425580337, - "grad_norm": 0.65625, - "learning_rate": 0.0019271734182976787, - "loss": 1.5294, - "step": 280000 - }, - { - "epoch": 1.8239157292411732, - "grad_norm": 0.63671875, - "learning_rate": 0.0019270433708303532, - "loss": 1.5173, - "step": 280500 - }, - { - "epoch": 1.8271669159243125, - "grad_norm": 2.046875, - "learning_rate": 0.0019269133233630277, - "loss": 1.5224, - "step": 281000 - }, - { - "epoch": 1.8304181026074517, - "grad_norm": 0.96875, - "learning_rate": 0.001926783275895702, - "loss": 1.5304, - "step": 281500 - }, - { - "epoch": 1.833669289290591, - "grad_norm": 0.69140625, - "learning_rate": 0.0019266532284283764, - "loss": 1.5289, - "step": 282000 - }, - { - "epoch": 1.8369204759737303, - "grad_norm": 0.8203125, - "learning_rate": 0.001926523180961051, - "loss": 1.5198, - "step": 282500 - }, - { - "epoch": 1.8401716626568696, - "grad_norm": 0.703125, - "learning_rate": 0.0019263931334937252, - "loss": 1.5292, - "step": 283000 - }, - { - "epoch": 1.843422849340009, - "grad_norm": 0.83984375, - "learning_rate": 0.0019262630860263997, - "loss": 1.5336, - "step": 283500 - }, - { - "epoch": 1.8466740360231486, - "grad_norm": 3.90625, - "learning_rate": 0.0019261330385590741, - "loss": 1.5415, - "step": 284000 - }, - { - "epoch": 1.849925222706288, - "grad_norm": 0.77734375, - "learning_rate": 0.0019260029910917484, - "loss": 1.5339, - "step": 284500 - }, - { - "epoch": 1.8531764093894272, - "grad_norm": 1.3984375, - "learning_rate": 0.0019258729436244229, - "loss": 1.5423, - "step": 285000 - }, - { - "epoch": 1.8564275960725665, - "grad_norm": 1.1875, - "learning_rate": 0.0019257428961570973, - "loss": 1.5517, - "step": 285500 - }, - { - "epoch": 1.8596787827557058, - "grad_norm": 0.8359375, - "learning_rate": 0.0019256128486897716, - "loss": 1.5582, - "step": 286000 - }, - { - "epoch": 1.862929969438845, - "grad_norm": 1.0390625, - "learning_rate": 0.001925482801222446, - "loss": 1.5598, - "step": 286500 - }, - { - "epoch": 1.8661811561219845, - "grad_norm": 0.6171875, - "learning_rate": 0.0019253527537551208, - "loss": 1.5541, - "step": 287000 - }, - { - "epoch": 1.8694323428051238, - "grad_norm": 1.1328125, - "learning_rate": 0.0019252227062877953, - "loss": 1.551, - "step": 287500 - }, - { - "epoch": 1.8726835294882633, - "grad_norm": 0.625, - "learning_rate": 0.0019250926588204695, - "loss": 1.5573, - "step": 288000 - }, - { - "epoch": 1.8759347161714026, - "grad_norm": 1.1953125, - "learning_rate": 0.001924962611353144, - "loss": 1.5485, - "step": 288500 - }, - { - "epoch": 1.879185902854542, - "grad_norm": 0.7578125, - "learning_rate": 0.0019248325638858185, - "loss": 1.5529, - "step": 289000 - }, - { - "epoch": 1.8824370895376812, - "grad_norm": 0.8359375, - "learning_rate": 0.0019247025164184927, - "loss": 1.5538, - "step": 289500 - }, - { - "epoch": 1.8856882762208205, - "grad_norm": 0.87109375, - "learning_rate": 0.0019245724689511672, - "loss": 1.5524, - "step": 290000 - }, - { - 
"epoch": 1.88893946290396, - "grad_norm": 0.8515625, - "learning_rate": 0.0019244424214838417, - "loss": 1.5582, - "step": 290500 - }, - { - "epoch": 1.8921906495870993, - "grad_norm": 2.21875, - "learning_rate": 0.001924312374016516, - "loss": 1.5682, - "step": 291000 - }, - { - "epoch": 1.8954418362702388, - "grad_norm": 0.86328125, - "learning_rate": 0.0019241823265491904, - "loss": 1.5639, - "step": 291500 - }, - { - "epoch": 1.898693022953378, - "grad_norm": 0.7890625, - "learning_rate": 0.001924052279081865, - "loss": 1.5732, - "step": 292000 - }, - { - "epoch": 1.9019442096365173, - "grad_norm": 0.73828125, - "learning_rate": 0.0019239222316145392, - "loss": 1.5634, - "step": 292500 - }, - { - "epoch": 1.9051953963196566, - "grad_norm": 0.55859375, - "learning_rate": 0.0019237921841472137, - "loss": 1.5599, - "step": 293000 - }, - { - "epoch": 1.908446583002796, - "grad_norm": 0.6171875, - "learning_rate": 0.0019236621366798881, - "loss": 1.5507, - "step": 293500 - }, - { - "epoch": 1.9116977696859354, - "grad_norm": 0.74609375, - "learning_rate": 0.0019235320892125628, - "loss": 1.5467, - "step": 294000 - }, - { - "epoch": 1.9149489563690747, - "grad_norm": 0.62109375, - "learning_rate": 0.001923402041745237, - "loss": 1.5465, - "step": 294500 - }, - { - "epoch": 1.9182001430522142, - "grad_norm": 0.83203125, - "learning_rate": 0.0019232719942779116, - "loss": 1.5408, - "step": 295000 - }, - { - "epoch": 1.9214513297353535, - "grad_norm": 0.76171875, - "learning_rate": 0.001923141946810586, - "loss": 1.5481, - "step": 295500 - }, - { - "epoch": 1.9247025164184928, - "grad_norm": 0.796875, - "learning_rate": 0.0019230118993432603, - "loss": 1.5488, - "step": 296000 - }, - { - "epoch": 1.927953703101632, - "grad_norm": 0.828125, - "learning_rate": 0.0019228818518759348, - "loss": 1.5443, - "step": 296500 - }, - { - "epoch": 1.9312048897847713, - "grad_norm": 0.94921875, - "learning_rate": 0.0019227518044086093, - "loss": 1.5579, - "step": 297000 - }, - { - "epoch": 1.9344560764679106, - "grad_norm": 1.5078125, - "learning_rate": 0.0019226217569412835, - "loss": 1.5481, - "step": 297500 - }, - { - "epoch": 1.9377072631510501, - "grad_norm": 0.5859375, - "learning_rate": 0.001922491709473958, - "loss": 1.5384, - "step": 298000 - }, - { - "epoch": 1.9409584498341896, - "grad_norm": 1.3828125, - "learning_rate": 0.0019223616620066325, - "loss": 1.5387, - "step": 298500 - }, - { - "epoch": 1.944209636517329, - "grad_norm": 1.8125, - "learning_rate": 0.0019222316145393068, - "loss": 1.5439, - "step": 299000 - }, - { - "epoch": 1.9474608232004682, - "grad_norm": 0.640625, - "learning_rate": 0.0019221015670719812, - "loss": 1.5443, - "step": 299500 - }, - { - "epoch": 1.9507120098836075, - "grad_norm": 1.046875, - "learning_rate": 0.0019219715196046557, - "loss": 1.5366, - "step": 300000 - }, - { - "epoch": 1.9539631965667468, - "grad_norm": 0.734375, - "learning_rate": 0.00192184147213733, - "loss": 1.5391, - "step": 300500 - }, - { - "epoch": 1.957214383249886, - "grad_norm": 0.75390625, - "learning_rate": 0.0019217114246700044, - "loss": 1.5394, - "step": 301000 - }, - { - "epoch": 1.9604655699330256, - "grad_norm": 1.0703125, - "learning_rate": 0.0019215813772026791, - "loss": 1.5341, - "step": 301500 - }, - { - "epoch": 1.9637167566161648, - "grad_norm": 1.375, - "learning_rate": 0.0019214513297353536, - "loss": 1.5414, - "step": 302000 - }, - { - "epoch": 1.9669679432993044, - "grad_norm": 1.171875, - "learning_rate": 0.0019213212822680279, - "loss": 1.5436, - "step": 302500 - }, - { - 
"epoch": 1.9702191299824436, - "grad_norm": 1.2578125, - "learning_rate": 0.0019211912348007024, - "loss": 1.5284, - "step": 303000 - }, - { - "epoch": 1.973470316665583, - "grad_norm": 0.58984375, - "learning_rate": 0.0019210611873333768, - "loss": 1.5208, - "step": 303500 - }, - { - "epoch": 1.9767215033487222, - "grad_norm": 0.8984375, - "learning_rate": 0.001920931139866051, - "loss": 1.5234, - "step": 304000 - }, - { - "epoch": 1.9799726900318615, - "grad_norm": 0.91015625, - "learning_rate": 0.0019208010923987256, - "loss": 1.5299, - "step": 304500 - }, - { - "epoch": 1.983223876715001, - "grad_norm": 0.87109375, - "learning_rate": 0.0019206710449314, - "loss": 1.5367, - "step": 305000 - }, - { - "epoch": 1.9864750633981403, - "grad_norm": 1.0234375, - "learning_rate": 0.0019205409974640743, - "loss": 1.5328, - "step": 305500 - }, - { - "epoch": 1.9897262500812798, - "grad_norm": 0.74609375, - "learning_rate": 0.0019204109499967488, - "loss": 1.5326, - "step": 306000 - }, - { - "epoch": 1.992977436764419, - "grad_norm": 0.796875, - "learning_rate": 0.0019202809025294233, - "loss": 1.5357, - "step": 306500 - }, - { - "epoch": 1.9962286234475584, - "grad_norm": 0.78515625, - "learning_rate": 0.0019201508550620975, - "loss": 1.5266, - "step": 307000 - }, - { - "epoch": 1.9994798101306976, - "grad_norm": 5.8125, - "learning_rate": 0.001920020807594772, - "loss": 1.5319, - "step": 307500 - }, - { - "epoch": 2.0, - "eval_loss": 1.5122345685958862, - "eval_runtime": 0.5402, - "eval_samples_per_second": 1851.06, - "eval_steps_per_second": 29.617, - "step": 307580 - }, - { - "epoch": 2.002730996813837, - "grad_norm": 1.359375, - "learning_rate": 0.0019198907601274465, - "loss": 1.53, - "step": 308000 - }, - { - "epoch": 2.005982183496976, - "grad_norm": 0.74609375, - "learning_rate": 0.0019197607126601212, - "loss": 1.5226, - "step": 308500 - }, - { - "epoch": 2.009233370180116, - "grad_norm": 0.7734375, - "learning_rate": 0.0019196306651927955, - "loss": 1.5161, - "step": 309000 - }, - { - "epoch": 2.0124845568632552, - "grad_norm": 0.96484375, - "learning_rate": 0.00191950061772547, - "loss": 1.5224, - "step": 309500 - }, - { - "epoch": 2.0157357435463945, - "grad_norm": 0.66015625, - "learning_rate": 0.0019193705702581444, - "loss": 1.5189, - "step": 310000 - }, - { - "epoch": 2.018986930229534, - "grad_norm": 0.734375, - "learning_rate": 0.0019192405227908187, - "loss": 1.5231, - "step": 310500 - }, - { - "epoch": 2.022238116912673, - "grad_norm": 0.93359375, - "learning_rate": 0.0019191104753234932, - "loss": 1.5372, - "step": 311000 - }, - { - "epoch": 2.0254893035958124, - "grad_norm": 1.5859375, - "learning_rate": 0.0019189804278561676, - "loss": 1.5421, - "step": 311500 - }, - { - "epoch": 2.0287404902789516, - "grad_norm": 1.109375, - "learning_rate": 0.001918850380388842, - "loss": 1.5291, - "step": 312000 - }, - { - "epoch": 2.0319916769620914, - "grad_norm": 0.58984375, - "learning_rate": 0.0019187203329215164, - "loss": 1.534, - "step": 312500 - }, - { - "epoch": 2.0352428636452307, - "grad_norm": 2.078125, - "learning_rate": 0.0019185902854541908, - "loss": 1.5307, - "step": 313000 - }, - { - "epoch": 2.03849405032837, - "grad_norm": 0.58203125, - "learning_rate": 0.0019184602379868651, - "loss": 1.5325, - "step": 313500 - }, - { - "epoch": 2.0417452370115092, - "grad_norm": 0.94921875, - "learning_rate": 0.0019183301905195396, - "loss": 1.5389, - "step": 314000 - }, - { - "epoch": 2.0449964236946485, - "grad_norm": 0.69140625, - "learning_rate": 0.001918200143052214, - "loss": 
1.5328, - "step": 314500 - }, - { - "epoch": 2.048247610377788, - "grad_norm": 1.640625, - "learning_rate": 0.0019180700955848883, - "loss": 1.5415, - "step": 315000 - }, - { - "epoch": 2.051498797060927, - "grad_norm": 0.76953125, - "learning_rate": 0.0019179400481175628, - "loss": 1.5549, - "step": 315500 - }, - { - "epoch": 2.0547499837440664, - "grad_norm": 0.6328125, - "learning_rate": 0.0019178100006502375, - "loss": 1.5597, - "step": 316000 - }, - { - "epoch": 2.058001170427206, - "grad_norm": 0.6796875, - "learning_rate": 0.001917679953182912, - "loss": 1.5947, - "step": 316500 - }, - { - "epoch": 2.0612523571103454, - "grad_norm": 0.68359375, - "learning_rate": 0.0019175499057155862, - "loss": 1.5896, - "step": 317000 - }, - { - "epoch": 2.0645035437934847, - "grad_norm": 1.203125, - "learning_rate": 0.0019174198582482607, - "loss": 1.5825, - "step": 317500 - }, - { - "epoch": 2.067754730476624, - "grad_norm": 1.203125, - "learning_rate": 0.0019172898107809352, - "loss": 1.5415, - "step": 318000 - }, - { - "epoch": 2.0710059171597632, - "grad_norm": 0.59765625, - "learning_rate": 0.0019171597633136095, - "loss": 1.5405, - "step": 318500 - }, - { - "epoch": 2.0742571038429025, - "grad_norm": 1.5, - "learning_rate": 0.001917029715846284, - "loss": 1.5333, - "step": 319000 - }, - { - "epoch": 2.077508290526042, - "grad_norm": 0.67578125, - "learning_rate": 0.0019168996683789584, - "loss": 1.5374, - "step": 319500 - }, - { - "epoch": 2.0807594772091815, - "grad_norm": 1.2109375, - "learning_rate": 0.0019167696209116327, - "loss": 1.5327, - "step": 320000 - }, - { - "epoch": 2.084010663892321, - "grad_norm": 1.3046875, - "learning_rate": 0.0019166395734443072, - "loss": 1.5281, - "step": 320500 - }, - { - "epoch": 2.08726185057546, - "grad_norm": 0.57421875, - "learning_rate": 0.0019165095259769816, - "loss": 1.529, - "step": 321000 - }, - { - "epoch": 2.0905130372585994, - "grad_norm": 0.8515625, - "learning_rate": 0.001916379478509656, - "loss": 1.5322, - "step": 321500 - }, - { - "epoch": 2.0937642239417387, - "grad_norm": 0.8125, - "learning_rate": 0.0019162494310423304, - "loss": 1.5284, - "step": 322000 - }, - { - "epoch": 2.097015410624878, - "grad_norm": 0.71875, - "learning_rate": 0.0019161193835750049, - "loss": 1.5338, - "step": 322500 - }, - { - "epoch": 2.1002665973080172, - "grad_norm": 0.85546875, - "learning_rate": 0.0019159893361076796, - "loss": 1.524, - "step": 323000 - }, - { - "epoch": 2.103517783991157, - "grad_norm": 0.8125, - "learning_rate": 0.0019158592886403538, - "loss": 1.5225, - "step": 323500 - }, - { - "epoch": 2.1067689706742962, - "grad_norm": 0.73046875, - "learning_rate": 0.0019157292411730283, - "loss": 1.5322, - "step": 324000 - }, - { - "epoch": 2.1100201573574355, - "grad_norm": 0.640625, - "learning_rate": 0.0019155991937057028, - "loss": 1.5216, - "step": 324500 - }, - { - "epoch": 2.113271344040575, - "grad_norm": 0.7734375, - "learning_rate": 0.001915469146238377, - "loss": 1.5237, - "step": 325000 - }, - { - "epoch": 2.116522530723714, - "grad_norm": 5.28125, - "learning_rate": 0.0019153390987710515, - "loss": 1.5298, - "step": 325500 - }, - { - "epoch": 2.1197737174068534, - "grad_norm": 0.66796875, - "learning_rate": 0.001915209051303726, - "loss": 1.5249, - "step": 326000 - }, - { - "epoch": 2.1230249040899927, - "grad_norm": 1.71875, - "learning_rate": 0.0019150790038364003, - "loss": 1.5262, - "step": 326500 - }, - { - "epoch": 2.1262760907731324, - "grad_norm": 1.0390625, - "learning_rate": 0.0019149489563690747, - "loss": 1.5222, - 
"step": 327000 - }, - { - "epoch": 2.1295272774562717, - "grad_norm": 0.76171875, - "learning_rate": 0.0019148189089017492, - "loss": 1.5148, - "step": 327500 - }, - { - "epoch": 2.132778464139411, - "grad_norm": 0.65234375, - "learning_rate": 0.0019146888614344235, - "loss": 1.5151, - "step": 328000 - }, - { - "epoch": 2.1360296508225503, - "grad_norm": 0.859375, - "learning_rate": 0.001914558813967098, - "loss": 1.5202, - "step": 328500 - }, - { - "epoch": 2.1392808375056895, - "grad_norm": 0.96875, - "learning_rate": 0.0019144287664997724, - "loss": 1.5299, - "step": 329000 - }, - { - "epoch": 2.142532024188829, - "grad_norm": 1.7265625, - "learning_rate": 0.0019142987190324467, - "loss": 1.5327, - "step": 329500 - }, - { - "epoch": 2.145783210871968, - "grad_norm": 0.81640625, - "learning_rate": 0.0019141686715651212, - "loss": 1.5325, - "step": 330000 - }, - { - "epoch": 2.1490343975551074, - "grad_norm": 1.15625, - "learning_rate": 0.0019140386240977959, - "loss": 1.5311, - "step": 330500 - }, - { - "epoch": 2.152285584238247, - "grad_norm": 0.75390625, - "learning_rate": 0.0019139085766304703, - "loss": 1.5228, - "step": 331000 - }, - { - "epoch": 2.1555367709213864, - "grad_norm": 0.75, - "learning_rate": 0.0019137785291631446, - "loss": 1.5265, - "step": 331500 - }, - { - "epoch": 2.1587879576045257, - "grad_norm": 0.62890625, - "learning_rate": 0.001913648481695819, - "loss": 1.5246, - "step": 332000 - }, - { - "epoch": 2.162039144287665, - "grad_norm": 1.078125, - "learning_rate": 0.0019135184342284936, - "loss": 1.5268, - "step": 332500 - }, - { - "epoch": 2.1652903309708043, - "grad_norm": 1.8046875, - "learning_rate": 0.0019133883867611678, - "loss": 1.5286, - "step": 333000 - }, - { - "epoch": 2.1685415176539435, - "grad_norm": 0.58203125, - "learning_rate": 0.0019132583392938423, - "loss": 1.5289, - "step": 333500 - }, - { - "epoch": 2.171792704337083, - "grad_norm": 3.296875, - "learning_rate": 0.0019131282918265168, - "loss": 1.5281, - "step": 334000 - }, - { - "epoch": 2.1750438910202226, - "grad_norm": 0.765625, - "learning_rate": 0.001912998244359191, - "loss": 1.5205, - "step": 334500 - }, - { - "epoch": 2.178295077703362, - "grad_norm": 1.171875, - "learning_rate": 0.0019128681968918655, - "loss": 1.518, - "step": 335000 - }, - { - "epoch": 2.181546264386501, - "grad_norm": 0.69921875, - "learning_rate": 0.00191273814942454, - "loss": 1.5153, - "step": 335500 - }, - { - "epoch": 2.1847974510696404, - "grad_norm": 0.58203125, - "learning_rate": 0.0019126081019572143, - "loss": 1.5212, - "step": 336000 - }, - { - "epoch": 2.1880486377527797, - "grad_norm": 1.6875, - "learning_rate": 0.0019124780544898887, - "loss": 1.5129, - "step": 336500 - }, - { - "epoch": 2.191299824435919, - "grad_norm": 0.87890625, - "learning_rate": 0.0019123480070225632, - "loss": 1.523, - "step": 337000 - }, - { - "epoch": 2.1945510111190583, - "grad_norm": 0.9296875, - "learning_rate": 0.001912217959555238, - "loss": 1.5178, - "step": 337500 - }, - { - "epoch": 2.197802197802198, - "grad_norm": 0.75390625, - "learning_rate": 0.0019120879120879122, - "loss": 1.5259, - "step": 338000 - }, - { - "epoch": 2.2010533844853373, - "grad_norm": 0.84765625, - "learning_rate": 0.0019119578646205867, - "loss": 1.5123, - "step": 338500 - }, - { - "epoch": 2.2043045711684766, - "grad_norm": 0.64453125, - "learning_rate": 0.0019118278171532611, - "loss": 1.5192, - "step": 339000 - }, - { - "epoch": 2.207555757851616, - "grad_norm": 0.7109375, - "learning_rate": 0.0019116977696859354, - "loss": 1.5165, - 
"step": 339500 - }, - { - "epoch": 2.210806944534755, - "grad_norm": 0.91796875, - "learning_rate": 0.0019115677222186099, - "loss": 1.517, - "step": 340000 - }, - { - "epoch": 2.2140581312178944, - "grad_norm": 0.53515625, - "learning_rate": 0.0019114376747512843, - "loss": 1.5276, - "step": 340500 - }, - { - "epoch": 2.2173093179010337, - "grad_norm": 0.66015625, - "learning_rate": 0.0019113076272839586, - "loss": 1.5122, - "step": 341000 - }, - { - "epoch": 2.2205605045841734, - "grad_norm": 0.859375, - "learning_rate": 0.001911177579816633, - "loss": 1.5157, - "step": 341500 - }, - { - "epoch": 2.2238116912673127, - "grad_norm": 0.6484375, - "learning_rate": 0.0019110475323493076, - "loss": 1.5224, - "step": 342000 - }, - { - "epoch": 2.227062877950452, - "grad_norm": 0.71484375, - "learning_rate": 0.0019109174848819818, - "loss": 1.5313, - "step": 342500 - }, - { - "epoch": 2.2303140646335913, - "grad_norm": 0.87890625, - "learning_rate": 0.0019107874374146563, - "loss": 1.5349, - "step": 343000 - }, - { - "epoch": 2.2335652513167306, - "grad_norm": 0.84375, - "learning_rate": 0.0019106573899473308, - "loss": 1.5298, - "step": 343500 - }, - { - "epoch": 2.23681643799987, - "grad_norm": 1.6875, - "learning_rate": 0.001910527342480005, - "loss": 1.5312, - "step": 344000 - }, - { - "epoch": 2.240067624683009, - "grad_norm": 1.0078125, - "learning_rate": 0.0019103972950126795, - "loss": 1.5363, - "step": 344500 - }, - { - "epoch": 2.243318811366149, - "grad_norm": 0.70703125, - "learning_rate": 0.0019102672475453542, - "loss": 1.535, - "step": 345000 - }, - { - "epoch": 2.246569998049288, - "grad_norm": 0.71484375, - "learning_rate": 0.0019101372000780287, - "loss": 1.5359, - "step": 345500 - }, - { - "epoch": 2.2498211847324274, - "grad_norm": 1.21875, - "learning_rate": 0.001910007152610703, - "loss": 1.5286, - "step": 346000 - }, - { - "epoch": 2.2530723714155667, - "grad_norm": 0.72265625, - "learning_rate": 0.0019098771051433774, - "loss": 1.5244, - "step": 346500 - }, - { - "epoch": 2.256323558098706, - "grad_norm": 0.61328125, - "learning_rate": 0.001909747057676052, - "loss": 1.5219, - "step": 347000 - }, - { - "epoch": 2.2595747447818453, - "grad_norm": 0.83203125, - "learning_rate": 0.0019096170102087262, - "loss": 1.5239, - "step": 347500 - }, - { - "epoch": 2.2628259314649846, - "grad_norm": 0.90234375, - "learning_rate": 0.0019094869627414007, - "loss": 1.5331, - "step": 348000 - }, - { - "epoch": 2.2660771181481243, - "grad_norm": 3.984375, - "learning_rate": 0.0019093569152740751, - "loss": 1.5267, - "step": 348500 - }, - { - "epoch": 2.2693283048312636, - "grad_norm": 0.59375, - "learning_rate": 0.0019092268678067494, - "loss": 1.5241, - "step": 349000 - }, - { - "epoch": 2.272579491514403, - "grad_norm": 0.734375, - "learning_rate": 0.0019090968203394239, - "loss": 1.5222, - "step": 349500 - }, - { - "epoch": 2.275830678197542, - "grad_norm": 0.78125, - "learning_rate": 0.0019089667728720984, - "loss": 1.5208, - "step": 350000 - }, - { - "epoch": 2.2790818648806814, - "grad_norm": 0.703125, - "learning_rate": 0.0019088367254047726, - "loss": 1.5276, - "step": 350500 - }, - { - "epoch": 2.2823330515638207, - "grad_norm": 0.63671875, - "learning_rate": 0.001908706677937447, - "loss": 1.5304, - "step": 351000 - }, - { - "epoch": 2.28558423824696, - "grad_norm": 0.63671875, - "learning_rate": 0.0019085766304701216, - "loss": 1.5276, - "step": 351500 - }, - { - "epoch": 2.2888354249300997, - "grad_norm": 1.1328125, - "learning_rate": 0.0019084465830027963, - "loss": 1.5292, - 
"step": 352000 - }, - { - "epoch": 2.292086611613239, - "grad_norm": 0.6328125, - "learning_rate": 0.0019083165355354705, - "loss": 1.5298, - "step": 352500 - }, - { - "epoch": 2.2953377982963783, - "grad_norm": 3.234375, - "learning_rate": 0.001908186488068145, - "loss": 1.5257, - "step": 353000 - }, - { - "epoch": 2.2985889849795176, - "grad_norm": 3.953125, - "learning_rate": 0.0019080564406008195, - "loss": 1.5232, - "step": 353500 - }, - { - "epoch": 2.301840171662657, - "grad_norm": 14.6875, - "learning_rate": 0.0019079263931334938, - "loss": 1.519, - "step": 354000 - }, - { - "epoch": 2.305091358345796, - "grad_norm": 0.82421875, - "learning_rate": 0.0019077963456661682, - "loss": 1.5199, - "step": 354500 - }, - { - "epoch": 2.3083425450289354, - "grad_norm": 0.66796875, - "learning_rate": 0.0019076662981988427, - "loss": 1.5216, - "step": 355000 - }, - { - "epoch": 2.3115937317120747, - "grad_norm": 1.0078125, - "learning_rate": 0.001907536250731517, - "loss": 1.522, - "step": 355500 - }, - { - "epoch": 2.3148449183952144, - "grad_norm": 0.94140625, - "learning_rate": 0.0019074062032641914, - "loss": 1.5132, - "step": 356000 - }, - { - "epoch": 2.3180961050783537, - "grad_norm": 0.93359375, - "learning_rate": 0.001907276155796866, - "loss": 1.5111, - "step": 356500 - }, - { - "epoch": 2.321347291761493, - "grad_norm": 2.78125, - "learning_rate": 0.0019071461083295402, - "loss": 1.5108, - "step": 357000 - }, - { - "epoch": 2.3245984784446323, - "grad_norm": 3.3125, - "learning_rate": 0.0019070160608622147, - "loss": 1.507, - "step": 357500 - }, - { - "epoch": 2.3278496651277716, - "grad_norm": 0.7734375, - "learning_rate": 0.0019068860133948891, - "loss": 1.5052, - "step": 358000 - }, - { - "epoch": 2.331100851810911, - "grad_norm": 0.91015625, - "learning_rate": 0.0019067559659275634, - "loss": 1.5098, - "step": 358500 - }, - { - "epoch": 2.33435203849405, - "grad_norm": 0.8203125, - "learning_rate": 0.0019066259184602379, - "loss": 1.5149, - "step": 359000 - }, - { - "epoch": 2.3376032251771894, - "grad_norm": 0.68359375, - "learning_rate": 0.0019064958709929126, - "loss": 1.5081, - "step": 359500 - }, - { - "epoch": 2.340854411860329, - "grad_norm": 0.8203125, - "learning_rate": 0.001906365823525587, - "loss": 1.5098, - "step": 360000 - }, - { - "epoch": 2.3441055985434684, - "grad_norm": 0.5859375, - "learning_rate": 0.0019062357760582613, - "loss": 1.5113, - "step": 360500 - }, - { - "epoch": 2.3473567852266077, - "grad_norm": 0.921875, - "learning_rate": 0.0019061057285909358, - "loss": 1.5087, - "step": 361000 - }, - { - "epoch": 2.350607971909747, - "grad_norm": 0.74609375, - "learning_rate": 0.0019059756811236103, - "loss": 1.5004, - "step": 361500 - }, - { - "epoch": 2.3538591585928863, - "grad_norm": 0.59375, - "learning_rate": 0.0019058456336562845, - "loss": 1.5032, - "step": 362000 - }, - { - "epoch": 2.3571103452760256, - "grad_norm": 0.66796875, - "learning_rate": 0.001905715586188959, - "loss": 1.5072, - "step": 362500 - }, - { - "epoch": 2.360361531959165, - "grad_norm": 0.57421875, - "learning_rate": 0.0019055855387216335, - "loss": 1.5085, - "step": 363000 - }, - { - "epoch": 2.3636127186423046, - "grad_norm": 0.73828125, - "learning_rate": 0.0019054554912543078, - "loss": 1.5041, - "step": 363500 - }, - { - "epoch": 2.366863905325444, - "grad_norm": 0.58203125, - "learning_rate": 0.0019053254437869822, - "loss": 1.5088, - "step": 364000 - }, - { - "epoch": 2.370115092008583, - "grad_norm": 0.84375, - "learning_rate": 0.0019051953963196567, - "loss": 1.5057, - 
"step": 364500 - }, - { - "epoch": 2.3733662786917225, - "grad_norm": 0.7890625, - "learning_rate": 0.001905065348852331, - "loss": 1.5074, - "step": 365000 - }, - { - "epoch": 2.3766174653748617, - "grad_norm": 7.75, - "learning_rate": 0.0019049353013850055, - "loss": 1.5067, - "step": 365500 - }, - { - "epoch": 2.379868652058001, - "grad_norm": 1.5625, - "learning_rate": 0.00190480525391768, - "loss": 1.5077, - "step": 366000 - }, - { - "epoch": 2.3831198387411403, - "grad_norm": 0.9609375, - "learning_rate": 0.0019046752064503546, - "loss": 1.5026, - "step": 366500 - }, - { - "epoch": 2.38637102542428, - "grad_norm": 2.265625, - "learning_rate": 0.0019045451589830289, - "loss": 1.5061, - "step": 367000 - }, - { - "epoch": 2.3896222121074193, - "grad_norm": 1.1015625, - "learning_rate": 0.0019044151115157034, - "loss": 1.5035, - "step": 367500 - }, - { - "epoch": 2.3928733987905586, - "grad_norm": 1.1875, - "learning_rate": 0.0019042850640483778, - "loss": 1.5019, - "step": 368000 - }, - { - "epoch": 2.396124585473698, - "grad_norm": 1.5, - "learning_rate": 0.0019041550165810521, - "loss": 1.4978, - "step": 368500 - }, - { - "epoch": 2.399375772156837, - "grad_norm": 0.74609375, - "learning_rate": 0.0019040249691137266, - "loss": 1.4969, - "step": 369000 - }, - { - "epoch": 2.4026269588399765, - "grad_norm": 2.109375, - "learning_rate": 0.001903894921646401, - "loss": 1.4945, - "step": 369500 - }, - { - "epoch": 2.4058781455231157, - "grad_norm": 0.7265625, - "learning_rate": 0.0019037648741790753, - "loss": 1.4858, - "step": 370000 - }, - { - "epoch": 2.4091293322062555, - "grad_norm": 0.93359375, - "learning_rate": 0.0019036348267117498, - "loss": 1.4978, - "step": 370500 - }, - { - "epoch": 2.4123805188893948, - "grad_norm": 0.94921875, - "learning_rate": 0.0019035047792444243, - "loss": 1.491, - "step": 371000 - }, - { - "epoch": 2.415631705572534, - "grad_norm": 1.53125, - "learning_rate": 0.0019033747317770985, - "loss": 1.498, - "step": 371500 - }, - { - "epoch": 2.4188828922556733, - "grad_norm": 1.8828125, - "learning_rate": 0.001903244684309773, - "loss": 1.4916, - "step": 372000 - }, - { - "epoch": 2.4221340789388126, - "grad_norm": 0.75, - "learning_rate": 0.0019031146368424475, - "loss": 1.4962, - "step": 372500 - }, - { - "epoch": 2.425385265621952, - "grad_norm": 0.8984375, - "learning_rate": 0.0019029845893751218, - "loss": 1.4885, - "step": 373000 - }, - { - "epoch": 2.428636452305091, - "grad_norm": 1.421875, - "learning_rate": 0.0019028545419077962, - "loss": 1.4952, - "step": 373500 - }, - { - "epoch": 2.431887638988231, - "grad_norm": 1.6953125, - "learning_rate": 0.001902724494440471, - "loss": 1.4921, - "step": 374000 - }, - { - "epoch": 2.43513882567137, - "grad_norm": 0.5625, - "learning_rate": 0.0019025944469731454, - "loss": 1.4982, - "step": 374500 - }, - { - "epoch": 2.4383900123545095, - "grad_norm": 0.69140625, - "learning_rate": 0.0019024643995058197, - "loss": 1.5005, - "step": 375000 - }, - { - "epoch": 2.4416411990376488, - "grad_norm": 0.6796875, - "learning_rate": 0.0019023343520384942, - "loss": 1.4998, - "step": 375500 - }, - { - "epoch": 2.444892385720788, - "grad_norm": 0.75390625, - "learning_rate": 0.0019022043045711686, - "loss": 1.4939, - "step": 376000 - }, - { - "epoch": 2.4481435724039273, - "grad_norm": 0.83203125, - "learning_rate": 0.001902074257103843, - "loss": 1.4971, - "step": 376500 - }, - { - "epoch": 2.4513947590870666, - "grad_norm": 0.80078125, - "learning_rate": 0.0019019442096365174, - "loss": 1.4931, - "step": 377000 - }, - { 
- "epoch": 2.4546459457702063, - "grad_norm": 2.890625, - "learning_rate": 0.0019018141621691919, - "loss": 1.4884, - "step": 377500 - }, - { - "epoch": 2.4578971324533456, - "grad_norm": 2.0625, - "learning_rate": 0.0019016841147018661, - "loss": 1.4982, - "step": 378000 - }, - { - "epoch": 2.461148319136485, - "grad_norm": 1.0625, - "learning_rate": 0.0019015540672345406, - "loss": 1.496, - "step": 378500 - }, - { - "epoch": 2.464399505819624, - "grad_norm": 0.6796875, - "learning_rate": 0.001901424019767215, - "loss": 1.4937, - "step": 379000 - }, - { - "epoch": 2.4676506925027635, - "grad_norm": 1.2109375, - "learning_rate": 0.0019012939722998893, - "loss": 1.4894, - "step": 379500 - }, - { - "epoch": 2.4709018791859028, - "grad_norm": 0.6484375, - "learning_rate": 0.0019011639248325638, - "loss": 1.4896, - "step": 380000 - }, - { - "epoch": 2.474153065869042, - "grad_norm": 3.859375, - "learning_rate": 0.0019010338773652383, - "loss": 1.492, - "step": 380500 - }, - { - "epoch": 2.4774042525521818, - "grad_norm": 0.92578125, - "learning_rate": 0.001900903829897913, - "loss": 1.499, - "step": 381000 - }, - { - "epoch": 2.480655439235321, - "grad_norm": 0.69921875, - "learning_rate": 0.0019007737824305873, - "loss": 1.4932, - "step": 381500 - }, - { - "epoch": 2.4839066259184603, - "grad_norm": 0.66796875, - "learning_rate": 0.0019006437349632617, - "loss": 1.4962, - "step": 382000 - }, - { - "epoch": 2.4871578126015996, - "grad_norm": 0.8359375, - "learning_rate": 0.0019005136874959362, - "loss": 1.4942, - "step": 382500 - }, - { - "epoch": 2.490408999284739, - "grad_norm": 0.671875, - "learning_rate": 0.0019003836400286105, - "loss": 1.487, - "step": 383000 - }, - { - "epoch": 2.493660185967878, - "grad_norm": 0.75, - "learning_rate": 0.001900253592561285, - "loss": 1.4892, - "step": 383500 - }, - { - "epoch": 2.4969113726510175, - "grad_norm": 0.85546875, - "learning_rate": 0.0019001235450939594, - "loss": 1.4959, - "step": 384000 - }, - { - "epoch": 2.500162559334157, - "grad_norm": 0.765625, - "learning_rate": 0.0018999934976266337, - "loss": 1.4904, - "step": 384500 - }, - { - "epoch": 2.503413746017296, - "grad_norm": 0.53515625, - "learning_rate": 0.0018998634501593082, - "loss": 1.4926, - "step": 385000 - }, - { - "epoch": 2.5066649327004358, - "grad_norm": 2.390625, - "learning_rate": 0.0018997334026919826, - "loss": 1.4885, - "step": 385500 - }, - { - "epoch": 2.509916119383575, - "grad_norm": 0.6796875, - "learning_rate": 0.001899603355224657, - "loss": 1.489, - "step": 386000 - }, - { - "epoch": 2.5131673060667143, - "grad_norm": 0.7890625, - "learning_rate": 0.0018994733077573314, - "loss": 1.4856, - "step": 386500 - }, - { - "epoch": 2.5164184927498536, - "grad_norm": 2.359375, - "learning_rate": 0.0018993432602900059, - "loss": 1.4854, - "step": 387000 - }, - { - "epoch": 2.519669679432993, - "grad_norm": 0.7421875, - "learning_rate": 0.0018992132128226801, - "loss": 1.487, - "step": 387500 - }, - { - "epoch": 2.5229208661161326, - "grad_norm": 0.86328125, - "learning_rate": 0.0018990831653553546, - "loss": 1.4987, - "step": 388000 - }, - { - "epoch": 2.5261720527992715, - "grad_norm": 0.671875, - "learning_rate": 0.0018989531178880293, - "loss": 1.4923, - "step": 388500 - }, - { - "epoch": 2.529423239482411, - "grad_norm": 1.0390625, - "learning_rate": 0.0018988230704207038, - "loss": 1.4905, - "step": 389000 - }, - { - "epoch": 2.5326744261655505, - "grad_norm": 4.875, - "learning_rate": 0.001898693022953378, - "loss": 1.4929, - "step": 389500 - }, - { - "epoch": 
2.53592561284869, - "grad_norm": 1.2109375, - "learning_rate": 0.0018985629754860525, - "loss": 1.4889, - "step": 390000 - }, - { - "epoch": 2.539176799531829, - "grad_norm": 0.8671875, - "learning_rate": 0.001898432928018727, - "loss": 1.4915, - "step": 390500 - }, - { - "epoch": 2.5424279862149683, - "grad_norm": 0.87109375, - "learning_rate": 0.0018983028805514013, - "loss": 1.484, - "step": 391000 - }, - { - "epoch": 2.545679172898108, - "grad_norm": 0.8125, - "learning_rate": 0.0018981728330840757, - "loss": 1.4877, - "step": 391500 - }, - { - "epoch": 2.548930359581247, - "grad_norm": 0.71875, - "learning_rate": 0.0018980427856167502, - "loss": 1.4979, - "step": 392000 - }, - { - "epoch": 2.5521815462643866, - "grad_norm": 0.71875, - "learning_rate": 0.0018979127381494245, - "loss": 1.4893, - "step": 392500 - }, - { - "epoch": 2.555432732947526, - "grad_norm": 1.0078125, - "learning_rate": 0.001897782690682099, - "loss": 1.4904, - "step": 393000 - }, - { - "epoch": 2.558683919630665, - "grad_norm": 0.64453125, - "learning_rate": 0.0018976526432147734, - "loss": 1.4982, - "step": 393500 - }, - { - "epoch": 2.5619351063138045, - "grad_norm": 1.46875, - "learning_rate": 0.0018975225957474477, - "loss": 1.4952, - "step": 394000 - }, - { - "epoch": 2.565186292996944, - "grad_norm": 0.7578125, - "learning_rate": 0.0018973925482801222, - "loss": 1.4959, - "step": 394500 - }, - { - "epoch": 2.568437479680083, - "grad_norm": 0.7578125, - "learning_rate": 0.0018972625008127967, - "loss": 1.488, - "step": 395000 - }, - { - "epoch": 2.5716886663632224, - "grad_norm": 1.734375, - "learning_rate": 0.0018971324533454713, - "loss": 1.491, - "step": 395500 - }, - { - "epoch": 2.574939853046362, - "grad_norm": 1.609375, - "learning_rate": 0.0018970024058781456, - "loss": 1.4935, - "step": 396000 - }, - { - "epoch": 2.5781910397295014, - "grad_norm": 1.1328125, - "learning_rate": 0.00189687235841082, - "loss": 1.4867, - "step": 396500 - }, - { - "epoch": 2.5814422264126407, - "grad_norm": 0.69921875, - "learning_rate": 0.0018967423109434946, - "loss": 1.4899, - "step": 397000 - }, - { - "epoch": 2.58469341309578, - "grad_norm": 2.890625, - "learning_rate": 0.0018966122634761688, - "loss": 1.4949, - "step": 397500 - }, - { - "epoch": 2.587944599778919, - "grad_norm": 0.87109375, - "learning_rate": 0.0018964822160088433, - "loss": 1.4928, - "step": 398000 - }, - { - "epoch": 2.5911957864620585, - "grad_norm": 0.70703125, - "learning_rate": 0.0018963521685415178, - "loss": 1.4994, - "step": 398500 - }, - { - "epoch": 2.594446973145198, - "grad_norm": 1.0078125, - "learning_rate": 0.001896222121074192, - "loss": 1.4884, - "step": 399000 - }, - { - "epoch": 2.5976981598283375, - "grad_norm": 0.88671875, - "learning_rate": 0.0018960920736068665, - "loss": 1.479, - "step": 399500 - }, - { - "epoch": 2.600949346511477, - "grad_norm": 0.7109375, - "learning_rate": 0.001895962026139541, - "loss": 1.4862, - "step": 400000 - }, - { - "epoch": 2.604200533194616, - "grad_norm": 0.75, - "learning_rate": 0.0018958319786722153, - "loss": 1.4912, - "step": 400500 - }, - { - "epoch": 2.6074517198777554, - "grad_norm": 0.65234375, - "learning_rate": 0.0018957019312048897, - "loss": 1.4863, - "step": 401000 - }, - { - "epoch": 2.6107029065608947, - "grad_norm": 0.7109375, - "learning_rate": 0.0018955718837375642, - "loss": 1.4892, - "step": 401500 - }, - { - "epoch": 2.613954093244034, - "grad_norm": 0.87109375, - "learning_rate": 0.0018954418362702385, - "loss": 1.4843, - "step": 402000 - }, - { - "epoch": 
2.6172052799271732, - "grad_norm": 1.0703125, - "learning_rate": 0.001895311788802913, - "loss": 1.4803, - "step": 402500 - }, - { - "epoch": 2.620456466610313, - "grad_norm": 0.99609375, - "learning_rate": 0.0018951817413355877, - "loss": 1.4781, - "step": 403000 - }, - { - "epoch": 2.6237076532934522, - "grad_norm": 0.75, - "learning_rate": 0.0018950516938682621, - "loss": 1.484, - "step": 403500 - }, - { - "epoch": 2.6269588399765915, - "grad_norm": 0.7265625, - "learning_rate": 0.0018949216464009364, - "loss": 1.4841, - "step": 404000 - }, - { - "epoch": 2.630210026659731, - "grad_norm": 0.8359375, - "learning_rate": 0.0018947915989336109, - "loss": 1.4807, - "step": 404500 - }, - { - "epoch": 2.63346121334287, - "grad_norm": 0.7578125, - "learning_rate": 0.0018946615514662854, - "loss": 1.4828, - "step": 405000 - }, - { - "epoch": 2.6367124000260094, - "grad_norm": 0.78125, - "learning_rate": 0.0018945315039989596, - "loss": 1.4817, - "step": 405500 - }, - { - "epoch": 2.6399635867091487, - "grad_norm": 0.83203125, - "learning_rate": 0.001894401456531634, - "loss": 1.4844, - "step": 406000 - }, - { - "epoch": 2.6432147733922884, - "grad_norm": 0.82421875, - "learning_rate": 0.0018942714090643086, - "loss": 1.4832, - "step": 406500 - }, - { - "epoch": 2.6464659600754277, - "grad_norm": 0.6796875, - "learning_rate": 0.0018941413615969828, - "loss": 1.4784, - "step": 407000 - }, - { - "epoch": 2.649717146758567, - "grad_norm": 1.0078125, - "learning_rate": 0.0018940113141296573, - "loss": 1.4795, - "step": 407500 - }, - { - "epoch": 2.6529683334417062, - "grad_norm": 0.7109375, - "learning_rate": 0.0018938812666623318, - "loss": 1.4795, - "step": 408000 - }, - { - "epoch": 2.6562195201248455, - "grad_norm": 0.76171875, - "learning_rate": 0.001893751219195006, - "loss": 1.4835, - "step": 408500 - }, - { - "epoch": 2.659470706807985, - "grad_norm": 0.7109375, - "learning_rate": 0.0018936211717276805, - "loss": 1.4775, - "step": 409000 - }, - { - "epoch": 2.662721893491124, - "grad_norm": 0.7265625, - "learning_rate": 0.001893491124260355, - "loss": 1.4808, - "step": 409500 - }, - { - "epoch": 2.665973080174264, - "grad_norm": 0.69921875, - "learning_rate": 0.0018933610767930297, - "loss": 1.4789, - "step": 410000 - }, - { - "epoch": 2.669224266857403, - "grad_norm": 0.73046875, - "learning_rate": 0.001893231029325704, - "loss": 1.4741, - "step": 410500 - }, - { - "epoch": 2.6724754535405424, - "grad_norm": 0.83984375, - "learning_rate": 0.0018931009818583784, - "loss": 1.4803, - "step": 411000 - }, - { - "epoch": 2.6757266402236817, - "grad_norm": 0.6484375, - "learning_rate": 0.001892970934391053, - "loss": 1.4856, - "step": 411500 - }, - { - "epoch": 2.678977826906821, - "grad_norm": 1.4296875, - "learning_rate": 0.0018928408869237272, - "loss": 1.4796, - "step": 412000 - }, - { - "epoch": 2.6822290135899602, - "grad_norm": 0.91015625, - "learning_rate": 0.0018927108394564017, - "loss": 1.4844, - "step": 412500 - }, - { - "epoch": 2.6854802002730995, - "grad_norm": 3.4375, - "learning_rate": 0.0018925807919890761, - "loss": 1.4866, - "step": 413000 - }, - { - "epoch": 2.6887313869562393, - "grad_norm": 0.70703125, - "learning_rate": 0.0018924507445217504, - "loss": 1.496, - "step": 413500 - }, - { - "epoch": 2.691982573639378, - "grad_norm": 0.7109375, - "learning_rate": 0.0018923206970544249, - "loss": 1.4882, - "step": 414000 - }, - { - "epoch": 2.695233760322518, - "grad_norm": 2.734375, - "learning_rate": 0.0018921906495870994, - "loss": 1.4902, - "step": 414500 - }, - { - "epoch": 
2.698484947005657, - "grad_norm": 0.7109375, - "learning_rate": 0.0018920606021197736, - "loss": 1.4878, - "step": 415000 - }, - { - "epoch": 2.7017361336887964, - "grad_norm": 0.6328125, - "learning_rate": 0.001891930554652448, - "loss": 1.4858, - "step": 415500 - }, - { - "epoch": 2.7049873203719357, - "grad_norm": 0.7265625, - "learning_rate": 0.0018918005071851226, - "loss": 1.4857, - "step": 416000 - }, - { - "epoch": 2.708238507055075, - "grad_norm": 1.09375, - "learning_rate": 0.0018916704597177968, - "loss": 1.4811, - "step": 416500 - }, - { - "epoch": 2.7114896937382147, - "grad_norm": 0.67578125, - "learning_rate": 0.0018915404122504713, - "loss": 1.481, - "step": 417000 - }, - { - "epoch": 2.7147408804213535, - "grad_norm": 0.609375, - "learning_rate": 0.001891410364783146, - "loss": 1.4776, - "step": 417500 - }, - { - "epoch": 2.7179920671044933, - "grad_norm": 15.0625, - "learning_rate": 0.0018912803173158205, - "loss": 1.4783, - "step": 418000 - }, - { - "epoch": 2.7212432537876325, - "grad_norm": 1.1640625, - "learning_rate": 0.0018911502698484948, - "loss": 1.4751, - "step": 418500 - }, - { - "epoch": 2.724494440470772, - "grad_norm": 0.90234375, - "learning_rate": 0.0018910202223811692, - "loss": 1.4733, - "step": 419000 - }, - { - "epoch": 2.727745627153911, - "grad_norm": 1.8359375, - "learning_rate": 0.0018908901749138437, - "loss": 1.4783, - "step": 419500 - }, - { - "epoch": 2.7309968138370504, - "grad_norm": 1.6171875, - "learning_rate": 0.001890760127446518, - "loss": 1.4742, - "step": 420000 - }, - { - "epoch": 2.73424800052019, - "grad_norm": 0.6796875, - "learning_rate": 0.0018906300799791925, - "loss": 1.4728, - "step": 420500 - }, - { - "epoch": 2.737499187203329, - "grad_norm": 0.66015625, - "learning_rate": 0.001890500032511867, - "loss": 1.4777, - "step": 421000 - }, - { - "epoch": 2.7407503738864687, - "grad_norm": 0.66796875, - "learning_rate": 0.0018903699850445412, - "loss": 1.4702, - "step": 421500 - }, - { - "epoch": 2.744001560569608, - "grad_norm": 0.74609375, - "learning_rate": 0.0018902399375772157, - "loss": 1.474, - "step": 422000 - }, - { - "epoch": 2.7472527472527473, - "grad_norm": 1.09375, - "learning_rate": 0.0018901098901098902, - "loss": 1.4753, - "step": 422500 - }, - { - "epoch": 2.7505039339358865, - "grad_norm": 0.88671875, - "learning_rate": 0.0018899798426425644, - "loss": 1.4768, - "step": 423000 - }, - { - "epoch": 2.753755120619026, - "grad_norm": 0.63671875, - "learning_rate": 0.001889849795175239, - "loss": 1.4777, - "step": 423500 - }, - { - "epoch": 2.7570063073021656, - "grad_norm": 1.921875, - "learning_rate": 0.0018897197477079134, - "loss": 1.4726, - "step": 424000 - }, - { - "epoch": 2.7602574939853044, - "grad_norm": 0.63671875, - "learning_rate": 0.001889589700240588, - "loss": 1.477, - "step": 424500 - }, - { - "epoch": 2.763508680668444, - "grad_norm": 0.62890625, - "learning_rate": 0.0018894596527732623, - "loss": 1.4721, - "step": 425000 - }, - { - "epoch": 2.7667598673515834, - "grad_norm": 0.76953125, - "learning_rate": 0.0018893296053059368, - "loss": 1.4787, - "step": 425500 - }, - { - "epoch": 2.7700110540347227, - "grad_norm": 0.8125, - "learning_rate": 0.0018891995578386113, - "loss": 1.4727, - "step": 426000 - }, - { - "epoch": 2.773262240717862, - "grad_norm": 0.78125, - "learning_rate": 0.0018890695103712855, - "loss": 1.4694, - "step": 426500 - }, - { - "epoch": 2.7765134274010013, - "grad_norm": 0.6796875, - "learning_rate": 0.00188893946290396, - "loss": 1.476, - "step": 427000 - }, - { - "epoch": 
2.7797646140841406, - "grad_norm": 0.7265625, - "learning_rate": 0.0018888094154366345, - "loss": 1.4783, - "step": 427500 - }, - { - "epoch": 2.78301580076728, - "grad_norm": 0.8046875, - "learning_rate": 0.0018886793679693088, - "loss": 1.4707, - "step": 428000 - }, - { - "epoch": 2.7862669874504196, - "grad_norm": 0.6484375, - "learning_rate": 0.0018885493205019832, - "loss": 1.4717, - "step": 428500 - }, - { - "epoch": 2.789518174133559, - "grad_norm": 0.8203125, - "learning_rate": 0.0018884192730346577, - "loss": 1.4716, - "step": 429000 - }, - { - "epoch": 2.792769360816698, - "grad_norm": 0.6328125, - "learning_rate": 0.001888289225567332, - "loss": 1.4667, - "step": 429500 - }, - { - "epoch": 2.7960205474998374, - "grad_norm": 0.69140625, - "learning_rate": 0.0018881591781000065, - "loss": 1.4683, - "step": 430000 - }, - { - "epoch": 2.7992717341829767, - "grad_norm": 12.0625, - "learning_rate": 0.001888029130632681, - "loss": 1.4719, - "step": 430500 - }, - { - "epoch": 2.802522920866116, - "grad_norm": 1.9296875, - "learning_rate": 0.0018878990831653552, - "loss": 1.4711, - "step": 431000 - }, - { - "epoch": 2.8057741075492553, - "grad_norm": 1.390625, - "learning_rate": 0.0018877690356980297, - "loss": 1.4727, - "step": 431500 - }, - { - "epoch": 2.809025294232395, - "grad_norm": 0.65234375, - "learning_rate": 0.0018876389882307044, - "loss": 1.472, - "step": 432000 - }, - { - "epoch": 2.8122764809155343, - "grad_norm": 0.515625, - "learning_rate": 0.0018875089407633789, - "loss": 1.4796, - "step": 432500 - }, - { - "epoch": 2.8155276675986736, - "grad_norm": 0.90234375, - "learning_rate": 0.0018873788932960531, - "loss": 1.4719, - "step": 433000 - }, - { - "epoch": 2.818778854281813, - "grad_norm": 0.83984375, - "learning_rate": 0.0018872488458287276, - "loss": 1.4764, - "step": 433500 - }, - { - "epoch": 2.822030040964952, - "grad_norm": 0.7109375, - "learning_rate": 0.001887118798361402, - "loss": 1.4712, - "step": 434000 - }, - { - "epoch": 2.8252812276480914, - "grad_norm": 12.875, - "learning_rate": 0.0018869887508940763, - "loss": 1.4664, - "step": 434500 - }, - { - "epoch": 2.8285324143312307, - "grad_norm": 0.6640625, - "learning_rate": 0.0018868587034267508, - "loss": 1.4708, - "step": 435000 - }, - { - "epoch": 2.8317836010143704, - "grad_norm": 3.203125, - "learning_rate": 0.0018867286559594253, - "loss": 1.4718, - "step": 435500 - }, - { - "epoch": 2.8350347876975097, - "grad_norm": 0.66796875, - "learning_rate": 0.0018865986084920996, - "loss": 1.4719, - "step": 436000 - }, - { - "epoch": 2.838285974380649, - "grad_norm": 0.69921875, - "learning_rate": 0.001886468561024774, - "loss": 1.4692, - "step": 436500 - }, - { - "epoch": 2.8415371610637883, - "grad_norm": 2.953125, - "learning_rate": 0.0018863385135574485, - "loss": 1.4735, - "step": 437000 - }, - { - "epoch": 2.8447883477469276, - "grad_norm": 0.70703125, - "learning_rate": 0.0018862084660901228, - "loss": 1.4724, - "step": 437500 - }, - { - "epoch": 2.848039534430067, - "grad_norm": 0.74609375, - "learning_rate": 0.0018860784186227973, - "loss": 1.4687, - "step": 438000 - }, - { - "epoch": 2.851290721113206, - "grad_norm": 0.73046875, - "learning_rate": 0.0018859483711554717, - "loss": 1.4734, - "step": 438500 - }, - { - "epoch": 2.854541907796346, - "grad_norm": 0.76171875, - "learning_rate": 0.0018858183236881464, - "loss": 1.4705, - "step": 439000 - }, - { - "epoch": 2.857793094479485, - "grad_norm": 0.6953125, - "learning_rate": 0.0018856882762208207, - "loss": 1.4702, - "step": 439500 - }, - { - 
"epoch": 2.8610442811626244, - "grad_norm": 0.640625, - "learning_rate": 0.0018855582287534952, - "loss": 1.4725, - "step": 440000 - }, - { - "epoch": 2.8642954678457637, - "grad_norm": 0.87109375, - "learning_rate": 0.0018854281812861696, - "loss": 1.4784, - "step": 440500 - }, - { - "epoch": 2.867546654528903, - "grad_norm": 1.15625, - "learning_rate": 0.001885298133818844, - "loss": 1.4737, - "step": 441000 - }, - { - "epoch": 2.8707978412120423, - "grad_norm": 0.61328125, - "learning_rate": 0.0018851680863515184, - "loss": 1.4762, - "step": 441500 - }, - { - "epoch": 2.8740490278951816, - "grad_norm": 1.2421875, - "learning_rate": 0.0018850380388841929, - "loss": 1.4729, - "step": 442000 - }, - { - "epoch": 2.8773002145783213, - "grad_norm": 0.64453125, - "learning_rate": 0.0018849079914168671, - "loss": 1.4756, - "step": 442500 - }, - { - "epoch": 2.8805514012614606, - "grad_norm": 0.578125, - "learning_rate": 0.0018847779439495416, - "loss": 1.4757, - "step": 443000 - }, - { - "epoch": 2.8838025879446, - "grad_norm": 0.56640625, - "learning_rate": 0.001884647896482216, - "loss": 1.4769, - "step": 443500 - }, - { - "epoch": 2.887053774627739, - "grad_norm": 0.703125, - "learning_rate": 0.0018845178490148903, - "loss": 1.4805, - "step": 444000 - }, - { - "epoch": 2.8903049613108784, - "grad_norm": 0.68359375, - "learning_rate": 0.0018843878015475648, - "loss": 1.4756, - "step": 444500 - }, - { - "epoch": 2.8935561479940177, - "grad_norm": 0.80078125, - "learning_rate": 0.0018842577540802393, - "loss": 1.4753, - "step": 445000 - }, - { - "epoch": 2.896807334677157, - "grad_norm": 2.875, - "learning_rate": 0.0018841277066129136, - "loss": 1.474, - "step": 445500 - }, - { - "epoch": 2.9000585213602967, - "grad_norm": 0.73046875, - "learning_rate": 0.001883997659145588, - "loss": 1.4762, - "step": 446000 - }, - { - "epoch": 2.9033097080434356, - "grad_norm": 0.86328125, - "learning_rate": 0.0018838676116782627, - "loss": 1.4703, - "step": 446500 - }, - { - "epoch": 2.9065608947265753, - "grad_norm": 0.7890625, - "learning_rate": 0.0018837375642109372, - "loss": 1.4681, - "step": 447000 - }, - { - "epoch": 2.9098120814097146, - "grad_norm": 2.6875, - "learning_rate": 0.0018836075167436115, - "loss": 1.4768, - "step": 447500 - }, - { - "epoch": 2.913063268092854, - "grad_norm": 0.92578125, - "learning_rate": 0.001883477469276286, - "loss": 1.4705, - "step": 448000 - }, - { - "epoch": 2.916314454775993, - "grad_norm": 1.8828125, - "learning_rate": 0.0018833474218089604, - "loss": 1.4775, - "step": 448500 - }, - { - "epoch": 2.9195656414591324, - "grad_norm": 0.703125, - "learning_rate": 0.0018832173743416347, - "loss": 1.4738, - "step": 449000 - }, - { - "epoch": 2.922816828142272, - "grad_norm": 1.03125, - "learning_rate": 0.0018830873268743092, - "loss": 1.4776, - "step": 449500 - }, - { - "epoch": 2.926068014825411, - "grad_norm": 0.8515625, - "learning_rate": 0.0018829572794069837, - "loss": 1.4748, - "step": 450000 - }, - { - "epoch": 2.9293192015085507, - "grad_norm": 0.93359375, - "learning_rate": 0.001882827231939658, - "loss": 1.4722, - "step": 450500 - }, - { - "epoch": 2.93257038819169, - "grad_norm": 0.70703125, - "learning_rate": 0.0018826971844723324, - "loss": 1.476, - "step": 451000 - }, - { - "epoch": 2.9358215748748293, - "grad_norm": 0.6640625, - "learning_rate": 0.0018825671370050069, - "loss": 1.4676, - "step": 451500 - }, - { - "epoch": 2.9390727615579686, - "grad_norm": 0.671875, - "learning_rate": 0.0018824370895376811, - "loss": 1.4714, - "step": 452000 - }, - { - 
"epoch": 2.942323948241108, - "grad_norm": 0.77734375, - "learning_rate": 0.0018823070420703556, - "loss": 1.468, - "step": 452500 - }, - { - "epoch": 2.9455751349242476, - "grad_norm": 0.8671875, - "learning_rate": 0.00188217699460303, - "loss": 1.472, - "step": 453000 - }, - { - "epoch": 2.9488263216073864, - "grad_norm": 0.9296875, - "learning_rate": 0.0018820469471357048, - "loss": 1.4727, - "step": 453500 - }, - { - "epoch": 2.952077508290526, - "grad_norm": 0.8515625, - "learning_rate": 0.001881916899668379, - "loss": 1.4722, - "step": 454000 - }, - { - "epoch": 2.9553286949736655, - "grad_norm": 0.61328125, - "learning_rate": 0.0018817868522010535, - "loss": 1.4691, - "step": 454500 - }, - { - "epoch": 2.9585798816568047, - "grad_norm": 0.75, - "learning_rate": 0.001881656804733728, - "loss": 1.4649, - "step": 455000 - }, - { - "epoch": 2.961831068339944, - "grad_norm": 0.67578125, - "learning_rate": 0.0018815267572664023, - "loss": 1.4633, - "step": 455500 - }, - { - "epoch": 2.9650822550230833, - "grad_norm": 0.734375, - "learning_rate": 0.0018813967097990767, - "loss": 1.461, - "step": 456000 - }, - { - "epoch": 2.968333441706223, - "grad_norm": 0.84765625, - "learning_rate": 0.0018812666623317512, - "loss": 1.4665, - "step": 456500 - }, - { - "epoch": 2.971584628389362, - "grad_norm": 0.6875, - "learning_rate": 0.0018811366148644255, - "loss": 1.4658, - "step": 457000 - }, - { - "epoch": 2.9748358150725016, - "grad_norm": 0.79296875, - "learning_rate": 0.0018810065673971, - "loss": 1.4662, - "step": 457500 - }, - { - "epoch": 2.978087001755641, - "grad_norm": 0.6484375, - "learning_rate": 0.0018808765199297744, - "loss": 1.4632, - "step": 458000 - }, - { - "epoch": 2.98133818843878, - "grad_norm": 0.80078125, - "learning_rate": 0.0018807464724624487, - "loss": 1.4701, - "step": 458500 - }, - { - "epoch": 2.9845893751219195, - "grad_norm": 0.95703125, - "learning_rate": 0.0018806164249951232, - "loss": 1.4664, - "step": 459000 - }, - { - "epoch": 2.9878405618050587, - "grad_norm": 0.7578125, - "learning_rate": 0.0018804863775277977, - "loss": 1.4679, - "step": 459500 - }, - { - "epoch": 2.991091748488198, - "grad_norm": 0.765625, - "learning_rate": 0.001880356330060472, - "loss": 1.4674, - "step": 460000 - }, - { - "epoch": 2.9943429351713373, - "grad_norm": 1.515625, - "learning_rate": 0.0018802262825931464, - "loss": 1.4695, - "step": 460500 - }, - { - "epoch": 2.997594121854477, - "grad_norm": 0.8359375, - "learning_rate": 0.001880096235125821, - "loss": 1.464, - "step": 461000 - }, - { - "epoch": 3.0, - "eval_loss": 1.4555257558822632, - "eval_runtime": 0.5334, - "eval_samples_per_second": 1874.801, - "eval_steps_per_second": 29.997, - "step": 461370 - }, - { - "epoch": 3.0008453085376163, - "grad_norm": 0.78125, - "learning_rate": 0.0018799661876584956, - "loss": 1.4686, - "step": 461500 - }, - { - "epoch": 3.0040964952207556, - "grad_norm": 0.8359375, - "learning_rate": 0.0018798361401911698, - "loss": 1.4627, - "step": 462000 - }, - { - "epoch": 3.007347681903895, - "grad_norm": 2.734375, - "learning_rate": 0.0018797060927238443, - "loss": 1.4682, - "step": 462500 - }, - { - "epoch": 3.010598868587034, - "grad_norm": 1.0, - "learning_rate": 0.0018795760452565188, - "loss": 1.4645, - "step": 463000 - }, - { - "epoch": 3.0138500552701735, - "grad_norm": 0.6796875, - "learning_rate": 0.001879445997789193, - "loss": 1.4582, - "step": 463500 - }, - { - "epoch": 3.0171012419533128, - "grad_norm": 0.8828125, - "learning_rate": 0.0018793159503218675, - "loss": 1.462, - "step": 
464000 - }, - { - "epoch": 3.0203524286364525, - "grad_norm": 0.921875, - "learning_rate": 0.001879185902854542, - "loss": 1.4655, - "step": 464500 - }, - { - "epoch": 3.0236036153195918, - "grad_norm": 0.6171875, - "learning_rate": 0.0018790558553872163, - "loss": 1.4624, - "step": 465000 - }, - { - "epoch": 3.026854802002731, - "grad_norm": 0.8125, - "learning_rate": 0.0018789258079198908, - "loss": 1.4621, - "step": 465500 - }, - { - "epoch": 3.0301059886858703, - "grad_norm": 1.25, - "learning_rate": 0.0018787957604525652, - "loss": 1.4682, - "step": 466000 - }, - { - "epoch": 3.0333571753690096, - "grad_norm": 0.7890625, - "learning_rate": 0.0018786657129852395, - "loss": 1.4636, - "step": 466500 - }, - { - "epoch": 3.036608362052149, - "grad_norm": 0.609375, - "learning_rate": 0.001878535665517914, - "loss": 1.4661, - "step": 467000 - }, - { - "epoch": 3.039859548735288, - "grad_norm": 0.78515625, - "learning_rate": 0.0018784056180505884, - "loss": 1.4604, - "step": 467500 - }, - { - "epoch": 3.043110735418428, - "grad_norm": 0.6796875, - "learning_rate": 0.0018782755705832631, - "loss": 1.4597, - "step": 468000 - }, - { - "epoch": 3.046361922101567, - "grad_norm": 0.65625, - "learning_rate": 0.0018781455231159374, - "loss": 1.4634, - "step": 468500 - }, - { - "epoch": 3.0496131087847065, - "grad_norm": 1.0390625, - "learning_rate": 0.0018780154756486119, - "loss": 1.4617, - "step": 469000 - }, - { - "epoch": 3.0528642954678458, - "grad_norm": 0.65625, - "learning_rate": 0.0018778854281812864, - "loss": 1.4653, - "step": 469500 - }, - { - "epoch": 3.056115482150985, - "grad_norm": 1.4609375, - "learning_rate": 0.0018777553807139606, - "loss": 1.4659, - "step": 470000 - }, - { - "epoch": 3.0593666688341243, - "grad_norm": 0.79296875, - "learning_rate": 0.001877625333246635, - "loss": 1.4602, - "step": 470500 - }, - { - "epoch": 3.0626178555172636, - "grad_norm": 1.0546875, - "learning_rate": 0.0018774952857793096, - "loss": 1.4627, - "step": 471000 - }, - { - "epoch": 3.0658690422004033, - "grad_norm": 1.15625, - "learning_rate": 0.0018773652383119838, - "loss": 1.4682, - "step": 471500 - }, - { - "epoch": 3.0691202288835426, - "grad_norm": 2.609375, - "learning_rate": 0.0018772351908446583, - "loss": 1.4656, - "step": 472000 - }, - { - "epoch": 3.072371415566682, - "grad_norm": 0.671875, - "learning_rate": 0.0018771051433773328, - "loss": 1.4598, - "step": 472500 - }, - { - "epoch": 3.075622602249821, - "grad_norm": 0.70703125, - "learning_rate": 0.001876975095910007, - "loss": 1.4602, - "step": 473000 - }, - { - "epoch": 3.0788737889329605, - "grad_norm": 0.7578125, - "learning_rate": 0.0018768450484426815, - "loss": 1.4621, - "step": 473500 - }, - { - "epoch": 3.0821249756160998, - "grad_norm": 0.6171875, - "learning_rate": 0.001876715000975356, - "loss": 1.4628, - "step": 474000 - }, - { - "epoch": 3.085376162299239, - "grad_norm": 0.8203125, - "learning_rate": 0.0018765849535080303, - "loss": 1.4626, - "step": 474500 - }, - { - "epoch": 3.088627348982379, - "grad_norm": 0.73828125, - "learning_rate": 0.0018764549060407048, - "loss": 1.466, - "step": 475000 - }, - { - "epoch": 3.091878535665518, - "grad_norm": 0.8671875, - "learning_rate": 0.0018763248585733795, - "loss": 1.462, - "step": 475500 - }, - { - "epoch": 3.0951297223486574, - "grad_norm": 7.71875, - "learning_rate": 0.001876194811106054, - "loss": 1.464, - "step": 476000 - }, - { - "epoch": 3.0983809090317966, - "grad_norm": 0.625, - "learning_rate": 0.0018760647636387282, - "loss": 1.4624, - "step": 476500 - }, - { - 
"epoch": 3.101632095714936, - "grad_norm": 1.4921875, - "learning_rate": 0.0018759347161714027, - "loss": 1.4656, - "step": 477000 - }, - { - "epoch": 3.104883282398075, - "grad_norm": 0.73828125, - "learning_rate": 0.0018758046687040772, - "loss": 1.464, - "step": 477500 - }, - { - "epoch": 3.1081344690812145, - "grad_norm": 0.9140625, - "learning_rate": 0.0018756746212367514, - "loss": 1.461, - "step": 478000 - }, - { - "epoch": 3.1113856557643538, - "grad_norm": 2.328125, - "learning_rate": 0.0018755445737694259, - "loss": 1.4651, - "step": 478500 - }, - { - "epoch": 3.1146368424474935, - "grad_norm": 1.1328125, - "learning_rate": 0.0018754145263021004, - "loss": 1.4645, - "step": 479000 - }, - { - "epoch": 3.117888029130633, - "grad_norm": 0.62109375, - "learning_rate": 0.0018752844788347746, - "loss": 1.4721, - "step": 479500 - }, - { - "epoch": 3.121139215813772, - "grad_norm": 0.671875, - "learning_rate": 0.0018751544313674491, - "loss": 1.4656, - "step": 480000 - }, - { - "epoch": 3.1243904024969114, - "grad_norm": 0.90625, - "learning_rate": 0.0018750243839001236, - "loss": 1.464, - "step": 480500 - }, - { - "epoch": 3.1276415891800506, - "grad_norm": 0.88671875, - "learning_rate": 0.0018748943364327978, - "loss": 1.4671, - "step": 481000 - }, - { - "epoch": 3.13089277586319, - "grad_norm": 1.71875, - "learning_rate": 0.0018747642889654723, - "loss": 1.4645, - "step": 481500 - }, - { - "epoch": 3.134143962546329, - "grad_norm": 1.8359375, - "learning_rate": 0.0018746342414981468, - "loss": 1.4666, - "step": 482000 - }, - { - "epoch": 3.137395149229469, - "grad_norm": 0.74609375, - "learning_rate": 0.0018745041940308215, - "loss": 1.4619, - "step": 482500 - }, - { - "epoch": 3.1406463359126082, - "grad_norm": 0.765625, - "learning_rate": 0.0018743741465634958, - "loss": 1.4618, - "step": 483000 - }, - { - "epoch": 3.1438975225957475, - "grad_norm": 0.81640625, - "learning_rate": 0.0018742440990961702, - "loss": 1.4534, - "step": 483500 - }, - { - "epoch": 3.147148709278887, - "grad_norm": 0.671875, - "learning_rate": 0.0018741140516288447, - "loss": 1.4633, - "step": 484000 - }, - { - "epoch": 3.150399895962026, - "grad_norm": 0.59375, - "learning_rate": 0.001873984004161519, - "loss": 1.4596, - "step": 484500 - }, - { - "epoch": 3.1536510826451654, - "grad_norm": 1.9453125, - "learning_rate": 0.0018738539566941935, - "loss": 1.4652, - "step": 485000 - }, - { - "epoch": 3.1569022693283046, - "grad_norm": 0.98828125, - "learning_rate": 0.001873723909226868, - "loss": 1.4645, - "step": 485500 - }, - { - "epoch": 3.1601534560114444, - "grad_norm": 0.67578125, - "learning_rate": 0.0018735938617595422, - "loss": 1.4604, - "step": 486000 - }, - { - "epoch": 3.1634046426945837, - "grad_norm": 0.66796875, - "learning_rate": 0.0018734638142922167, - "loss": 1.4634, - "step": 486500 - }, - { - "epoch": 3.166655829377723, - "grad_norm": 0.83984375, - "learning_rate": 0.0018733337668248912, - "loss": 1.4572, - "step": 487000 - }, - { - "epoch": 3.1699070160608622, - "grad_norm": 1.1875, - "learning_rate": 0.0018732037193575654, - "loss": 1.4651, - "step": 487500 - }, - { - "epoch": 3.1731582027440015, - "grad_norm": 1.6328125, - "learning_rate": 0.00187307367189024, - "loss": 1.465, - "step": 488000 - }, - { - "epoch": 3.176409389427141, - "grad_norm": 0.84765625, - "learning_rate": 0.0018729436244229144, - "loss": 1.4643, - "step": 488500 - }, - { - "epoch": 3.17966057611028, - "grad_norm": 2.234375, - "learning_rate": 0.0018728135769555886, - "loss": 1.4622, - "step": 489000 - }, - { - 
"epoch": 3.1829117627934194, - "grad_norm": 1.015625, - "learning_rate": 0.0018726835294882631, - "loss": 1.4618, - "step": 489500 - }, - { - "epoch": 3.186162949476559, - "grad_norm": 4.8125, - "learning_rate": 0.0018725534820209378, - "loss": 1.4592, - "step": 490000 - }, - { - "epoch": 3.1894141361596984, - "grad_norm": 0.71875, - "learning_rate": 0.0018724234345536123, - "loss": 1.4562, - "step": 490500 - }, - { - "epoch": 3.1926653228428377, - "grad_norm": 0.671875, - "learning_rate": 0.0018722933870862866, - "loss": 1.4643, - "step": 491000 - }, - { - "epoch": 3.195916509525977, - "grad_norm": 0.6015625, - "learning_rate": 0.001872163339618961, - "loss": 1.4631, - "step": 491500 - }, - { - "epoch": 3.1991676962091162, - "grad_norm": 0.68359375, - "learning_rate": 0.0018720332921516355, - "loss": 1.4638, - "step": 492000 - }, - { - "epoch": 3.2024188828922555, - "grad_norm": 0.71484375, - "learning_rate": 0.0018719032446843098, - "loss": 1.4637, - "step": 492500 - }, - { - "epoch": 3.205670069575395, - "grad_norm": 0.6875, - "learning_rate": 0.0018717731972169842, - "loss": 1.4646, - "step": 493000 - }, - { - "epoch": 3.2089212562585345, - "grad_norm": 0.64453125, - "learning_rate": 0.0018716431497496587, - "loss": 1.4647, - "step": 493500 - }, - { - "epoch": 3.212172442941674, - "grad_norm": 4.71875, - "learning_rate": 0.001871513102282333, - "loss": 1.4617, - "step": 494000 - }, - { - "epoch": 3.215423629624813, - "grad_norm": 0.6953125, - "learning_rate": 0.0018713830548150075, - "loss": 1.4578, - "step": 494500 - }, - { - "epoch": 3.2186748163079524, - "grad_norm": 0.87890625, - "learning_rate": 0.001871253007347682, - "loss": 1.4573, - "step": 495000 - }, - { - "epoch": 3.2219260029910917, - "grad_norm": 0.61328125, - "learning_rate": 0.0018711229598803562, - "loss": 1.4608, - "step": 495500 - }, - { - "epoch": 3.225177189674231, - "grad_norm": 0.84765625, - "learning_rate": 0.0018709929124130307, - "loss": 1.4641, - "step": 496000 - }, - { - "epoch": 3.2284283763573702, - "grad_norm": 1.34375, - "learning_rate": 0.0018708628649457052, - "loss": 1.4638, - "step": 496500 - }, - { - "epoch": 3.23167956304051, - "grad_norm": 0.765625, - "learning_rate": 0.0018707328174783799, - "loss": 1.4615, - "step": 497000 - }, - { - "epoch": 3.2349307497236492, - "grad_norm": 1.21875, - "learning_rate": 0.0018706027700110541, - "loss": 1.4576, - "step": 497500 - }, - { - "epoch": 3.2381819364067885, - "grad_norm": 1.203125, - "learning_rate": 0.0018704727225437286, - "loss": 1.4655, - "step": 498000 - }, - { - "epoch": 3.241433123089928, - "grad_norm": 0.64453125, - "learning_rate": 0.001870342675076403, - "loss": 1.4625, - "step": 498500 - }, - { - "epoch": 3.244684309773067, - "grad_norm": 0.71484375, - "learning_rate": 0.0018702126276090773, - "loss": 1.4666, - "step": 499000 - }, - { - "epoch": 3.2479354964562064, - "grad_norm": 1.046875, - "learning_rate": 0.0018700825801417518, - "loss": 1.4697, - "step": 499500 - }, - { - "epoch": 3.2511866831393457, - "grad_norm": 0.7109375, - "learning_rate": 0.0018699525326744263, - "loss": 1.4659, - "step": 500000 - }, - { - "epoch": 3.2544378698224854, - "grad_norm": 1.0546875, - "learning_rate": 0.0018698224852071006, - "loss": 1.4683, - "step": 500500 - }, - { - "epoch": 3.2576890565056247, - "grad_norm": 0.60546875, - "learning_rate": 0.001869692437739775, - "loss": 1.4658, - "step": 501000 - }, - { - "epoch": 3.260940243188764, - "grad_norm": 0.859375, - "learning_rate": 0.0018695623902724495, - "loss": 1.4642, - "step": 501500 - }, - { - 
"epoch": 3.2641914298719032, - "grad_norm": 0.86328125, - "learning_rate": 0.0018694323428051238, - "loss": 1.4566, - "step": 502000 - }, - { - "epoch": 3.2674426165550425, - "grad_norm": 1.15625, - "learning_rate": 0.0018693022953377983, - "loss": 1.4618, - "step": 502500 - }, - { - "epoch": 3.270693803238182, - "grad_norm": 0.6484375, - "learning_rate": 0.0018691722478704727, - "loss": 1.4692, - "step": 503000 - }, - { - "epoch": 3.273944989921321, - "grad_norm": 0.7734375, - "learning_rate": 0.001869042200403147, - "loss": 1.4591, - "step": 503500 - }, - { - "epoch": 3.277196176604461, - "grad_norm": 0.58984375, - "learning_rate": 0.0018689121529358215, - "loss": 1.4591, - "step": 504000 - }, - { - "epoch": 3.2804473632876, - "grad_norm": 0.796875, - "learning_rate": 0.0018687821054684962, - "loss": 1.4616, - "step": 504500 - }, - { - "epoch": 3.2836985499707394, - "grad_norm": 1.296875, - "learning_rate": 0.0018686520580011706, - "loss": 1.4581, - "step": 505000 - }, - { - "epoch": 3.2869497366538787, - "grad_norm": 0.87890625, - "learning_rate": 0.001868522010533845, - "loss": 1.4645, - "step": 505500 - }, - { - "epoch": 3.290200923337018, - "grad_norm": 0.6875, - "learning_rate": 0.0018683919630665194, - "loss": 1.468, - "step": 506000 - }, - { - "epoch": 3.2934521100201573, - "grad_norm": 1.2734375, - "learning_rate": 0.0018682619155991939, - "loss": 1.4623, - "step": 506500 - }, - { - "epoch": 3.2967032967032965, - "grad_norm": 0.6015625, - "learning_rate": 0.0018681318681318681, - "loss": 1.4576, - "step": 507000 - }, - { - "epoch": 3.2999544833864363, - "grad_norm": 0.796875, - "learning_rate": 0.0018680018206645426, - "loss": 1.4624, - "step": 507500 - }, - { - "epoch": 3.3032056700695756, - "grad_norm": 0.94921875, - "learning_rate": 0.001867871773197217, - "loss": 1.4646, - "step": 508000 - }, - { - "epoch": 3.306456856752715, - "grad_norm": 1.0234375, - "learning_rate": 0.0018677417257298913, - "loss": 1.4571, - "step": 508500 - }, - { - "epoch": 3.309708043435854, - "grad_norm": 0.734375, - "learning_rate": 0.0018676116782625658, - "loss": 1.4602, - "step": 509000 - }, - { - "epoch": 3.3129592301189934, - "grad_norm": 0.67578125, - "learning_rate": 0.0018674816307952403, - "loss": 1.4644, - "step": 509500 - }, - { - "epoch": 3.3162104168021327, - "grad_norm": 0.69140625, - "learning_rate": 0.0018673515833279146, - "loss": 1.4602, - "step": 510000 - }, - { - "epoch": 3.319461603485272, - "grad_norm": 0.78515625, - "learning_rate": 0.001867221535860589, - "loss": 1.4605, - "step": 510500 - }, - { - "epoch": 3.3227127901684117, - "grad_norm": 8.0625, - "learning_rate": 0.0018670914883932635, - "loss": 1.4608, - "step": 511000 - }, - { - "epoch": 3.325963976851551, - "grad_norm": 0.80859375, - "learning_rate": 0.0018669614409259382, - "loss": 1.4653, - "step": 511500 - }, - { - "epoch": 3.3292151635346903, - "grad_norm": 1.2578125, - "learning_rate": 0.0018668313934586125, - "loss": 1.4617, - "step": 512000 - }, - { - "epoch": 3.3324663502178296, - "grad_norm": 0.85546875, - "learning_rate": 0.001866701345991287, - "loss": 1.4624, - "step": 512500 - }, - { - "epoch": 3.335717536900969, - "grad_norm": 0.69921875, - "learning_rate": 0.0018665712985239614, - "loss": 1.4737, - "step": 513000 - }, - { - "epoch": 3.338968723584108, - "grad_norm": 0.609375, - "learning_rate": 0.0018664412510566357, - "loss": 1.4689, - "step": 513500 - }, - { - "epoch": 3.3422199102672474, - "grad_norm": 0.8828125, - "learning_rate": 0.0018663112035893102, - "loss": 1.4679, - "step": 514000 - }, - { - 
"epoch": 3.345471096950387, - "grad_norm": 0.8984375, - "learning_rate": 0.0018661811561219847, - "loss": 1.4666, - "step": 514500 - }, - { - "epoch": 3.3487222836335264, - "grad_norm": 0.70703125, - "learning_rate": 0.001866051108654659, - "loss": 1.4707, - "step": 515000 - }, - { - "epoch": 3.3519734703166657, - "grad_norm": 0.80859375, - "learning_rate": 0.0018659210611873334, - "loss": 1.4658, - "step": 515500 - }, - { - "epoch": 3.355224656999805, - "grad_norm": 1.2265625, - "learning_rate": 0.0018657910137200079, - "loss": 1.4621, - "step": 516000 - }, - { - "epoch": 3.3584758436829443, - "grad_norm": 1.71875, - "learning_rate": 0.0018656609662526821, - "loss": 1.4627, - "step": 516500 - }, - { - "epoch": 3.3617270303660836, - "grad_norm": 2.28125, - "learning_rate": 0.0018655309187853566, - "loss": 1.466, - "step": 517000 - }, - { - "epoch": 3.364978217049223, - "grad_norm": 2.1875, - "learning_rate": 0.001865400871318031, - "loss": 1.4665, - "step": 517500 - }, - { - "epoch": 3.368229403732362, - "grad_norm": 0.8125, - "learning_rate": 0.0018652708238507054, - "loss": 1.4685, - "step": 518000 - }, - { - "epoch": 3.3714805904155014, - "grad_norm": 0.94140625, - "learning_rate": 0.0018651407763833798, - "loss": 1.4629, - "step": 518500 - }, - { - "epoch": 3.374731777098641, - "grad_norm": 0.73828125, - "learning_rate": 0.0018650107289160545, - "loss": 1.4591, - "step": 519000 - }, - { - "epoch": 3.3779829637817804, - "grad_norm": 0.79296875, - "learning_rate": 0.001864880681448729, - "loss": 1.4592, - "step": 519500 - }, - { - "epoch": 3.3812341504649197, - "grad_norm": 0.77734375, - "learning_rate": 0.0018647506339814033, - "loss": 1.4595, - "step": 520000 - }, - { - "epoch": 3.384485337148059, - "grad_norm": 1.25, - "learning_rate": 0.0018646205865140777, - "loss": 1.454, - "step": 520500 - }, - { - "epoch": 3.3877365238311983, - "grad_norm": 0.796875, - "learning_rate": 0.0018644905390467522, - "loss": 1.4567, - "step": 521000 - }, - { - "epoch": 3.3909877105143376, - "grad_norm": 0.8046875, - "learning_rate": 0.0018643604915794265, - "loss": 1.4625, - "step": 521500 - }, - { - "epoch": 3.394238897197477, - "grad_norm": 0.91015625, - "learning_rate": 0.001864230444112101, - "loss": 1.4636, - "step": 522000 - }, - { - "epoch": 3.3974900838806166, - "grad_norm": 1.1875, - "learning_rate": 0.0018641003966447754, - "loss": 1.4584, - "step": 522500 - }, - { - "epoch": 3.400741270563756, - "grad_norm": 12.1875, - "learning_rate": 0.0018639703491774497, - "loss": 1.4628, - "step": 523000 - }, - { - "epoch": 3.403992457246895, - "grad_norm": 0.69140625, - "learning_rate": 0.0018638403017101242, - "loss": 1.4558, - "step": 523500 - }, - { - "epoch": 3.4072436439300344, - "grad_norm": 0.79296875, - "learning_rate": 0.0018637102542427987, - "loss": 1.4548, - "step": 524000 - }, - { - "epoch": 3.4104948306131737, - "grad_norm": 0.92578125, - "learning_rate": 0.001863580206775473, - "loss": 1.4562, - "step": 524500 - }, - { - "epoch": 3.413746017296313, - "grad_norm": 1.640625, - "learning_rate": 0.0018634501593081474, - "loss": 1.4587, - "step": 525000 - }, - { - "epoch": 3.4169972039794523, - "grad_norm": 0.65625, - "learning_rate": 0.0018633201118408219, - "loss": 1.4576, - "step": 525500 - }, - { - "epoch": 3.420248390662592, - "grad_norm": 1.4140625, - "learning_rate": 0.0018631900643734966, - "loss": 1.4567, - "step": 526000 - }, - { - "epoch": 3.4234995773457313, - "grad_norm": 1.1953125, - "learning_rate": 0.0018630600169061708, - "loss": 1.4547, - "step": 526500 - }, - { - "epoch": 
3.4267507640288706, - "grad_norm": 0.84375, - "learning_rate": 0.0018629299694388453, - "loss": 1.4625, - "step": 527000 - }, - { - "epoch": 3.43000195071201, - "grad_norm": 0.76953125, - "learning_rate": 0.0018627999219715198, - "loss": 1.4562, - "step": 527500 - }, - { - "epoch": 3.433253137395149, - "grad_norm": 0.7109375, - "learning_rate": 0.001862669874504194, - "loss": 1.4553, - "step": 528000 - }, - { - "epoch": 3.4365043240782884, - "grad_norm": 0.74609375, - "learning_rate": 0.0018625398270368685, - "loss": 1.4575, - "step": 528500 - }, - { - "epoch": 3.4397555107614277, - "grad_norm": 0.66796875, - "learning_rate": 0.001862409779569543, - "loss": 1.4557, - "step": 529000 - }, - { - "epoch": 3.4430066974445674, - "grad_norm": 7.375, - "learning_rate": 0.0018622797321022173, - "loss": 1.4513, - "step": 529500 - }, - { - "epoch": 3.4462578841277067, - "grad_norm": 0.67578125, - "learning_rate": 0.0018621496846348918, - "loss": 1.4547, - "step": 530000 - }, - { - "epoch": 3.449509070810846, - "grad_norm": 0.734375, - "learning_rate": 0.0018620196371675662, - "loss": 1.4489, - "step": 530500 - }, - { - "epoch": 3.4527602574939853, - "grad_norm": 0.703125, - "learning_rate": 0.0018618895897002405, - "loss": 1.4588, - "step": 531000 - }, - { - "epoch": 3.4560114441771246, - "grad_norm": 1.6484375, - "learning_rate": 0.001861759542232915, - "loss": 1.4543, - "step": 531500 - }, - { - "epoch": 3.459262630860264, - "grad_norm": 0.984375, - "learning_rate": 0.0018616294947655895, - "loss": 1.4538, - "step": 532000 - }, - { - "epoch": 3.462513817543403, - "grad_norm": 1.8515625, - "learning_rate": 0.0018614994472982637, - "loss": 1.4574, - "step": 532500 - }, - { - "epoch": 3.465765004226543, - "grad_norm": 0.84765625, - "learning_rate": 0.0018613693998309382, - "loss": 1.4542, - "step": 533000 - }, - { - "epoch": 3.469016190909682, - "grad_norm": 0.8515625, - "learning_rate": 0.0018612393523636129, - "loss": 1.4478, - "step": 533500 - }, - { - "epoch": 3.4722673775928214, - "grad_norm": 0.80078125, - "learning_rate": 0.0018611093048962874, - "loss": 1.4491, - "step": 534000 - }, - { - "epoch": 3.4755185642759607, - "grad_norm": 0.66796875, - "learning_rate": 0.0018609792574289616, - "loss": 1.4509, - "step": 534500 - }, - { - "epoch": 3.4787697509591, - "grad_norm": 1.328125, - "learning_rate": 0.001860849209961636, - "loss": 1.4515, - "step": 535000 - }, - { - "epoch": 3.4820209376422393, - "grad_norm": 1.4296875, - "learning_rate": 0.0018607191624943106, - "loss": 1.4496, - "step": 535500 - }, - { - "epoch": 3.4852721243253786, - "grad_norm": 0.625, - "learning_rate": 0.0018605891150269848, - "loss": 1.4531, - "step": 536000 - }, - { - "epoch": 3.4885233110085183, - "grad_norm": 0.66015625, - "learning_rate": 0.0018604590675596593, - "loss": 1.4456, - "step": 536500 - }, - { - "epoch": 3.4917744976916576, - "grad_norm": 0.9609375, - "learning_rate": 0.0018603290200923338, - "loss": 1.445, - "step": 537000 - }, - { - "epoch": 3.495025684374797, - "grad_norm": 0.61328125, - "learning_rate": 0.001860198972625008, - "loss": 1.4524, - "step": 537500 - }, - { - "epoch": 3.498276871057936, - "grad_norm": 0.68359375, - "learning_rate": 0.0018600689251576825, - "loss": 1.4515, - "step": 538000 - }, - { - "epoch": 3.5015280577410755, - "grad_norm": 0.703125, - "learning_rate": 0.001859938877690357, - "loss": 1.452, - "step": 538500 - }, - { - "epoch": 3.5047792444242147, - "grad_norm": 0.984375, - "learning_rate": 0.0018598088302230313, - "loss": 1.4509, - "step": 539000 - }, - { - "epoch": 
3.508030431107354, - "grad_norm": 0.7890625, - "learning_rate": 0.0018596787827557058, - "loss": 1.4488, - "step": 539500 - }, - { - "epoch": 3.5112816177904937, - "grad_norm": 0.81640625, - "learning_rate": 0.0018595487352883802, - "loss": 1.4503, - "step": 540000 - }, - { - "epoch": 3.514532804473633, - "grad_norm": 1.0703125, - "learning_rate": 0.001859418687821055, - "loss": 1.4458, - "step": 540500 - }, - { - "epoch": 3.5177839911567723, - "grad_norm": 3.40625, - "learning_rate": 0.0018592886403537292, - "loss": 1.448, - "step": 541000 - }, - { - "epoch": 3.5210351778399116, - "grad_norm": 0.953125, - "learning_rate": 0.0018591585928864037, - "loss": 1.4465, - "step": 541500 - }, - { - "epoch": 3.524286364523051, - "grad_norm": 0.95703125, - "learning_rate": 0.0018590285454190782, - "loss": 1.4497, - "step": 542000 - }, - { - "epoch": 3.52753755120619, - "grad_norm": 0.9609375, - "learning_rate": 0.0018588984979517524, - "loss": 1.4566, - "step": 542500 - }, - { - "epoch": 3.5307887378893295, - "grad_norm": 0.94921875, - "learning_rate": 0.001858768450484427, - "loss": 1.446, - "step": 543000 - }, - { - "epoch": 3.534039924572469, - "grad_norm": 0.8046875, - "learning_rate": 0.0018586384030171014, - "loss": 1.4494, - "step": 543500 - }, - { - "epoch": 3.537291111255608, - "grad_norm": 1.546875, - "learning_rate": 0.0018585083555497756, - "loss": 1.4507, - "step": 544000 - }, - { - "epoch": 3.5405422979387478, - "grad_norm": 0.84765625, - "learning_rate": 0.0018583783080824501, - "loss": 1.4519, - "step": 544500 - }, - { - "epoch": 3.543793484621887, - "grad_norm": 0.9375, - "learning_rate": 0.0018582482606151246, - "loss": 1.4519, - "step": 545000 - }, - { - "epoch": 3.5470446713050263, - "grad_norm": 0.78125, - "learning_rate": 0.0018581182131477989, - "loss": 1.4518, - "step": 545500 - }, - { - "epoch": 3.5502958579881656, - "grad_norm": 1.1171875, - "learning_rate": 0.0018579881656804733, - "loss": 1.4468, - "step": 546000 - }, - { - "epoch": 3.553547044671305, - "grad_norm": 1.4296875, - "learning_rate": 0.0018578581182131478, - "loss": 1.4491, - "step": 546500 - }, - { - "epoch": 3.5567982313544446, - "grad_norm": 0.6953125, - "learning_rate": 0.001857728070745822, - "loss": 1.4475, - "step": 547000 - }, - { - "epoch": 3.5600494180375835, - "grad_norm": 0.85546875, - "learning_rate": 0.0018575980232784966, - "loss": 1.4443, - "step": 547500 - }, - { - "epoch": 3.563300604720723, - "grad_norm": 0.59765625, - "learning_rate": 0.0018574679758111712, - "loss": 1.4528, - "step": 548000 - }, - { - "epoch": 3.5665517914038625, - "grad_norm": 0.671875, - "learning_rate": 0.0018573379283438457, - "loss": 1.4504, - "step": 548500 - }, - { - "epoch": 3.5698029780870018, - "grad_norm": 0.8046875, - "learning_rate": 0.00185720788087652, - "loss": 1.4515, - "step": 549000 - }, - { - "epoch": 3.573054164770141, - "grad_norm": 0.91015625, - "learning_rate": 0.0018570778334091945, - "loss": 1.4514, - "step": 549500 - }, - { - "epoch": 3.5763053514532803, - "grad_norm": 0.78125, - "learning_rate": 0.001856947785941869, - "loss": 1.4486, - "step": 550000 - }, - { - "epoch": 3.57955653813642, - "grad_norm": 0.62109375, - "learning_rate": 0.0018568177384745432, - "loss": 1.4558, - "step": 550500 - }, - { - "epoch": 3.582807724819559, - "grad_norm": 0.828125, - "learning_rate": 0.0018566876910072177, - "loss": 1.4568, - "step": 551000 - }, - { - "epoch": 3.5860589115026986, - "grad_norm": 0.75, - "learning_rate": 0.0018565576435398922, - "loss": 1.4611, - "step": 551500 - }, - { - "epoch": 
3.589310098185838, - "grad_norm": 0.8515625, - "learning_rate": 0.0018564275960725664, - "loss": 1.4546, - "step": 552000 - }, - { - "epoch": 3.592561284868977, - "grad_norm": 0.75390625, - "learning_rate": 0.001856297548605241, - "loss": 1.4468, - "step": 552500 - }, - { - "epoch": 3.5958124715521165, - "grad_norm": 0.6953125, - "learning_rate": 0.0018561675011379154, - "loss": 1.4537, - "step": 553000 - }, - { - "epoch": 3.5990636582352558, - "grad_norm": 2.21875, - "learning_rate": 0.0018560374536705896, - "loss": 1.4525, - "step": 553500 - }, - { - "epoch": 3.6023148449183955, - "grad_norm": 0.72265625, - "learning_rate": 0.0018559074062032641, - "loss": 1.4554, - "step": 554000 - }, - { - "epoch": 3.6055660316015343, - "grad_norm": 0.81640625, - "learning_rate": 0.0018557773587359386, - "loss": 1.4624, - "step": 554500 - }, - { - "epoch": 3.608817218284674, - "grad_norm": 0.83984375, - "learning_rate": 0.0018556473112686133, - "loss": 1.45, - "step": 555000 - }, - { - "epoch": 3.6120684049678133, - "grad_norm": 0.91015625, - "learning_rate": 0.0018555172638012876, - "loss": 1.4379, - "step": 555500 - }, - { - "epoch": 3.6153195916509526, - "grad_norm": 0.734375, - "learning_rate": 0.001855387216333962, - "loss": 1.4456, - "step": 556000 - }, - { - "epoch": 3.618570778334092, - "grad_norm": 1.546875, - "learning_rate": 0.0018552571688666365, - "loss": 1.4477, - "step": 556500 - }, - { - "epoch": 3.621821965017231, - "grad_norm": 1.0390625, - "learning_rate": 0.0018551271213993108, - "loss": 1.4479, - "step": 557000 - }, - { - "epoch": 3.6250731517003705, - "grad_norm": 0.69921875, - "learning_rate": 0.0018549970739319853, - "loss": 1.4461, - "step": 557500 - }, - { - "epoch": 3.6283243383835098, - "grad_norm": 0.578125, - "learning_rate": 0.0018548670264646597, - "loss": 1.4451, - "step": 558000 - }, - { - "epoch": 3.6315755250666495, - "grad_norm": 0.8359375, - "learning_rate": 0.001854736978997334, - "loss": 1.4457, - "step": 558500 - }, - { - "epoch": 3.6348267117497888, - "grad_norm": 0.984375, - "learning_rate": 0.0018546069315300085, - "loss": 1.4474, - "step": 559000 - }, - { - "epoch": 3.638077898432928, - "grad_norm": 0.875, - "learning_rate": 0.001854476884062683, - "loss": 1.4448, - "step": 559500 - }, - { - "epoch": 3.6413290851160673, - "grad_norm": 0.7421875, - "learning_rate": 0.0018543468365953572, - "loss": 1.4462, - "step": 560000 - }, - { - "epoch": 3.6445802717992066, - "grad_norm": 1.0390625, - "learning_rate": 0.0018542167891280317, - "loss": 1.446, - "step": 560500 - }, - { - "epoch": 3.647831458482346, - "grad_norm": 1.3359375, - "learning_rate": 0.0018540867416607062, - "loss": 1.4452, - "step": 561000 - }, - { - "epoch": 3.651082645165485, - "grad_norm": 0.609375, - "learning_rate": 0.0018539566941933804, - "loss": 1.4469, - "step": 561500 - }, - { - "epoch": 3.654333831848625, - "grad_norm": 0.74609375, - "learning_rate": 0.001853826646726055, - "loss": 1.4522, - "step": 562000 - }, - { - "epoch": 3.657585018531764, - "grad_norm": 1.1484375, - "learning_rate": 0.0018536965992587296, - "loss": 1.4482, - "step": 562500 - }, - { - "epoch": 3.6608362052149035, - "grad_norm": 0.765625, - "learning_rate": 0.001853566551791404, - "loss": 1.4472, - "step": 563000 - }, - { - "epoch": 3.664087391898043, - "grad_norm": 0.64453125, - "learning_rate": 0.0018534365043240783, - "loss": 1.443, - "step": 563500 - }, - { - "epoch": 3.667338578581182, - "grad_norm": 0.84375, - "learning_rate": 0.0018533064568567528, - "loss": 1.4432, - "step": 564000 - }, - { - "epoch": 
3.6705897652643213, - "grad_norm": 0.5703125, - "learning_rate": 0.0018531764093894273, - "loss": 1.4502, - "step": 564500 - }, - { - "epoch": 3.6738409519474606, - "grad_norm": 0.9765625, - "learning_rate": 0.0018530463619221016, - "loss": 1.4471, - "step": 565000 - }, - { - "epoch": 3.6770921386306004, - "grad_norm": 0.76953125, - "learning_rate": 0.001852916314454776, - "loss": 1.4464, - "step": 565500 - }, - { - "epoch": 3.6803433253137396, - "grad_norm": 0.96484375, - "learning_rate": 0.0018527862669874505, - "loss": 1.4476, - "step": 566000 - }, - { - "epoch": 3.683594511996879, - "grad_norm": 0.546875, - "learning_rate": 0.0018526562195201248, - "loss": 1.4494, - "step": 566500 - }, - { - "epoch": 3.686845698680018, - "grad_norm": 0.734375, - "learning_rate": 0.0018525261720527993, - "loss": 1.4483, - "step": 567000 - }, - { - "epoch": 3.6900968853631575, - "grad_norm": 0.65625, - "learning_rate": 0.0018523961245854737, - "loss": 1.4481, - "step": 567500 - }, - { - "epoch": 3.693348072046297, - "grad_norm": 0.828125, - "learning_rate": 0.001852266077118148, - "loss": 1.4484, - "step": 568000 - }, - { - "epoch": 3.696599258729436, - "grad_norm": 1.015625, - "learning_rate": 0.0018521360296508225, - "loss": 1.4434, - "step": 568500 - }, - { - "epoch": 3.699850445412576, - "grad_norm": 0.6796875, - "learning_rate": 0.001852005982183497, - "loss": 1.4446, - "step": 569000 - }, - { - "epoch": 3.703101632095715, - "grad_norm": 1.0703125, - "learning_rate": 0.0018518759347161717, - "loss": 1.443, - "step": 569500 - }, - { - "epoch": 3.7063528187788544, - "grad_norm": 0.53515625, - "learning_rate": 0.001851745887248846, - "loss": 1.4453, - "step": 570000 - }, - { - "epoch": 3.7096040054619936, - "grad_norm": 1.21875, - "learning_rate": 0.0018516158397815204, - "loss": 1.4477, - "step": 570500 - }, - { - "epoch": 3.712855192145133, - "grad_norm": 1.4140625, - "learning_rate": 0.0018514857923141949, - "loss": 1.441, - "step": 571000 - }, - { - "epoch": 3.716106378828272, - "grad_norm": 0.7578125, - "learning_rate": 0.0018513557448468691, - "loss": 1.4444, - "step": 571500 - }, - { - "epoch": 3.7193575655114115, - "grad_norm": 1.4140625, - "learning_rate": 0.0018512256973795436, - "loss": 1.4424, - "step": 572000 - }, - { - "epoch": 3.7226087521945512, - "grad_norm": 0.83203125, - "learning_rate": 0.001851095649912218, - "loss": 1.4451, - "step": 572500 - }, - { - "epoch": 3.7258599388776905, - "grad_norm": 0.71484375, - "learning_rate": 0.0018509656024448924, - "loss": 1.4453, - "step": 573000 - }, - { - "epoch": 3.72911112556083, - "grad_norm": 0.6875, - "learning_rate": 0.0018508355549775668, - "loss": 1.446, - "step": 573500 - }, - { - "epoch": 3.732362312243969, - "grad_norm": 0.8828125, - "learning_rate": 0.0018507055075102413, - "loss": 1.4498, - "step": 574000 - }, - { - "epoch": 3.7356134989271084, - "grad_norm": 0.703125, - "learning_rate": 0.0018505754600429156, - "loss": 1.4429, - "step": 574500 - }, - { - "epoch": 3.7388646856102477, - "grad_norm": 0.7578125, - "learning_rate": 0.00185044541257559, - "loss": 1.4459, - "step": 575000 - }, - { - "epoch": 3.742115872293387, - "grad_norm": 0.57421875, - "learning_rate": 0.0018503153651082645, - "loss": 1.4468, - "step": 575500 - }, - { - "epoch": 3.7453670589765267, - "grad_norm": 0.73046875, - "learning_rate": 0.0018501853176409388, - "loss": 1.4479, - "step": 576000 - }, - { - "epoch": 3.7486182456596655, - "grad_norm": 0.84375, - "learning_rate": 0.0018500552701736133, - "loss": 1.4447, - "step": 576500 - }, - { - "epoch": 
3.7518694323428052, - "grad_norm": 7.25, - "learning_rate": 0.001849925222706288, - "loss": 1.447, - "step": 577000 - }, - { - "epoch": 3.7551206190259445, - "grad_norm": 1.265625, - "learning_rate": 0.0018497951752389624, - "loss": 1.443, - "step": 577500 - }, - { - "epoch": 3.758371805709084, - "grad_norm": 0.828125, - "learning_rate": 0.0018496651277716367, - "loss": 1.4386, - "step": 578000 - }, - { - "epoch": 3.761622992392223, - "grad_norm": 3.828125, - "learning_rate": 0.0018495350803043112, - "loss": 1.4434, - "step": 578500 - }, - { - "epoch": 3.7648741790753624, - "grad_norm": 0.86328125, - "learning_rate": 0.0018494050328369857, - "loss": 1.4513, - "step": 579000 - }, - { - "epoch": 3.768125365758502, - "grad_norm": 0.609375, - "learning_rate": 0.00184927498536966, - "loss": 1.444, - "step": 579500 - }, - { - "epoch": 3.771376552441641, - "grad_norm": 0.7265625, - "learning_rate": 0.0018491449379023344, - "loss": 1.444, - "step": 580000 - }, - { - "epoch": 3.7746277391247807, - "grad_norm": 1.0703125, - "learning_rate": 0.0018490148904350089, - "loss": 1.449, - "step": 580500 - }, - { - "epoch": 3.77787892580792, - "grad_norm": 0.69921875, - "learning_rate": 0.0018488848429676831, - "loss": 1.447, - "step": 581000 - }, - { - "epoch": 3.7811301124910592, - "grad_norm": 0.91015625, - "learning_rate": 0.0018487547955003576, - "loss": 1.4415, - "step": 581500 - }, - { - "epoch": 3.7843812991741985, - "grad_norm": 0.75390625, - "learning_rate": 0.001848624748033032, - "loss": 1.443, - "step": 582000 - }, - { - "epoch": 3.787632485857338, - "grad_norm": 0.69140625, - "learning_rate": 0.0018484947005657064, - "loss": 1.4442, - "step": 582500 - }, - { - "epoch": 3.7908836725404775, - "grad_norm": 0.8046875, - "learning_rate": 0.0018483646530983808, - "loss": 1.4434, - "step": 583000 - }, - { - "epoch": 3.7941348592236164, - "grad_norm": 0.9140625, - "learning_rate": 0.0018482346056310553, - "loss": 1.4487, - "step": 583500 - }, - { - "epoch": 3.797386045906756, - "grad_norm": 0.703125, - "learning_rate": 0.00184810455816373, - "loss": 1.4402, - "step": 584000 - }, - { - "epoch": 3.8006372325898954, - "grad_norm": 0.5546875, - "learning_rate": 0.0018479745106964043, - "loss": 1.4463, - "step": 584500 - }, - { - "epoch": 3.8038884192730347, - "grad_norm": 0.53125, - "learning_rate": 0.0018478444632290788, - "loss": 1.4456, - "step": 585000 - }, - { - "epoch": 3.807139605956174, - "grad_norm": 0.66796875, - "learning_rate": 0.0018477144157617532, - "loss": 1.4485, - "step": 585500 - }, - { - "epoch": 3.8103907926393132, - "grad_norm": 1.6796875, - "learning_rate": 0.0018475843682944275, - "loss": 1.4514, - "step": 586000 - }, - { - "epoch": 3.813641979322453, - "grad_norm": 0.84765625, - "learning_rate": 0.001847454320827102, - "loss": 1.4494, - "step": 586500 - }, - { - "epoch": 3.816893166005592, - "grad_norm": 0.703125, - "learning_rate": 0.0018473242733597765, - "loss": 1.4423, - "step": 587000 - }, - { - "epoch": 3.8201443526887315, - "grad_norm": 0.9765625, - "learning_rate": 0.0018471942258924507, - "loss": 1.454, - "step": 587500 - }, - { - "epoch": 3.823395539371871, - "grad_norm": 0.78515625, - "learning_rate": 0.0018470641784251252, - "loss": 1.4479, - "step": 588000 - }, - { - "epoch": 3.82664672605501, - "grad_norm": 0.79296875, - "learning_rate": 0.0018469341309577997, - "loss": 1.449, - "step": 588500 - }, - { - "epoch": 3.8298979127381494, - "grad_norm": 0.75390625, - "learning_rate": 0.001846804083490474, - "loss": 1.4458, - "step": 589000 - }, - { - "epoch": 
3.8331490994212887, - "grad_norm": 0.65234375, - "learning_rate": 0.0018466740360231484, - "loss": 1.4449, - "step": 589500 - }, - { - "epoch": 3.836400286104428, - "grad_norm": 0.71875, - "learning_rate": 0.0018465439885558229, - "loss": 1.4478, - "step": 590000 - }, - { - "epoch": 3.8396514727875672, - "grad_norm": 0.75390625, - "learning_rate": 0.0018464139410884972, - "loss": 1.4433, - "step": 590500 - }, - { - "epoch": 3.842902659470707, - "grad_norm": 0.62890625, - "learning_rate": 0.0018462838936211716, - "loss": 1.442, - "step": 591000 - }, - { - "epoch": 3.8461538461538463, - "grad_norm": 7.4375, - "learning_rate": 0.0018461538461538463, - "loss": 1.4469, - "step": 591500 - }, - { - "epoch": 3.8494050328369855, - "grad_norm": 0.64453125, - "learning_rate": 0.0018460237986865208, - "loss": 1.4432, - "step": 592000 - }, - { - "epoch": 3.852656219520125, - "grad_norm": 0.70703125, - "learning_rate": 0.001845893751219195, - "loss": 1.4445, - "step": 592500 - }, - { - "epoch": 3.855907406203264, - "grad_norm": 0.84375, - "learning_rate": 0.0018457637037518695, - "loss": 1.4427, - "step": 593000 - }, - { - "epoch": 3.8591585928864034, - "grad_norm": 0.796875, - "learning_rate": 0.001845633656284544, - "loss": 1.4461, - "step": 593500 - }, - { - "epoch": 3.8624097795695427, - "grad_norm": 0.859375, - "learning_rate": 0.0018455036088172183, - "loss": 1.4441, - "step": 594000 - }, - { - "epoch": 3.8656609662526824, - "grad_norm": 0.66796875, - "learning_rate": 0.0018453735613498928, - "loss": 1.4397, - "step": 594500 - }, - { - "epoch": 3.8689121529358217, - "grad_norm": 0.5859375, - "learning_rate": 0.0018452435138825672, - "loss": 1.4462, - "step": 595000 - }, - { - "epoch": 3.872163339618961, - "grad_norm": 0.6328125, - "learning_rate": 0.0018451134664152415, - "loss": 1.4442, - "step": 595500 - }, - { - "epoch": 3.8754145263021003, - "grad_norm": 0.75390625, - "learning_rate": 0.001844983418947916, - "loss": 1.4517, - "step": 596000 - }, - { - "epoch": 3.8786657129852395, - "grad_norm": 1.359375, - "learning_rate": 0.0018448533714805905, - "loss": 1.4499, - "step": 596500 - }, - { - "epoch": 3.881916899668379, - "grad_norm": 0.8671875, - "learning_rate": 0.0018447233240132647, - "loss": 1.4482, - "step": 597000 - }, - { - "epoch": 3.885168086351518, - "grad_norm": 1.4375, - "learning_rate": 0.0018445932765459392, - "loss": 1.4522, - "step": 597500 - }, - { - "epoch": 3.888419273034658, - "grad_norm": 0.90625, - "learning_rate": 0.0018444632290786137, - "loss": 1.4533, - "step": 598000 - }, - { - "epoch": 3.891670459717797, - "grad_norm": 1.75, - "learning_rate": 0.0018443331816112884, - "loss": 1.4641, - "step": 598500 - }, - { - "epoch": 3.8949216464009364, - "grad_norm": 0.62890625, - "learning_rate": 0.0018442031341439626, - "loss": 1.4639, - "step": 599000 - }, - { - "epoch": 3.8981728330840757, - "grad_norm": 0.71875, - "learning_rate": 0.0018440730866766371, - "loss": 1.4723, - "step": 599500 - }, - { - "epoch": 3.901424019767215, - "grad_norm": 0.5703125, - "learning_rate": 0.0018439430392093116, - "loss": 1.47, - "step": 600000 - }, - { - "epoch": 3.9046752064503543, - "grad_norm": 0.734375, - "learning_rate": 0.0018438129917419859, - "loss": 1.4575, - "step": 600500 - }, - { - "epoch": 3.9079263931334935, - "grad_norm": 1.3046875, - "learning_rate": 0.0018436829442746603, - "loss": 1.4491, - "step": 601000 - }, - { - "epoch": 3.9111775798166333, - "grad_norm": 0.60546875, - "learning_rate": 0.0018435528968073348, - "loss": 1.4543, - "step": 601500 - }, - { - "epoch": 
3.9144287664997726, - "grad_norm": 1.09375, - "learning_rate": 0.001843422849340009, - "loss": 1.4517, - "step": 602000 - }, - { - "epoch": 3.917679953182912, - "grad_norm": 0.65625, - "learning_rate": 0.0018432928018726836, - "loss": 1.4506, - "step": 602500 - }, - { - "epoch": 3.920931139866051, - "grad_norm": 0.625, - "learning_rate": 0.001843162754405358, - "loss": 1.4549, - "step": 603000 - }, - { - "epoch": 3.9241823265491904, - "grad_norm": 1.7578125, - "learning_rate": 0.0018430327069380323, - "loss": 1.4574, - "step": 603500 - }, - { - "epoch": 3.9274335132323297, - "grad_norm": 1.671875, - "learning_rate": 0.0018429026594707068, - "loss": 1.4556, - "step": 604000 - }, - { - "epoch": 3.930684699915469, - "grad_norm": 1.0234375, - "learning_rate": 0.0018427726120033812, - "loss": 1.4504, - "step": 604500 - }, - { - "epoch": 3.9339358865986087, - "grad_norm": 0.65625, - "learning_rate": 0.0018426425645360555, - "loss": 1.4489, - "step": 605000 - }, - { - "epoch": 3.937187073281748, - "grad_norm": 1.453125, - "learning_rate": 0.00184251251706873, - "loss": 1.4473, - "step": 605500 - }, - { - "epoch": 3.9404382599648873, - "grad_norm": 1.0234375, - "learning_rate": 0.0018423824696014047, - "loss": 1.4407, - "step": 606000 - }, - { - "epoch": 3.9436894466480266, - "grad_norm": 0.765625, - "learning_rate": 0.0018422524221340792, - "loss": 1.4481, - "step": 606500 - }, - { - "epoch": 3.946940633331166, - "grad_norm": 0.66796875, - "learning_rate": 0.0018421223746667534, - "loss": 1.4457, - "step": 607000 - }, - { - "epoch": 3.950191820014305, - "grad_norm": 1.0, - "learning_rate": 0.001841992327199428, - "loss": 1.4491, - "step": 607500 - }, - { - "epoch": 3.9534430066974444, - "grad_norm": 0.82421875, - "learning_rate": 0.0018418622797321024, - "loss": 1.443, - "step": 608000 - }, - { - "epoch": 3.956694193380584, - "grad_norm": 1.0390625, - "learning_rate": 0.0018417322322647766, - "loss": 1.4424, - "step": 608500 - }, - { - "epoch": 3.959945380063723, - "grad_norm": 0.6640625, - "learning_rate": 0.0018416021847974511, - "loss": 1.4371, - "step": 609000 - }, - { - "epoch": 3.9631965667468627, - "grad_norm": 1.015625, - "learning_rate": 0.0018414721373301256, - "loss": 1.439, - "step": 609500 - }, - { - "epoch": 3.966447753430002, - "grad_norm": 0.87109375, - "learning_rate": 0.0018413420898627999, - "loss": 1.4452, - "step": 610000 - }, - { - "epoch": 3.9696989401131413, - "grad_norm": 0.73046875, - "learning_rate": 0.0018412120423954743, - "loss": 1.4444, - "step": 610500 - }, - { - "epoch": 3.9729501267962806, - "grad_norm": 0.75390625, - "learning_rate": 0.0018410819949281488, - "loss": 1.4489, - "step": 611000 - }, - { - "epoch": 3.97620131347942, - "grad_norm": 0.67578125, - "learning_rate": 0.001840951947460823, - "loss": 1.4499, - "step": 611500 - }, - { - "epoch": 3.9794525001625596, - "grad_norm": 0.6953125, - "learning_rate": 0.0018408218999934976, - "loss": 1.4398, - "step": 612000 - }, - { - "epoch": 3.9827036868456984, - "grad_norm": 1.140625, - "learning_rate": 0.001840691852526172, - "loss": 1.4407, - "step": 612500 - }, - { - "epoch": 3.985954873528838, - "grad_norm": 0.64453125, - "learning_rate": 0.0018405618050588467, - "loss": 1.4432, - "step": 613000 - }, - { - "epoch": 3.9892060602119774, - "grad_norm": 0.73828125, - "learning_rate": 0.001840431757591521, - "loss": 1.4482, - "step": 613500 - }, - { - "epoch": 3.9924572468951167, - "grad_norm": 1.390625, - "learning_rate": 0.0018403017101241955, - "loss": 1.4452, - "step": 614000 - }, - { - "epoch": 
3.995708433578256, - "grad_norm": 0.60546875, - "learning_rate": 0.00184017166265687, - "loss": 1.4436, - "step": 614500 - }, - { - "epoch": 3.9989596202613953, - "grad_norm": 0.8828125, - "learning_rate": 0.0018400416151895442, - "loss": 1.4495, - "step": 615000 - }, - { - "epoch": 4.0, - "eval_loss": 1.4305795431137085, - "eval_runtime": 0.5339, - "eval_samples_per_second": 1873.085, - "eval_steps_per_second": 29.969, - "step": 615160 - }, - { - "epoch": 4.002210806944535, - "grad_norm": 0.83984375, - "learning_rate": 0.0018399115677222187, - "loss": 1.4472, - "step": 615500 - }, - { - "epoch": 4.005461993627674, - "grad_norm": 2.375, - "learning_rate": 0.0018397815202548932, - "loss": 1.4458, - "step": 616000 - }, - { - "epoch": 4.008713180310814, - "grad_norm": 0.90625, - "learning_rate": 0.0018396514727875674, - "loss": 1.4438, - "step": 616500 - }, - { - "epoch": 4.011964366993952, - "grad_norm": 0.76171875, - "learning_rate": 0.001839521425320242, - "loss": 1.4449, - "step": 617000 - }, - { - "epoch": 4.015215553677092, - "grad_norm": 0.77734375, - "learning_rate": 0.0018393913778529164, - "loss": 1.4383, - "step": 617500 - }, - { - "epoch": 4.018466740360232, - "grad_norm": 0.7421875, - "learning_rate": 0.0018392613303855907, - "loss": 1.4475, - "step": 618000 - }, - { - "epoch": 4.021717927043371, - "grad_norm": 0.79296875, - "learning_rate": 0.0018391312829182651, - "loss": 1.4417, - "step": 618500 - }, - { - "epoch": 4.0249691137265105, - "grad_norm": 1.0078125, - "learning_rate": 0.0018390012354509396, - "loss": 1.4449, - "step": 619000 - }, - { - "epoch": 4.028220300409649, - "grad_norm": 2.34375, - "learning_rate": 0.0018388711879836139, - "loss": 1.4429, - "step": 619500 - }, - { - "epoch": 4.031471487092789, - "grad_norm": 0.8125, - "learning_rate": 0.0018387411405162883, - "loss": 1.4423, - "step": 620000 - }, - { - "epoch": 4.034722673775928, - "grad_norm": 0.60546875, - "learning_rate": 0.001838611093048963, - "loss": 1.4394, - "step": 620500 - }, - { - "epoch": 4.037973860459068, - "grad_norm": 3.515625, - "learning_rate": 0.0018384810455816375, - "loss": 1.4429, - "step": 621000 - }, - { - "epoch": 4.041225047142207, - "grad_norm": 0.6484375, - "learning_rate": 0.0018383509981143118, - "loss": 1.4435, - "step": 621500 - }, - { - "epoch": 4.044476233825346, - "grad_norm": 0.57421875, - "learning_rate": 0.0018382209506469863, - "loss": 1.444, - "step": 622000 - }, - { - "epoch": 4.047727420508486, - "grad_norm": 0.7265625, - "learning_rate": 0.0018380909031796607, - "loss": 1.4436, - "step": 622500 - }, - { - "epoch": 4.050978607191625, - "grad_norm": 0.6953125, - "learning_rate": 0.001837960855712335, - "loss": 1.4432, - "step": 623000 - }, - { - "epoch": 4.0542297938747645, - "grad_norm": 1.6171875, - "learning_rate": 0.0018378308082450095, - "loss": 1.4503, - "step": 623500 - }, - { - "epoch": 4.057480980557903, - "grad_norm": 4.625, - "learning_rate": 0.001837700760777684, - "loss": 1.4511, - "step": 624000 - }, - { - "epoch": 4.060732167241043, - "grad_norm": 0.76171875, - "learning_rate": 0.0018375707133103582, - "loss": 1.444, - "step": 624500 - }, - { - "epoch": 4.063983353924183, - "grad_norm": 0.89453125, - "learning_rate": 0.0018374406658430327, - "loss": 1.4435, - "step": 625000 - }, - { - "epoch": 4.067234540607322, - "grad_norm": 0.65234375, - "learning_rate": 0.0018373106183757072, - "loss": 1.4421, - "step": 625500 - }, - { - "epoch": 4.070485727290461, - "grad_norm": 0.72265625, - "learning_rate": 0.0018371805709083814, - "loss": 1.4455, - "step": 626000 
- }, - { - "epoch": 4.0737369139736, - "grad_norm": 1.25, - "learning_rate": 0.001837050523441056, - "loss": 1.4422, - "step": 626500 - }, - { - "epoch": 4.07698810065674, - "grad_norm": 1.1953125, - "learning_rate": 0.0018369204759737304, - "loss": 1.4459, - "step": 627000 - }, - { - "epoch": 4.080239287339879, - "grad_norm": 0.81640625, - "learning_rate": 0.001836790428506405, - "loss": 1.4457, - "step": 627500 - }, - { - "epoch": 4.0834904740230185, - "grad_norm": 0.8515625, - "learning_rate": 0.0018366603810390794, - "loss": 1.4458, - "step": 628000 - }, - { - "epoch": 4.086741660706158, - "grad_norm": 1.2890625, - "learning_rate": 0.0018365303335717538, - "loss": 1.4424, - "step": 628500 - }, - { - "epoch": 4.089992847389297, - "grad_norm": 0.67578125, - "learning_rate": 0.0018364002861044283, - "loss": 1.4486, - "step": 629000 - }, - { - "epoch": 4.093244034072437, - "grad_norm": 1.828125, - "learning_rate": 0.0018362702386371026, - "loss": 1.4482, - "step": 629500 - }, - { - "epoch": 4.096495220755576, - "grad_norm": 1.0078125, - "learning_rate": 0.001836140191169777, - "loss": 1.4395, - "step": 630000 - }, - { - "epoch": 4.099746407438715, - "grad_norm": 1.1953125, - "learning_rate": 0.0018360101437024515, - "loss": 1.4456, - "step": 630500 - }, - { - "epoch": 4.102997594121854, - "grad_norm": 0.69140625, - "learning_rate": 0.0018358800962351258, - "loss": 1.4444, - "step": 631000 - }, - { - "epoch": 4.106248780804994, - "grad_norm": 0.671875, - "learning_rate": 0.0018357500487678003, - "loss": 1.443, - "step": 631500 - }, - { - "epoch": 4.109499967488133, - "grad_norm": 0.72265625, - "learning_rate": 0.0018356200013004747, - "loss": 1.4445, - "step": 632000 - }, - { - "epoch": 4.1127511541712725, - "grad_norm": 1.3671875, - "learning_rate": 0.001835489953833149, - "loss": 1.4599, - "step": 632500 - }, - { - "epoch": 4.116002340854412, - "grad_norm": 2.1875, - "learning_rate": 0.0018353599063658235, - "loss": 1.448, - "step": 633000 - }, - { - "epoch": 4.119253527537551, - "grad_norm": 0.62890625, - "learning_rate": 0.001835229858898498, - "loss": 1.4514, - "step": 633500 - }, - { - "epoch": 4.122504714220691, - "grad_norm": 0.69140625, - "learning_rate": 0.0018350998114311722, - "loss": 1.4508, - "step": 634000 - }, - { - "epoch": 4.12575590090383, - "grad_norm": 9.625, - "learning_rate": 0.0018349697639638467, - "loss": 1.4499, - "step": 634500 - }, - { - "epoch": 4.129007087586969, - "grad_norm": 1.0, - "learning_rate": 0.0018348397164965214, - "loss": 1.4394, - "step": 635000 - }, - { - "epoch": 4.132258274270108, - "grad_norm": 1.28125, - "learning_rate": 0.0018347096690291959, - "loss": 1.4405, - "step": 635500 - }, - { - "epoch": 4.135509460953248, - "grad_norm": 0.6796875, - "learning_rate": 0.0018345796215618701, - "loss": 1.4426, - "step": 636000 - }, - { - "epoch": 4.138760647636388, - "grad_norm": 0.85546875, - "learning_rate": 0.0018344495740945446, - "loss": 1.4407, - "step": 636500 - }, - { - "epoch": 4.1420118343195265, - "grad_norm": 0.859375, - "learning_rate": 0.001834319526627219, - "loss": 1.4451, - "step": 637000 - }, - { - "epoch": 4.145263021002666, - "grad_norm": 0.92578125, - "learning_rate": 0.0018341894791598934, - "loss": 1.4492, - "step": 637500 - }, - { - "epoch": 4.148514207685805, - "grad_norm": 0.6875, - "learning_rate": 0.0018340594316925678, - "loss": 1.4399, - "step": 638000 - }, - { - "epoch": 4.151765394368945, - "grad_norm": 0.91796875, - "learning_rate": 0.0018339293842252423, - "loss": 1.4504, - "step": 638500 - }, - { - "epoch": 
4.155016581052084, - "grad_norm": 1.5390625, - "learning_rate": 0.0018337993367579166, - "loss": 1.4557, - "step": 639000 - }, - { - "epoch": 4.158267767735223, - "grad_norm": 0.8671875, - "learning_rate": 0.001833669289290591, - "loss": 1.4523, - "step": 639500 - }, - { - "epoch": 4.161518954418363, - "grad_norm": 0.65625, - "learning_rate": 0.0018335392418232655, - "loss": 1.4516, - "step": 640000 - }, - { - "epoch": 4.164770141101502, - "grad_norm": 2.984375, - "learning_rate": 0.0018334091943559398, - "loss": 1.4507, - "step": 640500 - }, - { - "epoch": 4.168021327784642, - "grad_norm": 0.66015625, - "learning_rate": 0.0018332791468886143, - "loss": 1.4546, - "step": 641000 - }, - { - "epoch": 4.1712725144677805, - "grad_norm": 0.82421875, - "learning_rate": 0.0018331490994212888, - "loss": 1.4501, - "step": 641500 - }, - { - "epoch": 4.17452370115092, - "grad_norm": 0.6796875, - "learning_rate": 0.001833019051953963, - "loss": 1.4429, - "step": 642000 - }, - { - "epoch": 4.177774887834059, - "grad_norm": 0.80078125, - "learning_rate": 0.0018328890044866377, - "loss": 1.4459, - "step": 642500 - }, - { - "epoch": 4.181026074517199, - "grad_norm": 0.7890625, - "learning_rate": 0.0018327589570193122, - "loss": 1.4485, - "step": 643000 - }, - { - "epoch": 4.1842772612003385, - "grad_norm": 1.1484375, - "learning_rate": 0.0018326289095519867, - "loss": 1.4511, - "step": 643500 - }, - { - "epoch": 4.187528447883477, - "grad_norm": 1.8828125, - "learning_rate": 0.001832498862084661, - "loss": 1.4479, - "step": 644000 - }, - { - "epoch": 4.190779634566617, - "grad_norm": 0.6953125, - "learning_rate": 0.0018323688146173354, - "loss": 1.451, - "step": 644500 - }, - { - "epoch": 4.194030821249756, - "grad_norm": 0.59765625, - "learning_rate": 0.0018322387671500099, - "loss": 1.4497, - "step": 645000 - }, - { - "epoch": 4.197282007932896, - "grad_norm": 0.73046875, - "learning_rate": 0.0018321087196826842, - "loss": 1.4446, - "step": 645500 - }, - { - "epoch": 4.2005331946160345, - "grad_norm": 0.6640625, - "learning_rate": 0.0018319786722153586, - "loss": 1.4413, - "step": 646000 - }, - { - "epoch": 4.203784381299174, - "grad_norm": 0.65625, - "learning_rate": 0.001831848624748033, - "loss": 1.445, - "step": 646500 - }, - { - "epoch": 4.207035567982314, - "grad_norm": 0.82421875, - "learning_rate": 0.0018317185772807074, - "loss": 1.4409, - "step": 647000 - }, - { - "epoch": 4.210286754665453, - "grad_norm": 0.609375, - "learning_rate": 0.0018315885298133818, - "loss": 1.4424, - "step": 647500 - }, - { - "epoch": 4.2135379413485925, - "grad_norm": 0.95703125, - "learning_rate": 0.0018314584823460563, - "loss": 1.4467, - "step": 648000 - }, - { - "epoch": 4.216789128031731, - "grad_norm": 0.70703125, - "learning_rate": 0.0018313284348787306, - "loss": 1.4442, - "step": 648500 - }, - { - "epoch": 4.220040314714871, - "grad_norm": 1.2734375, - "learning_rate": 0.001831198387411405, - "loss": 1.4404, - "step": 649000 - }, - { - "epoch": 4.22329150139801, - "grad_norm": 0.87890625, - "learning_rate": 0.0018310683399440798, - "loss": 1.441, - "step": 649500 - }, - { - "epoch": 4.22654268808115, - "grad_norm": 0.71484375, - "learning_rate": 0.0018309382924767542, - "loss": 1.442, - "step": 650000 - }, - { - "epoch": 4.229793874764289, - "grad_norm": 3.046875, - "learning_rate": 0.0018308082450094285, - "loss": 1.4432, - "step": 650500 - }, - { - "epoch": 4.233045061447428, - "grad_norm": 2.25, - "learning_rate": 0.001830678197542103, - "loss": 1.4433, - "step": 651000 - }, - { - "epoch": 
4.236296248130568, - "grad_norm": 0.7109375, - "learning_rate": 0.0018305481500747775, - "loss": 1.4452, - "step": 651500 - }, - { - "epoch": 4.239547434813707, - "grad_norm": 0.64453125, - "learning_rate": 0.0018304181026074517, - "loss": 1.4406, - "step": 652000 - }, - { - "epoch": 4.2427986214968465, - "grad_norm": 0.62890625, - "learning_rate": 0.0018302880551401262, - "loss": 1.4419, - "step": 652500 - }, - { - "epoch": 4.246049808179985, - "grad_norm": 0.828125, - "learning_rate": 0.0018301580076728007, - "loss": 1.4422, - "step": 653000 - }, - { - "epoch": 4.249300994863125, - "grad_norm": 3.53125, - "learning_rate": 0.001830027960205475, - "loss": 1.4436, - "step": 653500 - }, - { - "epoch": 4.252552181546265, - "grad_norm": 0.859375, - "learning_rate": 0.0018298979127381494, - "loss": 1.4412, - "step": 654000 - }, - { - "epoch": 4.255803368229404, - "grad_norm": 0.8515625, - "learning_rate": 0.001829767865270824, - "loss": 1.4406, - "step": 654500 - }, - { - "epoch": 4.259054554912543, - "grad_norm": 0.78125, - "learning_rate": 0.0018296378178034982, - "loss": 1.4354, - "step": 655000 - }, - { - "epoch": 4.262305741595682, - "grad_norm": 0.66015625, - "learning_rate": 0.0018295077703361726, - "loss": 1.4452, - "step": 655500 - }, - { - "epoch": 4.265556928278822, - "grad_norm": 0.7265625, - "learning_rate": 0.0018293777228688471, - "loss": 1.4366, - "step": 656000 - }, - { - "epoch": 4.268808114961961, - "grad_norm": 0.6875, - "learning_rate": 0.0018292476754015214, - "loss": 1.4403, - "step": 656500 - }, - { - "epoch": 4.2720593016451005, - "grad_norm": 2.421875, - "learning_rate": 0.001829117627934196, - "loss": 1.4437, - "step": 657000 - }, - { - "epoch": 4.275310488328239, - "grad_norm": 0.8515625, - "learning_rate": 0.0018289875804668706, - "loss": 1.4443, - "step": 657500 - }, - { - "epoch": 4.278561675011379, - "grad_norm": 0.69140625, - "learning_rate": 0.001828857532999545, - "loss": 1.4399, - "step": 658000 - }, - { - "epoch": 4.281812861694519, - "grad_norm": 0.72265625, - "learning_rate": 0.0018287274855322193, - "loss": 1.4402, - "step": 658500 - }, - { - "epoch": 4.285064048377658, - "grad_norm": 0.91015625, - "learning_rate": 0.0018285974380648938, - "loss": 1.4419, - "step": 659000 - }, - { - "epoch": 4.288315235060797, - "grad_norm": 0.63671875, - "learning_rate": 0.0018284673905975682, - "loss": 1.441, - "step": 659500 - }, - { - "epoch": 4.291566421743936, - "grad_norm": 0.6796875, - "learning_rate": 0.0018283373431302425, - "loss": 1.4357, - "step": 660000 - }, - { - "epoch": 4.294817608427076, - "grad_norm": 2.453125, - "learning_rate": 0.001828207295662917, - "loss": 1.4341, - "step": 660500 - }, - { - "epoch": 4.298068795110215, - "grad_norm": 2.015625, - "learning_rate": 0.0018280772481955915, - "loss": 1.4512, - "step": 661000 - }, - { - "epoch": 4.3013199817933545, - "grad_norm": 0.63671875, - "learning_rate": 0.0018279472007282657, - "loss": 1.4393, - "step": 661500 - }, - { - "epoch": 4.304571168476494, - "grad_norm": 0.59765625, - "learning_rate": 0.0018278171532609402, - "loss": 1.4444, - "step": 662000 - }, - { - "epoch": 4.307822355159633, - "grad_norm": 0.65625, - "learning_rate": 0.0018276871057936147, - "loss": 1.4476, - "step": 662500 - }, - { - "epoch": 4.311073541842773, - "grad_norm": 0.8515625, - "learning_rate": 0.001827557058326289, - "loss": 1.4516, - "step": 663000 - }, - { - "epoch": 4.314324728525912, - "grad_norm": 0.8046875, - "learning_rate": 0.0018274270108589634, - "loss": 1.4479, - "step": 663500 - }, - { - "epoch": 
4.317575915209051, - "grad_norm": 0.7734375, - "learning_rate": 0.0018272969633916381, - "loss": 1.4526, - "step": 664000 - }, - { - "epoch": 4.32082710189219, - "grad_norm": 0.89453125, - "learning_rate": 0.0018271669159243126, - "loss": 1.4503, - "step": 664500 - }, - { - "epoch": 4.32407828857533, - "grad_norm": 1.0234375, - "learning_rate": 0.0018270368684569869, - "loss": 1.4487, - "step": 665000 - }, - { - "epoch": 4.32732947525847, - "grad_norm": 0.734375, - "learning_rate": 0.0018269068209896613, - "loss": 1.4459, - "step": 665500 - }, - { - "epoch": 4.3305806619416085, - "grad_norm": 0.64453125, - "learning_rate": 0.0018267767735223358, - "loss": 1.4458, - "step": 666000 - }, - { - "epoch": 4.333831848624748, - "grad_norm": 0.80859375, - "learning_rate": 0.00182664672605501, - "loss": 1.4371, - "step": 666500 - }, - { - "epoch": 4.337083035307887, - "grad_norm": 1.625, - "learning_rate": 0.0018265166785876846, - "loss": 1.447, - "step": 667000 - }, - { - "epoch": 4.340334221991027, - "grad_norm": 0.9140625, - "learning_rate": 0.001826386631120359, - "loss": 1.4428, - "step": 667500 - }, - { - "epoch": 4.343585408674166, - "grad_norm": 0.68359375, - "learning_rate": 0.0018262565836530333, - "loss": 1.4406, - "step": 668000 - }, - { - "epoch": 4.346836595357305, - "grad_norm": 1.2734375, - "learning_rate": 0.0018261265361857078, - "loss": 1.4477, - "step": 668500 - }, - { - "epoch": 4.350087782040445, - "grad_norm": 1.5390625, - "learning_rate": 0.0018259964887183823, - "loss": 1.445, - "step": 669000 - }, - { - "epoch": 4.353338968723584, - "grad_norm": 0.62890625, - "learning_rate": 0.0018258664412510565, - "loss": 1.442, - "step": 669500 - }, - { - "epoch": 4.356590155406724, - "grad_norm": 0.62890625, - "learning_rate": 0.001825736393783731, - "loss": 1.449, - "step": 670000 - }, - { - "epoch": 4.3598413420898625, - "grad_norm": 0.7109375, - "learning_rate": 0.0018256063463164055, - "loss": 1.4543, - "step": 670500 - }, - { - "epoch": 4.363092528773002, - "grad_norm": 0.8046875, - "learning_rate": 0.0018254762988490797, - "loss": 1.4485, - "step": 671000 - }, - { - "epoch": 4.366343715456141, - "grad_norm": 2.265625, - "learning_rate": 0.0018253462513817544, - "loss": 1.4508, - "step": 671500 - }, - { - "epoch": 4.369594902139281, - "grad_norm": 0.7109375, - "learning_rate": 0.001825216203914429, - "loss": 1.4501, - "step": 672000 - }, - { - "epoch": 4.3728460888224205, - "grad_norm": 0.94921875, - "learning_rate": 0.0018250861564471034, - "loss": 1.4442, - "step": 672500 - }, - { - "epoch": 4.376097275505559, - "grad_norm": 0.98828125, - "learning_rate": 0.0018249561089797777, - "loss": 1.4462, - "step": 673000 - }, - { - "epoch": 4.379348462188699, - "grad_norm": 3.125, - "learning_rate": 0.0018248260615124521, - "loss": 1.4463, - "step": 673500 - }, - { - "epoch": 4.382599648871838, - "grad_norm": 1.1953125, - "learning_rate": 0.0018246960140451266, - "loss": 1.4506, - "step": 674000 - }, - { - "epoch": 4.385850835554978, - "grad_norm": 0.63671875, - "learning_rate": 0.0018245659665778009, - "loss": 1.4447, - "step": 674500 - }, - { - "epoch": 4.3891020222381165, - "grad_norm": 0.60546875, - "learning_rate": 0.0018244359191104753, - "loss": 1.4543, - "step": 675000 - }, - { - "epoch": 4.392353208921256, - "grad_norm": 0.77734375, - "learning_rate": 0.0018243058716431498, - "loss": 1.4488, - "step": 675500 - }, - { - "epoch": 4.395604395604396, - "grad_norm": 3.1875, - "learning_rate": 0.001824175824175824, - "loss": 1.4386, - "step": 676000 - }, - { - "epoch": 
4.398855582287535, - "grad_norm": 0.8671875, - "learning_rate": 0.0018240457767084986, - "loss": 1.4409, - "step": 676500 - }, - { - "epoch": 4.4021067689706745, - "grad_norm": 0.65625, - "learning_rate": 0.001823915729241173, - "loss": 1.4445, - "step": 677000 - }, - { - "epoch": 4.405357955653813, - "grad_norm": 0.6875, - "learning_rate": 0.0018237856817738473, - "loss": 1.4439, - "step": 677500 - }, - { - "epoch": 4.408609142336953, - "grad_norm": 0.6328125, - "learning_rate": 0.0018236556343065218, - "loss": 1.4467, - "step": 678000 - }, - { - "epoch": 4.411860329020092, - "grad_norm": 0.6953125, - "learning_rate": 0.0018235255868391965, - "loss": 1.4435, - "step": 678500 - }, - { - "epoch": 4.415111515703232, - "grad_norm": 0.8359375, - "learning_rate": 0.001823395539371871, - "loss": 1.4387, - "step": 679000 - }, - { - "epoch": 4.418362702386371, - "grad_norm": 0.87109375, - "learning_rate": 0.0018232654919045452, - "loss": 1.4442, - "step": 679500 - }, - { - "epoch": 4.42161388906951, - "grad_norm": 0.78125, - "learning_rate": 0.0018231354444372197, - "loss": 1.4385, - "step": 680000 - }, - { - "epoch": 4.42486507575265, - "grad_norm": 2.40625, - "learning_rate": 0.0018230053969698942, - "loss": 1.4436, - "step": 680500 - }, - { - "epoch": 4.428116262435789, - "grad_norm": 0.75390625, - "learning_rate": 0.0018228753495025684, - "loss": 1.4407, - "step": 681000 - }, - { - "epoch": 4.4313674491189285, - "grad_norm": 0.6171875, - "learning_rate": 0.001822745302035243, - "loss": 1.4402, - "step": 681500 - }, - { - "epoch": 4.434618635802067, - "grad_norm": 0.71875, - "learning_rate": 0.0018226152545679174, - "loss": 1.4414, - "step": 682000 - }, - { - "epoch": 4.437869822485207, - "grad_norm": 0.890625, - "learning_rate": 0.0018224852071005917, - "loss": 1.4379, - "step": 682500 - }, - { - "epoch": 4.441121009168347, - "grad_norm": 0.84375, - "learning_rate": 0.0018223551596332661, - "loss": 1.4403, - "step": 683000 - }, - { - "epoch": 4.444372195851486, - "grad_norm": 0.7109375, - "learning_rate": 0.0018222251121659406, - "loss": 1.4432, - "step": 683500 - }, - { - "epoch": 4.447623382534625, - "grad_norm": 0.8203125, - "learning_rate": 0.0018220950646986149, - "loss": 1.4378, - "step": 684000 - }, - { - "epoch": 4.450874569217764, - "grad_norm": 0.7578125, - "learning_rate": 0.0018219650172312894, - "loss": 1.439, - "step": 684500 - }, - { - "epoch": 4.454125755900904, - "grad_norm": 0.69921875, - "learning_rate": 0.0018218349697639638, - "loss": 1.435, - "step": 685000 - }, - { - "epoch": 4.457376942584043, - "grad_norm": 0.80078125, - "learning_rate": 0.001821704922296638, - "loss": 1.4391, - "step": 685500 - }, - { - "epoch": 4.4606281292671826, - "grad_norm": 0.7265625, - "learning_rate": 0.0018215748748293128, - "loss": 1.4392, - "step": 686000 - }, - { - "epoch": 4.463879315950322, - "grad_norm": 0.734375, - "learning_rate": 0.0018214448273619873, - "loss": 1.4382, - "step": 686500 - }, - { - "epoch": 4.467130502633461, - "grad_norm": 1.203125, - "learning_rate": 0.0018213147798946617, - "loss": 1.4411, - "step": 687000 - }, - { - "epoch": 4.470381689316601, - "grad_norm": 0.6796875, - "learning_rate": 0.001821184732427336, - "loss": 1.4439, - "step": 687500 - }, - { - "epoch": 4.47363287599974, - "grad_norm": 0.796875, - "learning_rate": 0.0018210546849600105, - "loss": 1.4478, - "step": 688000 - }, - { - "epoch": 4.476884062682879, - "grad_norm": 0.6640625, - "learning_rate": 0.001820924637492685, - "loss": 1.4437, - "step": 688500 - }, - { - "epoch": 4.480135249366018, - 
"grad_norm": 0.703125, - "learning_rate": 0.0018207945900253592, - "loss": 1.4461, - "step": 689000 - }, - { - "epoch": 4.483386436049158, - "grad_norm": 0.67578125, - "learning_rate": 0.0018206645425580337, - "loss": 1.4388, - "step": 689500 - }, - { - "epoch": 4.486637622732298, - "grad_norm": 1.5078125, - "learning_rate": 0.0018205344950907082, - "loss": 1.4433, - "step": 690000 - }, - { - "epoch": 4.4898888094154366, - "grad_norm": 0.72265625, - "learning_rate": 0.0018204044476233824, - "loss": 1.4442, - "step": 690500 - }, - { - "epoch": 4.493139996098576, - "grad_norm": 0.734375, - "learning_rate": 0.001820274400156057, - "loss": 1.4312, - "step": 691000 - }, - { - "epoch": 4.496391182781715, - "grad_norm": 0.59375, - "learning_rate": 0.0018201443526887314, - "loss": 1.4469, - "step": 691500 - }, - { - "epoch": 4.499642369464855, - "grad_norm": 0.67578125, - "learning_rate": 0.0018200143052214057, - "loss": 1.4389, - "step": 692000 - }, - { - "epoch": 4.502893556147994, - "grad_norm": 0.75, - "learning_rate": 0.0018198842577540801, - "loss": 1.4402, - "step": 692500 - }, - { - "epoch": 4.506144742831133, - "grad_norm": 1.3984375, - "learning_rate": 0.0018197542102867548, - "loss": 1.4366, - "step": 693000 - }, - { - "epoch": 4.509395929514273, - "grad_norm": 0.796875, - "learning_rate": 0.0018196241628194293, - "loss": 1.4418, - "step": 693500 - }, - { - "epoch": 4.512647116197412, - "grad_norm": 0.71875, - "learning_rate": 0.0018194941153521036, - "loss": 1.439, - "step": 694000 - }, - { - "epoch": 4.515898302880552, - "grad_norm": 0.59375, - "learning_rate": 0.001819364067884778, - "loss": 1.4341, - "step": 694500 - }, - { - "epoch": 4.519149489563691, - "grad_norm": 0.6875, - "learning_rate": 0.0018192340204174525, - "loss": 1.4412, - "step": 695000 - }, - { - "epoch": 4.52240067624683, - "grad_norm": 0.65234375, - "learning_rate": 0.0018191039729501268, - "loss": 1.4458, - "step": 695500 - }, - { - "epoch": 4.525651862929969, - "grad_norm": 0.765625, - "learning_rate": 0.0018189739254828013, - "loss": 1.4464, - "step": 696000 - }, - { - "epoch": 4.528903049613109, - "grad_norm": 0.84375, - "learning_rate": 0.0018188438780154758, - "loss": 1.4463, - "step": 696500 - }, - { - "epoch": 4.532154236296249, - "grad_norm": 0.88671875, - "learning_rate": 0.00181871383054815, - "loss": 1.4492, - "step": 697000 - }, - { - "epoch": 4.535405422979387, - "grad_norm": 0.6796875, - "learning_rate": 0.0018185837830808245, - "loss": 1.4423, - "step": 697500 - }, - { - "epoch": 4.538656609662527, - "grad_norm": 0.96484375, - "learning_rate": 0.001818453735613499, - "loss": 1.4375, - "step": 698000 - }, - { - "epoch": 4.541907796345666, - "grad_norm": 4.5, - "learning_rate": 0.0018183236881461732, - "loss": 1.445, - "step": 698500 - }, - { - "epoch": 4.545158983028806, - "grad_norm": 0.87890625, - "learning_rate": 0.0018181936406788477, - "loss": 1.4405, - "step": 699000 - }, - { - "epoch": 4.548410169711945, - "grad_norm": 0.6796875, - "learning_rate": 0.0018180635932115222, - "loss": 1.4406, - "step": 699500 - }, - { - "epoch": 4.551661356395084, - "grad_norm": 0.734375, - "learning_rate": 0.0018179335457441965, - "loss": 1.442, - "step": 700000 - }, - { - "epoch": 4.554912543078224, - "grad_norm": 0.7421875, - "learning_rate": 0.0018178034982768711, - "loss": 1.4395, - "step": 700500 - }, - { - "epoch": 4.558163729761363, - "grad_norm": 0.71875, - "learning_rate": 0.0018176734508095456, - "loss": 1.4396, - "step": 701000 - }, - { - "epoch": 4.561414916444503, - "grad_norm": 1.3984375, - 
"learning_rate": 0.00181754340334222, - "loss": 1.4441, - "step": 701500 - }, - { - "epoch": 4.564666103127641, - "grad_norm": 2.03125, - "learning_rate": 0.0018174133558748944, - "loss": 1.4435, - "step": 702000 - }, - { - "epoch": 4.567917289810781, - "grad_norm": 0.796875, - "learning_rate": 0.0018172833084075688, - "loss": 1.4422, - "step": 702500 - }, - { - "epoch": 4.57116847649392, - "grad_norm": 0.77734375, - "learning_rate": 0.0018171532609402433, - "loss": 1.4483, - "step": 703000 - }, - { - "epoch": 4.57441966317706, - "grad_norm": 0.77734375, - "learning_rate": 0.0018170232134729176, - "loss": 1.4467, - "step": 703500 - }, - { - "epoch": 4.5776708498601995, - "grad_norm": 0.66796875, - "learning_rate": 0.001816893166005592, - "loss": 1.4474, - "step": 704000 - }, - { - "epoch": 4.580922036543338, - "grad_norm": 0.6171875, - "learning_rate": 0.0018167631185382665, - "loss": 1.4412, - "step": 704500 - }, - { - "epoch": 4.584173223226478, - "grad_norm": 1.15625, - "learning_rate": 0.0018166330710709408, - "loss": 1.4412, - "step": 705000 - }, - { - "epoch": 4.587424409909617, - "grad_norm": 1.015625, - "learning_rate": 0.0018165030236036153, - "loss": 1.4442, - "step": 705500 - }, - { - "epoch": 4.590675596592757, - "grad_norm": 0.8359375, - "learning_rate": 0.0018163729761362898, - "loss": 1.4428, - "step": 706000 - }, - { - "epoch": 4.593926783275895, - "grad_norm": 0.8828125, - "learning_rate": 0.001816242928668964, - "loss": 1.4454, - "step": 706500 - }, - { - "epoch": 4.597177969959035, - "grad_norm": 0.81640625, - "learning_rate": 0.0018161128812016385, - "loss": 1.4456, - "step": 707000 - }, - { - "epoch": 4.600429156642175, - "grad_norm": 0.74609375, - "learning_rate": 0.0018159828337343132, - "loss": 1.4355, - "step": 707500 - }, - { - "epoch": 4.603680343325314, - "grad_norm": 0.828125, - "learning_rate": 0.0018158527862669877, - "loss": 1.4477, - "step": 708000 - }, - { - "epoch": 4.6069315300084535, - "grad_norm": 0.8515625, - "learning_rate": 0.001815722738799662, - "loss": 1.4491, - "step": 708500 - }, - { - "epoch": 4.610182716691592, - "grad_norm": 3.078125, - "learning_rate": 0.0018155926913323364, - "loss": 1.4513, - "step": 709000 - }, - { - "epoch": 4.613433903374732, - "grad_norm": 0.66796875, - "learning_rate": 0.001815462643865011, - "loss": 1.4504, - "step": 709500 - }, - { - "epoch": 4.616685090057871, - "grad_norm": 0.84375, - "learning_rate": 0.0018153325963976852, - "loss": 1.455, - "step": 710000 - }, - { - "epoch": 4.619936276741011, - "grad_norm": 0.6171875, - "learning_rate": 0.0018152025489303596, - "loss": 1.4542, - "step": 710500 - }, - { - "epoch": 4.623187463424149, - "grad_norm": 0.55859375, - "learning_rate": 0.0018150725014630341, - "loss": 1.4562, - "step": 711000 - }, - { - "epoch": 4.626438650107289, - "grad_norm": 0.89453125, - "learning_rate": 0.0018149424539957084, - "loss": 1.4514, - "step": 711500 - }, - { - "epoch": 4.629689836790429, - "grad_norm": 0.59375, - "learning_rate": 0.0018148124065283829, - "loss": 1.444, - "step": 712000 - }, - { - "epoch": 4.632941023473568, - "grad_norm": 0.69921875, - "learning_rate": 0.0018146823590610573, - "loss": 1.4446, - "step": 712500 - }, - { - "epoch": 4.6361922101567075, - "grad_norm": 0.70703125, - "learning_rate": 0.0018145523115937316, - "loss": 1.4515, - "step": 713000 - }, - { - "epoch": 4.639443396839846, - "grad_norm": 0.6015625, - "learning_rate": 0.001814422264126406, - "loss": 1.449, - "step": 713500 - }, - { - "epoch": 4.642694583522986, - "grad_norm": 0.59375, - "learning_rate": 
0.0018142922166590806, - "loss": 1.4478, - "step": 714000 - }, - { - "epoch": 4.645945770206125, - "grad_norm": 0.66796875, - "learning_rate": 0.0018141621691917548, - "loss": 1.4463, - "step": 714500 - }, - { - "epoch": 4.649196956889265, - "grad_norm": 0.62109375, - "learning_rate": 0.0018140321217244295, - "loss": 1.4528, - "step": 715000 - }, - { - "epoch": 4.652448143572403, - "grad_norm": 0.6640625, - "learning_rate": 0.001813902074257104, - "loss": 1.4528, - "step": 715500 - }, - { - "epoch": 4.655699330255543, - "grad_norm": 0.65625, - "learning_rate": 0.0018137720267897785, - "loss": 1.4492, - "step": 716000 - }, - { - "epoch": 4.658950516938683, - "grad_norm": 0.73046875, - "learning_rate": 0.0018136419793224527, - "loss": 1.4516, - "step": 716500 - }, - { - "epoch": 4.662201703621822, - "grad_norm": 0.8984375, - "learning_rate": 0.0018135119318551272, - "loss": 1.4532, - "step": 717000 - }, - { - "epoch": 4.6654528903049615, - "grad_norm": 0.75390625, - "learning_rate": 0.0018133818843878017, - "loss": 1.4468, - "step": 717500 - }, - { - "epoch": 4.6687040769881, - "grad_norm": 0.6171875, - "learning_rate": 0.001813251836920476, - "loss": 1.4498, - "step": 718000 - }, - { - "epoch": 4.67195526367124, - "grad_norm": 0.73828125, - "learning_rate": 0.0018131217894531504, - "loss": 1.4473, - "step": 718500 - }, - { - "epoch": 4.675206450354379, - "grad_norm": 1.0, - "learning_rate": 0.001812991741985825, - "loss": 1.4454, - "step": 719000 - }, - { - "epoch": 4.678457637037519, - "grad_norm": 2.921875, - "learning_rate": 0.0018128616945184992, - "loss": 1.4448, - "step": 719500 - }, - { - "epoch": 4.681708823720658, - "grad_norm": 0.79296875, - "learning_rate": 0.0018127316470511736, - "loss": 1.4431, - "step": 720000 - }, - { - "epoch": 4.684960010403797, - "grad_norm": 0.796875, - "learning_rate": 0.0018126015995838481, - "loss": 1.4483, - "step": 720500 - }, - { - "epoch": 4.688211197086937, - "grad_norm": 0.9921875, - "learning_rate": 0.0018124715521165224, - "loss": 1.4487, - "step": 721000 - }, - { - "epoch": 4.691462383770076, - "grad_norm": 0.734375, - "learning_rate": 0.0018123415046491969, - "loss": 1.4505, - "step": 721500 - }, - { - "epoch": 4.6947135704532155, - "grad_norm": 0.7421875, - "learning_rate": 0.0018122114571818716, - "loss": 1.4404, - "step": 722000 - }, - { - "epoch": 4.697964757136354, - "grad_norm": 1.65625, - "learning_rate": 0.001812081409714546, - "loss": 1.4502, - "step": 722500 - }, - { - "epoch": 4.701215943819494, - "grad_norm": 0.7578125, - "learning_rate": 0.0018119513622472203, - "loss": 1.4484, - "step": 723000 - }, - { - "epoch": 4.704467130502634, - "grad_norm": 0.78515625, - "learning_rate": 0.0018118213147798948, - "loss": 1.4488, - "step": 723500 - }, - { - "epoch": 4.707718317185773, - "grad_norm": 2.296875, - "learning_rate": 0.0018116912673125693, - "loss": 1.4532, - "step": 724000 - }, - { - "epoch": 4.710969503868912, - "grad_norm": 1.1796875, - "learning_rate": 0.0018115612198452435, - "loss": 1.4492, - "step": 724500 - }, - { - "epoch": 4.714220690552051, - "grad_norm": 1.6484375, - "learning_rate": 0.001811431172377918, - "loss": 1.4479, - "step": 725000 - }, - { - "epoch": 4.717471877235191, - "grad_norm": 1.1640625, - "learning_rate": 0.0018113011249105925, - "loss": 1.4552, - "step": 725500 - }, - { - "epoch": 4.72072306391833, - "grad_norm": 0.58203125, - "learning_rate": 0.0018111710774432667, - "loss": 1.4542, - "step": 726000 - }, - { - "epoch": 4.7239742506014695, - "grad_norm": 0.61328125, - "learning_rate": 
0.0018110410299759412, - "loss": 1.4606, - "step": 726500 - }, - { - "epoch": 4.727225437284609, - "grad_norm": 1.125, - "learning_rate": 0.0018109109825086157, - "loss": 1.4569, - "step": 727000 - }, - { - "epoch": 4.730476623967748, - "grad_norm": 0.78515625, - "learning_rate": 0.00181078093504129, - "loss": 1.4634, - "step": 727500 - }, - { - "epoch": 4.733727810650888, - "grad_norm": 0.65625, - "learning_rate": 0.0018106508875739644, - "loss": 1.461, - "step": 728000 - }, - { - "epoch": 4.736978997334027, - "grad_norm": 0.66015625, - "learning_rate": 0.001810520840106639, - "loss": 1.4565, - "step": 728500 - }, - { - "epoch": 4.740230184017166, - "grad_norm": 0.58984375, - "learning_rate": 0.0018103907926393132, - "loss": 1.4479, - "step": 729000 - }, - { - "epoch": 4.743481370700305, - "grad_norm": 0.72265625, - "learning_rate": 0.0018102607451719879, - "loss": 1.452, - "step": 729500 - }, - { - "epoch": 4.746732557383445, - "grad_norm": 0.7265625, - "learning_rate": 0.0018101306977046623, - "loss": 1.4529, - "step": 730000 - }, - { - "epoch": 4.749983744066585, - "grad_norm": 0.6796875, - "learning_rate": 0.0018100006502373368, - "loss": 1.4525, - "step": 730500 - }, - { - "epoch": 4.7532349307497235, - "grad_norm": 1.3125, - "learning_rate": 0.001809870602770011, - "loss": 1.4505, - "step": 731000 - }, - { - "epoch": 4.756486117432863, - "grad_norm": 0.73828125, - "learning_rate": 0.0018097405553026856, - "loss": 1.4475, - "step": 731500 - }, - { - "epoch": 4.759737304116002, - "grad_norm": 0.8046875, - "learning_rate": 0.00180961050783536, - "loss": 1.4491, - "step": 732000 - }, - { - "epoch": 4.762988490799142, - "grad_norm": 0.80078125, - "learning_rate": 0.0018094804603680343, - "loss": 1.4475, - "step": 732500 - }, - { - "epoch": 4.766239677482281, - "grad_norm": 1.078125, - "learning_rate": 0.0018093504129007088, - "loss": 1.4493, - "step": 733000 - }, - { - "epoch": 4.76949086416542, - "grad_norm": 0.86328125, - "learning_rate": 0.0018092203654333833, - "loss": 1.446, - "step": 733500 - }, - { - "epoch": 4.77274205084856, - "grad_norm": 0.71875, - "learning_rate": 0.0018090903179660575, - "loss": 1.4519, - "step": 734000 - }, - { - "epoch": 4.775993237531699, - "grad_norm": 0.7265625, - "learning_rate": 0.001808960270498732, - "loss": 1.4519, - "step": 734500 - }, - { - "epoch": 4.779244424214839, - "grad_norm": 0.9609375, - "learning_rate": 0.0018088302230314065, - "loss": 1.4519, - "step": 735000 - }, - { - "epoch": 4.7824956108979775, - "grad_norm": 0.6875, - "learning_rate": 0.0018087001755640807, - "loss": 1.4519, - "step": 735500 - }, - { - "epoch": 4.785746797581117, - "grad_norm": 0.74609375, - "learning_rate": 0.0018085701280967552, - "loss": 1.4503, - "step": 736000 - }, - { - "epoch": 4.788997984264256, - "grad_norm": 0.65234375, - "learning_rate": 0.00180844008062943, - "loss": 1.4491, - "step": 736500 - }, - { - "epoch": 4.792249170947396, - "grad_norm": 0.98046875, - "learning_rate": 0.0018083100331621044, - "loss": 1.4455, - "step": 737000 - }, - { - "epoch": 4.7955003576305355, - "grad_norm": 1.28125, - "learning_rate": 0.0018081799856947787, - "loss": 1.4475, - "step": 737500 - }, - { - "epoch": 4.798751544313674, - "grad_norm": 0.90234375, - "learning_rate": 0.0018080499382274531, - "loss": 1.4454, - "step": 738000 - }, - { - "epoch": 4.802002730996814, - "grad_norm": 0.7265625, - "learning_rate": 0.0018079198907601276, - "loss": 1.4446, - "step": 738500 - }, - { - "epoch": 4.805253917679953, - "grad_norm": 3.140625, - "learning_rate": 0.0018077898432928019, 
- "loss": 1.4439, - "step": 739000 - }, - { - "epoch": 4.808505104363093, - "grad_norm": 0.78125, - "learning_rate": 0.0018076597958254764, - "loss": 1.4392, - "step": 739500 - }, - { - "epoch": 4.8117562910462315, - "grad_norm": 1.21875, - "learning_rate": 0.0018075297483581508, - "loss": 1.4405, - "step": 740000 - }, - { - "epoch": 4.815007477729371, - "grad_norm": 0.8046875, - "learning_rate": 0.001807399700890825, - "loss": 1.4409, - "step": 740500 - }, - { - "epoch": 4.818258664412511, - "grad_norm": 0.65625, - "learning_rate": 0.0018072696534234996, - "loss": 1.4425, - "step": 741000 - }, - { - "epoch": 4.82150985109565, - "grad_norm": 0.65234375, - "learning_rate": 0.001807139605956174, - "loss": 1.4373, - "step": 741500 - }, - { - "epoch": 4.8247610377787895, - "grad_norm": 0.765625, - "learning_rate": 0.0018070095584888483, - "loss": 1.4415, - "step": 742000 - }, - { - "epoch": 4.828012224461928, - "grad_norm": 1.03125, - "learning_rate": 0.0018068795110215228, - "loss": 1.4401, - "step": 742500 - }, - { - "epoch": 4.831263411145068, - "grad_norm": 0.8046875, - "learning_rate": 0.0018067494635541973, - "loss": 1.4448, - "step": 743000 - }, - { - "epoch": 4.834514597828207, - "grad_norm": 0.9375, - "learning_rate": 0.0018066194160868715, - "loss": 1.4407, - "step": 743500 - }, - { - "epoch": 4.837765784511347, - "grad_norm": 0.8203125, - "learning_rate": 0.0018064893686195462, - "loss": 1.4454, - "step": 744000 - }, - { - "epoch": 4.841016971194486, - "grad_norm": 3.75, - "learning_rate": 0.0018063593211522207, - "loss": 1.4433, - "step": 744500 - }, - { - "epoch": 4.844268157877625, - "grad_norm": 0.6796875, - "learning_rate": 0.0018062292736848952, - "loss": 1.441, - "step": 745000 - }, - { - "epoch": 4.847519344560765, - "grad_norm": 1.78125, - "learning_rate": 0.0018060992262175694, - "loss": 1.4374, - "step": 745500 - }, - { - "epoch": 4.850770531243904, - "grad_norm": 0.7734375, - "learning_rate": 0.001805969178750244, - "loss": 1.4399, - "step": 746000 - }, - { - "epoch": 4.8540217179270435, - "grad_norm": 4.46875, - "learning_rate": 0.0018058391312829184, - "loss": 1.4392, - "step": 746500 - }, - { - "epoch": 4.857272904610182, - "grad_norm": 1.7578125, - "learning_rate": 0.0018057090838155927, - "loss": 1.4403, - "step": 747000 - }, - { - "epoch": 4.860524091293322, - "grad_norm": 0.74609375, - "learning_rate": 0.0018055790363482671, - "loss": 1.4378, - "step": 747500 - }, - { - "epoch": 4.863775277976462, - "grad_norm": 0.734375, - "learning_rate": 0.0018054489888809416, - "loss": 1.4415, - "step": 748000 - }, - { - "epoch": 4.867026464659601, - "grad_norm": 0.66796875, - "learning_rate": 0.0018053189414136159, - "loss": 1.4444, - "step": 748500 - }, - { - "epoch": 4.87027765134274, - "grad_norm": 1.359375, - "learning_rate": 0.0018051888939462904, - "loss": 1.441, - "step": 749000 - }, - { - "epoch": 4.873528838025879, - "grad_norm": 0.8515625, - "learning_rate": 0.0018050588464789648, - "loss": 1.4387, - "step": 749500 - }, - { - "epoch": 4.876780024709019, - "grad_norm": 0.625, - "learning_rate": 0.001804928799011639, - "loss": 1.4346, - "step": 750000 - }, - { - "epoch": 4.880031211392158, - "grad_norm": 0.7578125, - "learning_rate": 0.0018047987515443136, - "loss": 1.4347, - "step": 750500 - }, - { - "epoch": 4.8832823980752975, - "grad_norm": 0.83203125, - "learning_rate": 0.0018046687040769883, - "loss": 1.4413, - "step": 751000 - }, - { - "epoch": 4.886533584758437, - "grad_norm": 0.72265625, - "learning_rate": 0.0018045386566096628, - "loss": 1.4479, - "step": 
751500 - }, - { - "epoch": 4.889784771441576, - "grad_norm": 0.84765625, - "learning_rate": 0.001804408609142337, - "loss": 1.446, - "step": 752000 - }, - { - "epoch": 4.893035958124716, - "grad_norm": 0.8828125, - "learning_rate": 0.0018042785616750115, - "loss": 1.4515, - "step": 752500 - }, - { - "epoch": 4.896287144807855, - "grad_norm": 0.78125, - "learning_rate": 0.001804148514207686, - "loss": 1.448, - "step": 753000 - }, - { - "epoch": 4.899538331490994, - "grad_norm": 1.2109375, - "learning_rate": 0.0018040184667403602, - "loss": 1.4473, - "step": 753500 - }, - { - "epoch": 4.902789518174133, - "grad_norm": 0.71875, - "learning_rate": 0.0018038884192730347, - "loss": 1.448, - "step": 754000 - }, - { - "epoch": 4.906040704857273, - "grad_norm": 2.140625, - "learning_rate": 0.0018037583718057092, - "loss": 1.4512, - "step": 754500 - }, - { - "epoch": 4.909291891540413, - "grad_norm": 0.71875, - "learning_rate": 0.0018036283243383835, - "loss": 1.4474, - "step": 755000 - }, - { - "epoch": 4.9125430782235515, - "grad_norm": 1.015625, - "learning_rate": 0.001803498276871058, - "loss": 1.4573, - "step": 755500 - }, - { - "epoch": 4.915794264906691, - "grad_norm": 0.6640625, - "learning_rate": 0.0018033682294037324, - "loss": 1.4442, - "step": 756000 - }, - { - "epoch": 4.91904545158983, - "grad_norm": 0.921875, - "learning_rate": 0.0018032381819364067, - "loss": 1.4525, - "step": 756500 - }, - { - "epoch": 4.92229663827297, - "grad_norm": 0.70703125, - "learning_rate": 0.0018031081344690812, - "loss": 1.4477, - "step": 757000 - }, - { - "epoch": 4.925547824956109, - "grad_norm": 1.0625, - "learning_rate": 0.0018029780870017556, - "loss": 1.4477, - "step": 757500 - }, - { - "epoch": 4.928799011639248, - "grad_norm": 2.0, - "learning_rate": 0.0018028480395344299, - "loss": 1.4513, - "step": 758000 - }, - { - "epoch": 4.932050198322388, - "grad_norm": 0.68359375, - "learning_rate": 0.0018027179920671046, - "loss": 1.4502, - "step": 758500 - }, - { - "epoch": 4.935301385005527, - "grad_norm": 0.67578125, - "learning_rate": 0.001802587944599779, - "loss": 1.4486, - "step": 759000 - }, - { - "epoch": 4.938552571688667, - "grad_norm": 0.96875, - "learning_rate": 0.0018024578971324535, - "loss": 1.4453, - "step": 759500 - }, - { - "epoch": 4.9418037583718055, - "grad_norm": 4.0, - "learning_rate": 0.0018023278496651278, - "loss": 1.4467, - "step": 760000 - }, - { - "epoch": 4.945054945054945, - "grad_norm": 0.74609375, - "learning_rate": 0.0018021978021978023, - "loss": 1.4401, - "step": 760500 - }, - { - "epoch": 4.948306131738084, - "grad_norm": 1.328125, - "learning_rate": 0.0018020677547304768, - "loss": 1.4461, - "step": 761000 - }, - { - "epoch": 4.951557318421224, - "grad_norm": 0.875, - "learning_rate": 0.001801937707263151, - "loss": 1.4454, - "step": 761500 - }, - { - "epoch": 4.9548085051043635, - "grad_norm": 1.1640625, - "learning_rate": 0.0018018076597958255, - "loss": 1.4468, - "step": 762000 - }, - { - "epoch": 4.958059691787502, - "grad_norm": 0.81640625, - "learning_rate": 0.0018016776123285, - "loss": 1.4547, - "step": 762500 - }, - { - "epoch": 4.961310878470642, - "grad_norm": 0.74609375, - "learning_rate": 0.0018015475648611742, - "loss": 1.4528, - "step": 763000 - }, - { - "epoch": 4.964562065153781, - "grad_norm": 0.64453125, - "learning_rate": 0.0018014175173938487, - "loss": 1.4477, - "step": 763500 - }, - { - "epoch": 4.967813251836921, - "grad_norm": 0.6796875, - "learning_rate": 0.0018012874699265232, - "loss": 1.4547, - "step": 764000 - }, - { - "epoch": 
4.9710644385200595, - "grad_norm": 0.63671875, - "learning_rate": 0.0018011574224591975, - "loss": 1.4555, - "step": 764500 - }, - { - "epoch": 4.974315625203199, - "grad_norm": 0.6953125, - "learning_rate": 0.001801027374991872, - "loss": 1.4538, - "step": 765000 - }, - { - "epoch": 4.977566811886339, - "grad_norm": 0.9296875, - "learning_rate": 0.0018008973275245466, - "loss": 1.4523, - "step": 765500 - }, - { - "epoch": 4.980817998569478, - "grad_norm": 0.67578125, - "learning_rate": 0.0018007672800572211, - "loss": 1.4484, - "step": 766000 - }, - { - "epoch": 4.9840691852526176, - "grad_norm": 0.65625, - "learning_rate": 0.0018006372325898954, - "loss": 1.4489, - "step": 766500 - }, - { - "epoch": 4.987320371935756, - "grad_norm": 0.80859375, - "learning_rate": 0.0018005071851225699, - "loss": 1.4465, - "step": 767000 - }, - { - "epoch": 4.990571558618896, - "grad_norm": 0.69140625, - "learning_rate": 0.0018003771376552443, - "loss": 1.4522, - "step": 767500 - }, - { - "epoch": 4.993822745302035, - "grad_norm": 0.6796875, - "learning_rate": 0.0018002470901879186, - "loss": 1.453, - "step": 768000 - }, - { - "epoch": 4.997073931985175, - "grad_norm": 1.375, - "learning_rate": 0.001800117042720593, - "loss": 1.4571, - "step": 768500 - }, - { - "epoch": 5.0, - "eval_loss": 1.441786766052246, - "eval_runtime": 0.5442, - "eval_samples_per_second": 1837.612, - "eval_steps_per_second": 29.402, - "step": 768950 - }, - { - "epoch": 5.0003251186683135, - "grad_norm": 0.5703125, - "learning_rate": 0.0017999869952532676, - "loss": 1.45, - "step": 769000 - }, - { - "epoch": 5.003576305351453, - "grad_norm": 1.6953125, - "learning_rate": 0.0017998569477859418, - "loss": 1.4452, - "step": 769500 - }, - { - "epoch": 5.006827492034593, - "grad_norm": 0.75390625, - "learning_rate": 0.0017997269003186163, - "loss": 1.4399, - "step": 770000 - }, - { - "epoch": 5.010078678717732, - "grad_norm": 0.6015625, - "learning_rate": 0.0017995968528512908, - "loss": 1.4414, - "step": 770500 - }, - { - "epoch": 5.0133298654008716, - "grad_norm": 0.82421875, - "learning_rate": 0.001799466805383965, - "loss": 1.4448, - "step": 771000 - }, - { - "epoch": 5.01658105208401, - "grad_norm": 0.75390625, - "learning_rate": 0.0017993367579166395, - "loss": 1.4429, - "step": 771500 - }, - { - "epoch": 5.01983223876715, - "grad_norm": 2.015625, - "learning_rate": 0.001799206710449314, - "loss": 1.4388, - "step": 772000 - }, - { - "epoch": 5.023083425450289, - "grad_norm": 0.68359375, - "learning_rate": 0.0017990766629819882, - "loss": 1.442, - "step": 772500 - }, - { - "epoch": 5.026334612133429, - "grad_norm": 0.65234375, - "learning_rate": 0.001798946615514663, - "loss": 1.4433, - "step": 773000 - }, - { - "epoch": 5.029585798816568, - "grad_norm": 1.4609375, - "learning_rate": 0.0017988165680473374, - "loss": 1.443, - "step": 773500 - }, - { - "epoch": 5.032836985499707, - "grad_norm": 0.66015625, - "learning_rate": 0.001798686520580012, - "loss": 1.448, - "step": 774000 - }, - { - "epoch": 5.036088172182847, - "grad_norm": 0.671875, - "learning_rate": 0.0017985564731126862, - "loss": 1.4465, - "step": 774500 - }, - { - "epoch": 5.039339358865986, - "grad_norm": 1.265625, - "learning_rate": 0.0017984264256453606, - "loss": 1.4448, - "step": 775000 - }, - { - "epoch": 5.042590545549126, - "grad_norm": 1.234375, - "learning_rate": 0.0017982963781780351, - "loss": 1.4418, - "step": 775500 - }, - { - "epoch": 5.045841732232264, - "grad_norm": 0.8671875, - "learning_rate": 0.0017981663307107094, - "loss": 1.4425, - "step": 776000 
- }, - { - "epoch": 5.049092918915404, - "grad_norm": 1.6171875, - "learning_rate": 0.0017980362832433839, - "loss": 1.444, - "step": 776500 - }, - { - "epoch": 5.052344105598544, - "grad_norm": 0.7109375, - "learning_rate": 0.0017979062357760583, - "loss": 1.4431, - "step": 777000 - }, - { - "epoch": 5.055595292281683, - "grad_norm": 2.078125, - "learning_rate": 0.0017977761883087326, - "loss": 1.4459, - "step": 777500 - }, - { - "epoch": 5.058846478964822, - "grad_norm": 0.90234375, - "learning_rate": 0.001797646140841407, - "loss": 1.4453, - "step": 778000 - }, - { - "epoch": 5.062097665647961, - "grad_norm": 0.60546875, - "learning_rate": 0.0017975160933740816, - "loss": 1.452, - "step": 778500 - }, - { - "epoch": 5.065348852331101, - "grad_norm": 1.5, - "learning_rate": 0.0017973860459067558, - "loss": 1.4492, - "step": 779000 - }, - { - "epoch": 5.06860003901424, - "grad_norm": 0.7734375, - "learning_rate": 0.0017972559984394303, - "loss": 1.4533, - "step": 779500 - }, - { - "epoch": 5.07185122569738, - "grad_norm": 0.609375, - "learning_rate": 0.001797125950972105, - "loss": 1.4462, - "step": 780000 - }, - { - "epoch": 5.075102412380519, - "grad_norm": 1.2734375, - "learning_rate": 0.0017969959035047795, - "loss": 1.4468, - "step": 780500 - }, - { - "epoch": 5.078353599063658, - "grad_norm": 0.8671875, - "learning_rate": 0.0017968658560374537, - "loss": 1.4464, - "step": 781000 - }, - { - "epoch": 5.081604785746798, - "grad_norm": 0.953125, - "learning_rate": 0.0017967358085701282, - "loss": 1.4535, - "step": 781500 - }, - { - "epoch": 5.084855972429937, - "grad_norm": 0.64453125, - "learning_rate": 0.0017966057611028027, - "loss": 1.4473, - "step": 782000 - }, - { - "epoch": 5.088107159113076, - "grad_norm": 0.67578125, - "learning_rate": 0.001796475713635477, - "loss": 1.4522, - "step": 782500 - }, - { - "epoch": 5.091358345796215, - "grad_norm": 1.4140625, - "learning_rate": 0.0017963456661681514, - "loss": 1.4525, - "step": 783000 - }, - { - "epoch": 5.094609532479355, - "grad_norm": 0.96875, - "learning_rate": 0.001796215618700826, - "loss": 1.447, - "step": 783500 - }, - { - "epoch": 5.097860719162495, - "grad_norm": 0.75390625, - "learning_rate": 0.0017960855712335002, - "loss": 1.4464, - "step": 784000 - }, - { - "epoch": 5.101111905845634, - "grad_norm": 0.9296875, - "learning_rate": 0.0017959555237661746, - "loss": 1.4419, - "step": 784500 - }, - { - "epoch": 5.104363092528773, - "grad_norm": 0.625, - "learning_rate": 0.0017958254762988491, - "loss": 1.4459, - "step": 785000 - }, - { - "epoch": 5.107614279211912, - "grad_norm": 0.63671875, - "learning_rate": 0.0017956954288315234, - "loss": 1.4475, - "step": 785500 - }, - { - "epoch": 5.110865465895052, - "grad_norm": 0.66796875, - "learning_rate": 0.0017955653813641979, - "loss": 1.4389, - "step": 786000 - }, - { - "epoch": 5.114116652578191, - "grad_norm": 1.0859375, - "learning_rate": 0.0017954353338968723, - "loss": 1.4455, - "step": 786500 - }, - { - "epoch": 5.11736783926133, - "grad_norm": 0.5859375, - "learning_rate": 0.0017953052864295466, - "loss": 1.4381, - "step": 787000 - }, - { - "epoch": 5.12061902594447, - "grad_norm": 0.60546875, - "learning_rate": 0.0017951752389622213, - "loss": 1.4444, - "step": 787500 - }, - { - "epoch": 5.123870212627609, - "grad_norm": 9.1875, - "learning_rate": 0.0017950451914948958, - "loss": 1.4436, - "step": 788000 - }, - { - "epoch": 5.127121399310749, - "grad_norm": 0.76953125, - "learning_rate": 0.0017949151440275703, - "loss": 1.4437, - "step": 788500 - }, - { - "epoch": 
5.130372585993888, - "grad_norm": 0.703125, - "learning_rate": 0.0017947850965602445, - "loss": 1.4385, - "step": 789000 - }, - { - "epoch": 5.133623772677027, - "grad_norm": 0.8359375, - "learning_rate": 0.001794655049092919, - "loss": 1.4424, - "step": 789500 - }, - { - "epoch": 5.136874959360166, - "grad_norm": 0.5859375, - "learning_rate": 0.0017945250016255935, - "loss": 1.4419, - "step": 790000 - }, - { - "epoch": 5.140126146043306, - "grad_norm": 2.765625, - "learning_rate": 0.0017943949541582677, - "loss": 1.4416, - "step": 790500 - }, - { - "epoch": 5.143377332726445, - "grad_norm": 1.5546875, - "learning_rate": 0.0017942649066909422, - "loss": 1.4391, - "step": 791000 - }, - { - "epoch": 5.146628519409584, - "grad_norm": 0.70703125, - "learning_rate": 0.0017941348592236167, - "loss": 1.4361, - "step": 791500 - }, - { - "epoch": 5.149879706092724, - "grad_norm": 0.91796875, - "learning_rate": 0.001794004811756291, - "loss": 1.4397, - "step": 792000 - }, - { - "epoch": 5.153130892775863, - "grad_norm": 0.58984375, - "learning_rate": 0.0017938747642889654, - "loss": 1.4351, - "step": 792500 - }, - { - "epoch": 5.156382079459003, - "grad_norm": 0.6484375, - "learning_rate": 0.00179374471682164, - "loss": 1.4365, - "step": 793000 - }, - { - "epoch": 5.159633266142142, - "grad_norm": 6.6875, - "learning_rate": 0.0017936146693543142, - "loss": 1.4359, - "step": 793500 - }, - { - "epoch": 5.162884452825281, - "grad_norm": 0.6953125, - "learning_rate": 0.0017934846218869887, - "loss": 1.4403, - "step": 794000 - }, - { - "epoch": 5.16613563950842, - "grad_norm": 0.76171875, - "learning_rate": 0.0017933545744196634, - "loss": 1.4368, - "step": 794500 - }, - { - "epoch": 5.16938682619156, - "grad_norm": 0.7265625, - "learning_rate": 0.0017932245269523378, - "loss": 1.4316, - "step": 795000 - }, - { - "epoch": 5.1726380128747, - "grad_norm": 0.8046875, - "learning_rate": 0.001793094479485012, - "loss": 1.4363, - "step": 795500 - }, - { - "epoch": 5.175889199557838, - "grad_norm": 1.1015625, - "learning_rate": 0.0017929644320176866, - "loss": 1.4426, - "step": 796000 - }, - { - "epoch": 5.179140386240978, - "grad_norm": 0.68359375, - "learning_rate": 0.001792834384550361, - "loss": 1.4405, - "step": 796500 - }, - { - "epoch": 5.182391572924117, - "grad_norm": 1.15625, - "learning_rate": 0.0017927043370830353, - "loss": 1.4329, - "step": 797000 - }, - { - "epoch": 5.185642759607257, - "grad_norm": 0.66015625, - "learning_rate": 0.0017925742896157098, - "loss": 1.4337, - "step": 797500 - }, - { - "epoch": 5.188893946290396, - "grad_norm": 1.1484375, - "learning_rate": 0.0017924442421483843, - "loss": 1.4318, - "step": 798000 - }, - { - "epoch": 5.192145132973535, - "grad_norm": 0.703125, - "learning_rate": 0.0017923141946810585, - "loss": 1.4362, - "step": 798500 - }, - { - "epoch": 5.195396319656675, - "grad_norm": 0.83984375, - "learning_rate": 0.001792184147213733, - "loss": 1.4352, - "step": 799000 - }, - { - "epoch": 5.198647506339814, - "grad_norm": 0.796875, - "learning_rate": 0.0017920540997464075, - "loss": 1.4379, - "step": 799500 - }, - { - "epoch": 5.201898693022954, - "grad_norm": 1.4296875, - "learning_rate": 0.0017919240522790817, - "loss": 1.4319, - "step": 800000 - }, - { - "epoch": 5.205149879706092, - "grad_norm": 0.80859375, - "learning_rate": 0.0017917940048117562, - "loss": 1.4339, - "step": 800500 - }, - { - "epoch": 5.208401066389232, - "grad_norm": 0.7890625, - "learning_rate": 0.0017916639573444307, - "loss": 1.4374, - "step": 801000 - }, - { - "epoch": 
5.211652253072371, - "grad_norm": 1.5, - "learning_rate": 0.001791533909877105, - "loss": 1.4314, - "step": 801500 - }, - { - "epoch": 5.214903439755511, - "grad_norm": 0.8515625, - "learning_rate": 0.0017914038624097797, - "loss": 1.4339, - "step": 802000 - }, - { - "epoch": 5.2181546264386505, - "grad_norm": 0.74609375, - "learning_rate": 0.0017912738149424541, - "loss": 1.4345, - "step": 802500 - }, - { - "epoch": 5.221405813121789, - "grad_norm": 1.0234375, - "learning_rate": 0.0017911437674751286, - "loss": 1.4352, - "step": 803000 - }, - { - "epoch": 5.224656999804929, - "grad_norm": 1.265625, - "learning_rate": 0.0017910137200078029, - "loss": 1.4291, - "step": 803500 - }, - { - "epoch": 5.227908186488068, - "grad_norm": 0.67578125, - "learning_rate": 0.0017908836725404774, - "loss": 1.4321, - "step": 804000 - }, - { - "epoch": 5.231159373171208, - "grad_norm": 0.984375, - "learning_rate": 0.0017907536250731518, - "loss": 1.4279, - "step": 804500 - }, - { - "epoch": 5.2344105598543464, - "grad_norm": 0.70703125, - "learning_rate": 0.001790623577605826, - "loss": 1.4371, - "step": 805000 - }, - { - "epoch": 5.237661746537486, - "grad_norm": 0.77734375, - "learning_rate": 0.0017904935301385006, - "loss": 1.431, - "step": 805500 - }, - { - "epoch": 5.240912933220626, - "grad_norm": 0.72265625, - "learning_rate": 0.001790363482671175, - "loss": 1.4389, - "step": 806000 - }, - { - "epoch": 5.244164119903765, - "grad_norm": 1.3359375, - "learning_rate": 0.0017902334352038493, - "loss": 1.433, - "step": 806500 - }, - { - "epoch": 5.2474153065869045, - "grad_norm": 0.71484375, - "learning_rate": 0.0017901033877365238, - "loss": 1.4342, - "step": 807000 - }, - { - "epoch": 5.250666493270043, - "grad_norm": 4.71875, - "learning_rate": 0.0017899733402691983, - "loss": 1.4353, - "step": 807500 - }, - { - "epoch": 5.253917679953183, - "grad_norm": 0.87109375, - "learning_rate": 0.0017898432928018725, - "loss": 1.4306, - "step": 808000 - }, - { - "epoch": 5.257168866636322, - "grad_norm": 1.109375, - "learning_rate": 0.001789713245334547, - "loss": 1.4336, - "step": 808500 - }, - { - "epoch": 5.260420053319462, - "grad_norm": 0.625, - "learning_rate": 0.0017895831978672217, - "loss": 1.4366, - "step": 809000 - }, - { - "epoch": 5.263671240002601, - "grad_norm": 0.84765625, - "learning_rate": 0.0017894531503998962, - "loss": 1.4348, - "step": 809500 - }, - { - "epoch": 5.26692242668574, - "grad_norm": 1.015625, - "learning_rate": 0.0017893231029325705, - "loss": 1.4361, - "step": 810000 - }, - { - "epoch": 5.27017361336888, - "grad_norm": 1.0078125, - "learning_rate": 0.001789193055465245, - "loss": 1.4286, - "step": 810500 - }, - { - "epoch": 5.273424800052019, - "grad_norm": 0.640625, - "learning_rate": 0.0017890630079979194, - "loss": 1.4307, - "step": 811000 - }, - { - "epoch": 5.2766759867351585, - "grad_norm": 1.046875, - "learning_rate": 0.0017889329605305937, - "loss": 1.4324, - "step": 811500 - }, - { - "epoch": 5.279927173418297, - "grad_norm": 1.296875, - "learning_rate": 0.0017888029130632681, - "loss": 1.4263, - "step": 812000 - }, - { - "epoch": 5.283178360101437, - "grad_norm": 0.7734375, - "learning_rate": 0.0017886728655959426, - "loss": 1.4316, - "step": 812500 - }, - { - "epoch": 5.286429546784577, - "grad_norm": 1.171875, - "learning_rate": 0.0017885428181286169, - "loss": 1.4346, - "step": 813000 - }, - { - "epoch": 5.289680733467716, - "grad_norm": 0.625, - "learning_rate": 0.0017884127706612914, - "loss": 1.4345, - "step": 813500 - }, - { - "epoch": 5.292931920150855, - 
"grad_norm": 0.625, - "learning_rate": 0.0017882827231939658, - "loss": 1.4316, - "step": 814000 - }, - { - "epoch": 5.296183106833994, - "grad_norm": 1.8671875, - "learning_rate": 0.00178815267572664, - "loss": 1.4313, - "step": 814500 - }, - { - "epoch": 5.299434293517134, - "grad_norm": 0.69921875, - "learning_rate": 0.0017880226282593146, - "loss": 1.4309, - "step": 815000 - }, - { - "epoch": 5.302685480200273, - "grad_norm": 0.73046875, - "learning_rate": 0.001787892580791989, - "loss": 1.4336, - "step": 815500 - }, - { - "epoch": 5.3059366668834125, - "grad_norm": 0.71484375, - "learning_rate": 0.0017877625333246633, - "loss": 1.4283, - "step": 816000 - }, - { - "epoch": 5.309187853566552, - "grad_norm": 0.9765625, - "learning_rate": 0.001787632485857338, - "loss": 1.43, - "step": 816500 - }, - { - "epoch": 5.312439040249691, - "grad_norm": 0.86328125, - "learning_rate": 0.0017875024383900125, - "loss": 1.4354, - "step": 817000 - }, - { - "epoch": 5.315690226932831, - "grad_norm": 0.67578125, - "learning_rate": 0.001787372390922687, - "loss": 1.4303, - "step": 817500 - }, - { - "epoch": 5.31894141361597, - "grad_norm": 0.61328125, - "learning_rate": 0.0017872423434553612, - "loss": 1.4338, - "step": 818000 - }, - { - "epoch": 5.322192600299109, - "grad_norm": 0.75390625, - "learning_rate": 0.0017871122959880357, - "loss": 1.4386, - "step": 818500 - }, - { - "epoch": 5.325443786982248, - "grad_norm": 1.1171875, - "learning_rate": 0.0017869822485207102, - "loss": 1.4364, - "step": 819000 - }, - { - "epoch": 5.328694973665388, - "grad_norm": 0.62109375, - "learning_rate": 0.0017868522010533845, - "loss": 1.4372, - "step": 819500 - }, - { - "epoch": 5.331946160348528, - "grad_norm": 0.7265625, - "learning_rate": 0.001786722153586059, - "loss": 1.4403, - "step": 820000 - }, - { - "epoch": 5.3351973470316665, - "grad_norm": 0.73828125, - "learning_rate": 0.0017865921061187334, - "loss": 1.432, - "step": 820500 - }, - { - "epoch": 5.338448533714806, - "grad_norm": 0.890625, - "learning_rate": 0.0017864620586514077, - "loss": 1.429, - "step": 821000 - }, - { - "epoch": 5.341699720397945, - "grad_norm": 0.76953125, - "learning_rate": 0.0017863320111840822, - "loss": 1.4301, - "step": 821500 - }, - { - "epoch": 5.344950907081085, - "grad_norm": 1.0078125, - "learning_rate": 0.0017862019637167566, - "loss": 1.4332, - "step": 822000 - }, - { - "epoch": 5.348202093764224, - "grad_norm": 0.61328125, - "learning_rate": 0.001786071916249431, - "loss": 1.4334, - "step": 822500 - }, - { - "epoch": 5.351453280447363, - "grad_norm": 0.7421875, - "learning_rate": 0.0017859418687821054, - "loss": 1.4321, - "step": 823000 - }, - { - "epoch": 5.354704467130503, - "grad_norm": 0.84375, - "learning_rate": 0.00178581182131478, - "loss": 1.4315, - "step": 823500 - }, - { - "epoch": 5.357955653813642, - "grad_norm": 1.203125, - "learning_rate": 0.0017856817738474545, - "loss": 1.4369, - "step": 824000 - }, - { - "epoch": 5.361206840496782, - "grad_norm": 2.40625, - "learning_rate": 0.0017855517263801288, - "loss": 1.4303, - "step": 824500 - }, - { - "epoch": 5.3644580271799205, - "grad_norm": 0.640625, - "learning_rate": 0.0017854216789128033, - "loss": 1.4386, - "step": 825000 - }, - { - "epoch": 5.36770921386306, - "grad_norm": 0.84765625, - "learning_rate": 0.0017852916314454778, - "loss": 1.4326, - "step": 825500 - }, - { - "epoch": 5.370960400546199, - "grad_norm": 0.6953125, - "learning_rate": 0.001785161583978152, - "loss": 1.4357, - "step": 826000 - }, - { - "epoch": 5.374211587229339, - "grad_norm": 
0.64453125, - "learning_rate": 0.0017850315365108265, - "loss": 1.4397, - "step": 826500 - }, - { - "epoch": 5.3774627739124785, - "grad_norm": 0.96875, - "learning_rate": 0.001784901489043501, - "loss": 1.4392, - "step": 827000 - }, - { - "epoch": 5.380713960595617, - "grad_norm": 1.0625, - "learning_rate": 0.0017847714415761752, - "loss": 1.4344, - "step": 827500 - }, - { - "epoch": 5.383965147278757, - "grad_norm": 0.62890625, - "learning_rate": 0.0017846413941088497, - "loss": 1.4347, - "step": 828000 - }, - { - "epoch": 5.387216333961896, - "grad_norm": 0.625, - "learning_rate": 0.0017845113466415242, - "loss": 1.4388, - "step": 828500 - }, - { - "epoch": 5.390467520645036, - "grad_norm": 0.75390625, - "learning_rate": 0.0017843812991741985, - "loss": 1.4361, - "step": 829000 - }, - { - "epoch": 5.3937187073281745, - "grad_norm": 1.046875, - "learning_rate": 0.001784251251706873, - "loss": 1.4407, - "step": 829500 - }, - { - "epoch": 5.396969894011314, - "grad_norm": 1.4375, - "learning_rate": 0.0017841212042395474, - "loss": 1.4423, - "step": 830000 - }, - { - "epoch": 5.400221080694454, - "grad_norm": 1.1015625, - "learning_rate": 0.0017839911567722217, - "loss": 1.4417, - "step": 830500 - }, - { - "epoch": 5.403472267377593, - "grad_norm": 1.7265625, - "learning_rate": 0.0017838611093048964, - "loss": 1.4446, - "step": 831000 - }, - { - "epoch": 5.4067234540607325, - "grad_norm": 0.62890625, - "learning_rate": 0.0017837310618375709, - "loss": 1.4417, - "step": 831500 - }, - { - "epoch": 5.409974640743871, - "grad_norm": 0.984375, - "learning_rate": 0.0017836010143702453, - "loss": 1.4407, - "step": 832000 - }, - { - "epoch": 5.413225827427011, - "grad_norm": 0.80859375, - "learning_rate": 0.0017834709669029196, - "loss": 1.4398, - "step": 832500 - }, - { - "epoch": 5.41647701411015, - "grad_norm": 0.9765625, - "learning_rate": 0.001783340919435594, - "loss": 1.444, - "step": 833000 - }, - { - "epoch": 5.41972820079329, - "grad_norm": 0.69921875, - "learning_rate": 0.0017832108719682686, - "loss": 1.4429, - "step": 833500 - }, - { - "epoch": 5.4229793874764285, - "grad_norm": 0.68359375, - "learning_rate": 0.0017830808245009428, - "loss": 1.448, - "step": 834000 - }, - { - "epoch": 5.426230574159568, - "grad_norm": 0.71484375, - "learning_rate": 0.0017829507770336173, - "loss": 1.4424, - "step": 834500 - }, - { - "epoch": 5.429481760842708, - "grad_norm": 1.578125, - "learning_rate": 0.0017828207295662918, - "loss": 1.4442, - "step": 835000 - }, - { - "epoch": 5.432732947525847, - "grad_norm": 0.78515625, - "learning_rate": 0.001782690682098966, - "loss": 1.4364, - "step": 835500 - }, - { - "epoch": 5.4359841342089865, - "grad_norm": 0.62890625, - "learning_rate": 0.0017825606346316405, - "loss": 1.4351, - "step": 836000 - }, - { - "epoch": 5.439235320892125, - "grad_norm": 0.65234375, - "learning_rate": 0.001782430587164315, - "loss": 1.4341, - "step": 836500 - }, - { - "epoch": 5.442486507575265, - "grad_norm": 0.58984375, - "learning_rate": 0.0017823005396969893, - "loss": 1.4395, - "step": 837000 - }, - { - "epoch": 5.445737694258404, - "grad_norm": 2.65625, - "learning_rate": 0.0017821704922296637, - "loss": 1.4416, - "step": 837500 - }, - { - "epoch": 5.448988880941544, - "grad_norm": 1.0078125, - "learning_rate": 0.0017820404447623384, - "loss": 1.4371, - "step": 838000 - }, - { - "epoch": 5.452240067624683, - "grad_norm": 0.8046875, - "learning_rate": 0.001781910397295013, - "loss": 1.4336, - "step": 838500 - }, - { - "epoch": 5.455491254307822, - "grad_norm": 1.046875, - 
"learning_rate": 0.0017817803498276872, - "loss": 1.4373, - "step": 839000 - }, - { - "epoch": 5.458742440990962, - "grad_norm": 0.76953125, - "learning_rate": 0.0017816503023603616, - "loss": 1.4375, - "step": 839500 - }, - { - "epoch": 5.461993627674101, - "grad_norm": 3.96875, - "learning_rate": 0.0017815202548930361, - "loss": 1.4356, - "step": 840000 - }, - { - "epoch": 5.4652448143572405, - "grad_norm": 3.21875, - "learning_rate": 0.0017813902074257104, - "loss": 1.4346, - "step": 840500 - }, - { - "epoch": 5.468496001040379, - "grad_norm": 0.765625, - "learning_rate": 0.0017812601599583849, - "loss": 1.4327, - "step": 841000 - }, - { - "epoch": 5.471747187723519, - "grad_norm": 0.671875, - "learning_rate": 0.0017811301124910593, - "loss": 1.4347, - "step": 841500 - }, - { - "epoch": 5.474998374406659, - "grad_norm": 0.59765625, - "learning_rate": 0.0017810000650237336, - "loss": 1.439, - "step": 842000 - }, - { - "epoch": 5.478249561089798, - "grad_norm": 0.84765625, - "learning_rate": 0.001780870017556408, - "loss": 1.4383, - "step": 842500 - }, - { - "epoch": 5.481500747772937, - "grad_norm": 0.734375, - "learning_rate": 0.0017807399700890826, - "loss": 1.4395, - "step": 843000 - }, - { - "epoch": 5.484751934456076, - "grad_norm": 0.6875, - "learning_rate": 0.0017806099226217568, - "loss": 1.4374, - "step": 843500 - }, - { - "epoch": 5.488003121139216, - "grad_norm": 0.71875, - "learning_rate": 0.0017804798751544313, - "loss": 1.445, - "step": 844000 - }, - { - "epoch": 5.491254307822355, - "grad_norm": 0.79296875, - "learning_rate": 0.0017803498276871058, - "loss": 1.4482, - "step": 844500 - }, - { - "epoch": 5.4945054945054945, - "grad_norm": 1.1171875, - "learning_rate": 0.00178021978021978, - "loss": 1.4499, - "step": 845000 - }, - { - "epoch": 5.497756681188634, - "grad_norm": 0.6484375, - "learning_rate": 0.0017800897327524547, - "loss": 1.4462, - "step": 845500 - }, - { - "epoch": 5.501007867871773, - "grad_norm": 3.71875, - "learning_rate": 0.0017799596852851292, - "loss": 1.4478, - "step": 846000 - }, - { - "epoch": 5.504259054554913, - "grad_norm": 0.62109375, - "learning_rate": 0.0017798296378178037, - "loss": 1.4479, - "step": 846500 - }, - { - "epoch": 5.507510241238052, - "grad_norm": 0.72265625, - "learning_rate": 0.001779699590350478, - "loss": 1.4523, - "step": 847000 - }, - { - "epoch": 5.510761427921191, - "grad_norm": 0.8203125, - "learning_rate": 0.0017795695428831524, - "loss": 1.4498, - "step": 847500 - }, - { - "epoch": 5.51401261460433, - "grad_norm": 0.85546875, - "learning_rate": 0.001779439495415827, - "loss": 1.4493, - "step": 848000 - }, - { - "epoch": 5.51726380128747, - "grad_norm": 0.6640625, - "learning_rate": 0.0017793094479485012, - "loss": 1.454, - "step": 848500 - }, - { - "epoch": 5.520514987970609, - "grad_norm": 0.72265625, - "learning_rate": 0.0017791794004811757, - "loss": 1.4481, - "step": 849000 - }, - { - "epoch": 5.5237661746537485, - "grad_norm": 0.81640625, - "learning_rate": 0.0017790493530138501, - "loss": 1.4474, - "step": 849500 - }, - { - "epoch": 5.527017361336888, - "grad_norm": 0.89453125, - "learning_rate": 0.0017789193055465244, - "loss": 1.4459, - "step": 850000 - }, - { - "epoch": 5.530268548020027, - "grad_norm": 0.6953125, - "learning_rate": 0.0017787892580791989, - "loss": 1.4437, - "step": 850500 - }, - { - "epoch": 5.533519734703167, - "grad_norm": 0.671875, - "learning_rate": 0.0017786592106118734, - "loss": 1.4456, - "step": 851000 - }, - { - "epoch": 5.536770921386306, - "grad_norm": 1.1953125, - "learning_rate": 
0.0017785291631445476, - "loss": 1.44, - "step": 851500 - }, - { - "epoch": 5.540022108069445, - "grad_norm": 1.015625, - "learning_rate": 0.001778399115677222, - "loss": 1.4362, - "step": 852000 - }, - { - "epoch": 5.543273294752584, - "grad_norm": 0.87109375, - "learning_rate": 0.0017782690682098968, - "loss": 1.4389, - "step": 852500 - }, - { - "epoch": 5.546524481435724, - "grad_norm": 0.86328125, - "learning_rate": 0.0017781390207425713, - "loss": 1.4365, - "step": 853000 - }, - { - "epoch": 5.549775668118864, - "grad_norm": 0.6875, - "learning_rate": 0.0017780089732752455, - "loss": 1.4347, - "step": 853500 - }, - { - "epoch": 5.5530268548020025, - "grad_norm": 0.74609375, - "learning_rate": 0.00177787892580792, - "loss": 1.433, - "step": 854000 - }, - { - "epoch": 5.556278041485142, - "grad_norm": 1.0390625, - "learning_rate": 0.0017777488783405945, - "loss": 1.4389, - "step": 854500 - }, - { - "epoch": 5.559529228168281, - "grad_norm": 1.3046875, - "learning_rate": 0.0017776188308732687, - "loss": 1.4277, - "step": 855000 - }, - { - "epoch": 5.562780414851421, - "grad_norm": 0.62109375, - "learning_rate": 0.0017774887834059432, - "loss": 1.4311, - "step": 855500 - }, - { - "epoch": 5.56603160153456, - "grad_norm": 0.69140625, - "learning_rate": 0.0017773587359386177, - "loss": 1.4269, - "step": 856000 - }, - { - "epoch": 5.569282788217699, - "grad_norm": 0.6171875, - "learning_rate": 0.001777228688471292, - "loss": 1.4299, - "step": 856500 - }, - { - "epoch": 5.572533974900839, - "grad_norm": 4.78125, - "learning_rate": 0.0017770986410039664, - "loss": 1.4351, - "step": 857000 - }, - { - "epoch": 5.575785161583978, - "grad_norm": 2.921875, - "learning_rate": 0.001776968593536641, - "loss": 1.4311, - "step": 857500 - }, - { - "epoch": 5.579036348267118, - "grad_norm": 1.125, - "learning_rate": 0.0017768385460693152, - "loss": 1.4392, - "step": 858000 - }, - { - "epoch": 5.5822875349502565, - "grad_norm": 1.65625, - "learning_rate": 0.0017767084986019897, - "loss": 1.4358, - "step": 858500 - }, - { - "epoch": 5.585538721633396, - "grad_norm": 1.09375, - "learning_rate": 0.0017765784511346641, - "loss": 1.4362, - "step": 859000 - }, - { - "epoch": 5.588789908316535, - "grad_norm": 0.7421875, - "learning_rate": 0.0017764484036673384, - "loss": 1.4434, - "step": 859500 - }, - { - "epoch": 5.592041094999675, - "grad_norm": 0.58984375, - "learning_rate": 0.001776318356200013, - "loss": 1.4344, - "step": 860000 - }, - { - "epoch": 5.595292281682815, - "grad_norm": 1.0625, - "learning_rate": 0.0017761883087326876, - "loss": 1.4339, - "step": 860500 - }, - { - "epoch": 5.598543468365953, - "grad_norm": 0.65234375, - "learning_rate": 0.001776058261265362, - "loss": 1.4316, - "step": 861000 - }, - { - "epoch": 5.601794655049093, - "grad_norm": 0.75390625, - "learning_rate": 0.0017759282137980363, - "loss": 1.4352, - "step": 861500 - }, - { - "epoch": 5.605045841732232, - "grad_norm": 0.97265625, - "learning_rate": 0.0017757981663307108, - "loss": 1.4341, - "step": 862000 - }, - { - "epoch": 5.608297028415372, - "grad_norm": 3.8125, - "learning_rate": 0.0017756681188633853, - "loss": 1.4367, - "step": 862500 - }, - { - "epoch": 5.6115482150985105, - "grad_norm": 0.7421875, - "learning_rate": 0.0017755380713960595, - "loss": 1.4371, - "step": 863000 - }, - { - "epoch": 5.61479940178165, - "grad_norm": 0.7734375, - "learning_rate": 0.001775408023928734, - "loss": 1.434, - "step": 863500 - }, - { - "epoch": 5.61805058846479, - "grad_norm": 0.7578125, - "learning_rate": 0.0017752779764614085, - 
"loss": 1.4328, - "step": 864000 - }, - { - "epoch": 5.621301775147929, - "grad_norm": 1.6171875, - "learning_rate": 0.0017751479289940828, - "loss": 1.4349, - "step": 864500 - }, - { - "epoch": 5.624552961831069, - "grad_norm": 1.0, - "learning_rate": 0.0017750178815267572, - "loss": 1.4316, - "step": 865000 - }, - { - "epoch": 5.627804148514207, - "grad_norm": 0.7421875, - "learning_rate": 0.0017748878340594317, - "loss": 1.4332, - "step": 865500 - }, - { - "epoch": 5.631055335197347, - "grad_norm": 1.1484375, - "learning_rate": 0.001774757786592106, - "loss": 1.4338, - "step": 866000 - }, - { - "epoch": 5.634306521880486, - "grad_norm": 0.9375, - "learning_rate": 0.0017746277391247805, - "loss": 1.4292, - "step": 866500 - }, - { - "epoch": 5.637557708563626, - "grad_norm": 0.7734375, - "learning_rate": 0.0017744976916574551, - "loss": 1.4338, - "step": 867000 - }, - { - "epoch": 5.640808895246765, - "grad_norm": 0.859375, - "learning_rate": 0.0017743676441901296, - "loss": 1.4284, - "step": 867500 - }, - { - "epoch": 5.644060081929904, - "grad_norm": 0.7109375, - "learning_rate": 0.0017742375967228039, - "loss": 1.4261, - "step": 868000 - }, - { - "epoch": 5.647311268613044, - "grad_norm": 1.390625, - "learning_rate": 0.0017741075492554784, - "loss": 1.4281, - "step": 868500 - }, - { - "epoch": 5.650562455296183, - "grad_norm": 0.71484375, - "learning_rate": 0.0017739775017881528, - "loss": 1.4258, - "step": 869000 - }, - { - "epoch": 5.653813641979323, - "grad_norm": 0.640625, - "learning_rate": 0.001773847454320827, - "loss": 1.4271, - "step": 869500 - }, - { - "epoch": 5.657064828662461, - "grad_norm": 0.74609375, - "learning_rate": 0.0017737174068535016, - "loss": 1.4319, - "step": 870000 - }, - { - "epoch": 5.660316015345601, - "grad_norm": 0.86328125, - "learning_rate": 0.001773587359386176, - "loss": 1.4392, - "step": 870500 - }, - { - "epoch": 5.663567202028741, - "grad_norm": 0.65625, - "learning_rate": 0.0017734573119188503, - "loss": 1.4391, - "step": 871000 - }, - { - "epoch": 5.66681838871188, - "grad_norm": 0.57421875, - "learning_rate": 0.0017733272644515248, - "loss": 1.4384, - "step": 871500 - }, - { - "epoch": 5.670069575395019, - "grad_norm": 0.69140625, - "learning_rate": 0.0017731972169841993, - "loss": 1.433, - "step": 872000 - }, - { - "epoch": 5.673320762078158, - "grad_norm": 0.671875, - "learning_rate": 0.0017730671695168735, - "loss": 1.4417, - "step": 872500 - }, - { - "epoch": 5.676571948761298, - "grad_norm": 0.62890625, - "learning_rate": 0.001772937122049548, - "loss": 1.4341, - "step": 873000 - }, - { - "epoch": 5.679823135444437, - "grad_norm": 1.03125, - "learning_rate": 0.0017728070745822225, - "loss": 1.4381, - "step": 873500 - }, - { - "epoch": 5.683074322127577, - "grad_norm": 0.63671875, - "learning_rate": 0.0017726770271148968, - "loss": 1.4375, - "step": 874000 - }, - { - "epoch": 5.686325508810716, - "grad_norm": 0.88671875, - "learning_rate": 0.0017725469796475715, - "loss": 1.4401, - "step": 874500 - }, - { - "epoch": 5.689576695493855, - "grad_norm": 0.85546875, - "learning_rate": 0.001772416932180246, - "loss": 1.4388, - "step": 875000 - }, - { - "epoch": 5.692827882176995, - "grad_norm": 1.6328125, - "learning_rate": 0.0017722868847129204, - "loss": 1.4357, - "step": 875500 - }, - { - "epoch": 5.696079068860134, - "grad_norm": 0.8671875, - "learning_rate": 0.0017721568372455947, - "loss": 1.4377, - "step": 876000 - }, - { - "epoch": 5.699330255543273, - "grad_norm": 0.94140625, - "learning_rate": 0.0017720267897782692, - "loss": 1.433, - 
"step": 876500 - }, - { - "epoch": 5.702581442226412, - "grad_norm": 4.15625, - "learning_rate": 0.0017718967423109436, - "loss": 1.4328, - "step": 877000 - }, - { - "epoch": 5.705832628909552, - "grad_norm": 0.89453125, - "learning_rate": 0.001771766694843618, - "loss": 1.4349, - "step": 877500 - }, - { - "epoch": 5.709083815592692, - "grad_norm": 0.71484375, - "learning_rate": 0.0017716366473762924, - "loss": 1.4363, - "step": 878000 - }, - { - "epoch": 5.712335002275831, - "grad_norm": 1.5546875, - "learning_rate": 0.0017715065999089669, - "loss": 1.4388, - "step": 878500 - }, - { - "epoch": 5.71558618895897, - "grad_norm": 0.78125, - "learning_rate": 0.0017713765524416411, - "loss": 1.4395, - "step": 879000 - }, - { - "epoch": 5.718837375642109, - "grad_norm": 0.62890625, - "learning_rate": 0.0017712465049743156, - "loss": 1.4353, - "step": 879500 - }, - { - "epoch": 5.722088562325249, - "grad_norm": 0.8515625, - "learning_rate": 0.00177111645750699, - "loss": 1.4373, - "step": 880000 - }, - { - "epoch": 5.725339749008388, - "grad_norm": 0.6953125, - "learning_rate": 0.0017709864100396643, - "loss": 1.4349, - "step": 880500 - }, - { - "epoch": 5.728590935691527, - "grad_norm": 0.65625, - "learning_rate": 0.0017708563625723388, - "loss": 1.4382, - "step": 881000 - }, - { - "epoch": 5.731842122374667, - "grad_norm": 0.71875, - "learning_rate": 0.0017707263151050135, - "loss": 1.4353, - "step": 881500 - }, - { - "epoch": 5.735093309057806, - "grad_norm": 0.7421875, - "learning_rate": 0.001770596267637688, - "loss": 1.4369, - "step": 882000 - }, - { - "epoch": 5.738344495740946, - "grad_norm": 0.69921875, - "learning_rate": 0.0017704662201703622, - "loss": 1.4347, - "step": 882500 - }, - { - "epoch": 5.741595682424085, - "grad_norm": 0.6484375, - "learning_rate": 0.0017703361727030367, - "loss": 1.4345, - "step": 883000 - }, - { - "epoch": 5.744846869107224, - "grad_norm": 0.82421875, - "learning_rate": 0.0017702061252357112, - "loss": 1.4395, - "step": 883500 - }, - { - "epoch": 5.748098055790363, - "grad_norm": 0.58984375, - "learning_rate": 0.0017700760777683855, - "loss": 1.4383, - "step": 884000 - }, - { - "epoch": 5.751349242473503, - "grad_norm": 0.68359375, - "learning_rate": 0.00176994603030106, - "loss": 1.4386, - "step": 884500 - }, - { - "epoch": 5.754600429156643, - "grad_norm": 1.25, - "learning_rate": 0.0017698159828337344, - "loss": 1.4323, - "step": 885000 - }, - { - "epoch": 5.7578516158397814, - "grad_norm": 0.6796875, - "learning_rate": 0.0017696859353664087, - "loss": 1.4471, - "step": 885500 - }, - { - "epoch": 5.761102802522921, - "grad_norm": 0.6953125, - "learning_rate": 0.0017695558878990832, - "loss": 1.437, - "step": 886000 - }, - { - "epoch": 5.76435398920606, - "grad_norm": 0.640625, - "learning_rate": 0.0017694258404317576, - "loss": 1.4366, - "step": 886500 - }, - { - "epoch": 5.7676051758892, - "grad_norm": 0.74609375, - "learning_rate": 0.001769295792964432, - "loss": 1.4318, - "step": 887000 - }, - { - "epoch": 5.770856362572339, - "grad_norm": 0.8671875, - "learning_rate": 0.0017691657454971064, - "loss": 1.4392, - "step": 887500 - }, - { - "epoch": 5.774107549255478, - "grad_norm": 0.80078125, - "learning_rate": 0.0017690356980297809, - "loss": 1.437, - "step": 888000 - }, - { - "epoch": 5.777358735938618, - "grad_norm": 0.8203125, - "learning_rate": 0.0017689056505624551, - "loss": 1.4328, - "step": 888500 - }, - { - "epoch": 5.780609922621757, - "grad_norm": 0.74609375, - "learning_rate": 0.0017687756030951298, - "loss": 1.4395, - "step": 889000 - }, 
- { - "epoch": 5.783861109304897, - "grad_norm": 0.96875, - "learning_rate": 0.0017686455556278043, - "loss": 1.4352, - "step": 889500 - }, - { - "epoch": 5.7871122959880354, - "grad_norm": 1.15625, - "learning_rate": 0.0017685155081604788, - "loss": 1.4312, - "step": 890000 - }, - { - "epoch": 5.790363482671175, - "grad_norm": 0.875, - "learning_rate": 0.001768385460693153, - "loss": 1.4387, - "step": 890500 - }, - { - "epoch": 5.793614669354314, - "grad_norm": 1.109375, - "learning_rate": 0.0017682554132258275, - "loss": 1.4412, - "step": 891000 - }, - { - "epoch": 5.796865856037454, - "grad_norm": 0.6796875, - "learning_rate": 0.001768125365758502, - "loss": 1.4367, - "step": 891500 - }, - { - "epoch": 5.8001170427205935, - "grad_norm": 0.8359375, - "learning_rate": 0.0017679953182911763, - "loss": 1.4418, - "step": 892000 - }, - { - "epoch": 5.803368229403732, - "grad_norm": 0.7578125, - "learning_rate": 0.0017678652708238507, - "loss": 1.4393, - "step": 892500 - }, - { - "epoch": 5.806619416086872, - "grad_norm": 0.71875, - "learning_rate": 0.0017677352233565252, - "loss": 1.4412, - "step": 893000 - }, - { - "epoch": 5.809870602770011, - "grad_norm": 0.81640625, - "learning_rate": 0.0017676051758891995, - "loss": 1.4506, - "step": 893500 - }, - { - "epoch": 5.813121789453151, - "grad_norm": 0.73046875, - "learning_rate": 0.001767475128421874, - "loss": 1.4387, - "step": 894000 - }, - { - "epoch": 5.8163729761362895, - "grad_norm": 1.046875, - "learning_rate": 0.0017673450809545484, - "loss": 1.4368, - "step": 894500 - }, - { - "epoch": 5.819624162819429, - "grad_norm": 2.6875, - "learning_rate": 0.0017672150334872227, - "loss": 1.4371, - "step": 895000 - }, - { - "epoch": 5.822875349502569, - "grad_norm": 1.2109375, - "learning_rate": 0.0017670849860198972, - "loss": 1.4433, - "step": 895500 - }, - { - "epoch": 5.826126536185708, - "grad_norm": 0.73828125, - "learning_rate": 0.0017669549385525719, - "loss": 1.4398, - "step": 896000 - }, - { - "epoch": 5.8293777228688475, - "grad_norm": 0.9609375, - "learning_rate": 0.0017668248910852463, - "loss": 1.4315, - "step": 896500 - }, - { - "epoch": 5.832628909551986, - "grad_norm": 0.71875, - "learning_rate": 0.0017666948436179206, - "loss": 1.4369, - "step": 897000 - }, - { - "epoch": 5.835880096235126, - "grad_norm": 0.74609375, - "learning_rate": 0.001766564796150595, - "loss": 1.4403, - "step": 897500 - }, - { - "epoch": 5.839131282918265, - "grad_norm": 0.859375, - "learning_rate": 0.0017664347486832696, - "loss": 1.4389, - "step": 898000 - }, - { - "epoch": 5.842382469601405, - "grad_norm": 0.88671875, - "learning_rate": 0.0017663047012159438, - "loss": 1.44, - "step": 898500 - }, - { - "epoch": 5.845633656284544, - "grad_norm": 0.8359375, - "learning_rate": 0.0017661746537486183, - "loss": 1.4428, - "step": 899000 - }, - { - "epoch": 5.848884842967683, - "grad_norm": 0.7734375, - "learning_rate": 0.0017660446062812928, - "loss": 1.4401, - "step": 899500 - }, - { - "epoch": 5.852136029650823, - "grad_norm": 1.09375, - "learning_rate": 0.001765914558813967, - "loss": 1.4453, - "step": 900000 - }, - { - "epoch": 5.855387216333962, - "grad_norm": 0.8125, - "learning_rate": 0.0017657845113466415, - "loss": 1.4403, - "step": 900500 - }, - { - "epoch": 5.8586384030171015, - "grad_norm": 0.61328125, - "learning_rate": 0.001765654463879316, - "loss": 1.4371, - "step": 901000 - }, - { - "epoch": 5.86188958970024, - "grad_norm": 1.2890625, - "learning_rate": 0.0017655244164119903, - "loss": 1.4415, - "step": 901500 - }, - { - "epoch": 
5.86514077638338, - "grad_norm": 0.76171875, - "learning_rate": 0.0017653943689446647, - "loss": 1.4538, - "step": 902000 - }, - { - "epoch": 5.86839196306652, - "grad_norm": 1.0625, - "learning_rate": 0.0017652643214773392, - "loss": 1.4442, - "step": 902500 - }, - { - "epoch": 5.871643149749659, - "grad_norm": 0.6484375, - "learning_rate": 0.0017651342740100135, - "loss": 1.4423, - "step": 903000 - }, - { - "epoch": 5.874894336432798, - "grad_norm": 0.66796875, - "learning_rate": 0.0017650042265426882, - "loss": 1.4468, - "step": 903500 - }, - { - "epoch": 5.878145523115937, - "grad_norm": 0.703125, - "learning_rate": 0.0017648741790753627, - "loss": 1.4504, - "step": 904000 - }, - { - "epoch": 5.881396709799077, - "grad_norm": 0.765625, - "learning_rate": 0.0017647441316080371, - "loss": 1.4504, - "step": 904500 - }, - { - "epoch": 5.884647896482216, - "grad_norm": 1.0390625, - "learning_rate": 0.0017646140841407114, - "loss": 1.4506, - "step": 905000 - }, - { - "epoch": 5.8878990831653555, - "grad_norm": 1.1015625, - "learning_rate": 0.0017644840366733859, - "loss": 1.4464, - "step": 905500 - }, - { - "epoch": 5.891150269848494, - "grad_norm": 0.796875, - "learning_rate": 0.0017643539892060604, - "loss": 1.4466, - "step": 906000 - }, - { - "epoch": 5.894401456531634, - "grad_norm": 0.859375, - "learning_rate": 0.0017642239417387346, - "loss": 1.4492, - "step": 906500 - }, - { - "epoch": 5.897652643214773, - "grad_norm": 1.2265625, - "learning_rate": 0.001764093894271409, - "loss": 1.4484, - "step": 907000 - }, - { - "epoch": 5.900903829897913, - "grad_norm": 0.75, - "learning_rate": 0.0017639638468040836, - "loss": 1.453, - "step": 907500 - }, - { - "epoch": 5.904155016581052, - "grad_norm": 1.234375, - "learning_rate": 0.0017638337993367578, - "loss": 1.4505, - "step": 908000 - }, - { - "epoch": 5.907406203264191, - "grad_norm": 0.96484375, - "learning_rate": 0.0017637037518694323, - "loss": 1.4462, - "step": 908500 - }, - { - "epoch": 5.910657389947331, - "grad_norm": 0.6796875, - "learning_rate": 0.0017635737044021068, - "loss": 1.4504, - "step": 909000 - }, - { - "epoch": 5.91390857663047, - "grad_norm": 0.66796875, - "learning_rate": 0.001763443656934781, - "loss": 1.4482, - "step": 909500 - }, - { - "epoch": 5.9171597633136095, - "grad_norm": 0.57421875, - "learning_rate": 0.0017633136094674555, - "loss": 1.448, - "step": 910000 - }, - { - "epoch": 5.920410949996748, - "grad_norm": 1.0078125, - "learning_rate": 0.0017631835620001302, - "loss": 1.4487, - "step": 910500 - }, - { - "epoch": 5.923662136679888, - "grad_norm": 0.8125, - "learning_rate": 0.0017630535145328047, - "loss": 1.4487, - "step": 911000 - }, - { - "epoch": 5.926913323363028, - "grad_norm": 2.140625, - "learning_rate": 0.001762923467065479, - "loss": 1.451, - "step": 911500 - }, - { - "epoch": 5.930164510046167, - "grad_norm": 0.6640625, - "learning_rate": 0.0017627934195981534, - "loss": 1.4477, - "step": 912000 - }, - { - "epoch": 5.933415696729306, - "grad_norm": 0.94140625, - "learning_rate": 0.001762663372130828, - "loss": 1.4422, - "step": 912500 - }, - { - "epoch": 5.936666883412445, - "grad_norm": 0.67578125, - "learning_rate": 0.0017625333246635022, - "loss": 1.4406, - "step": 913000 - }, - { - "epoch": 5.939918070095585, - "grad_norm": 0.65625, - "learning_rate": 0.0017624032771961767, - "loss": 1.4463, - "step": 913500 - }, - { - "epoch": 5.943169256778724, - "grad_norm": 0.66015625, - "learning_rate": 0.0017622732297288511, - "loss": 1.4441, - "step": 914000 - }, - { - "epoch": 5.9464204434618635, - 
"grad_norm": 0.96484375, - "learning_rate": 0.0017621431822615254, - "loss": 1.4461, - "step": 914500 - }, - { - "epoch": 5.949671630145003, - "grad_norm": 0.69921875, - "learning_rate": 0.0017620131347941999, - "loss": 1.4471, - "step": 915000 - }, - { - "epoch": 5.952922816828142, - "grad_norm": 3.171875, - "learning_rate": 0.0017618830873268744, - "loss": 1.4448, - "step": 915500 - }, - { - "epoch": 5.956174003511282, - "grad_norm": 0.6640625, - "learning_rate": 0.0017617530398595486, - "loss": 1.4441, - "step": 916000 - }, - { - "epoch": 5.959425190194421, - "grad_norm": 0.875, - "learning_rate": 0.001761622992392223, - "loss": 1.4413, - "step": 916500 - }, - { - "epoch": 5.96267637687756, - "grad_norm": 0.61328125, - "learning_rate": 0.0017614929449248976, - "loss": 1.4439, - "step": 917000 - }, - { - "epoch": 5.965927563560699, - "grad_norm": 0.75, - "learning_rate": 0.0017613628974575718, - "loss": 1.4338, - "step": 917500 - }, - { - "epoch": 5.969178750243839, - "grad_norm": 0.79296875, - "learning_rate": 0.0017612328499902465, - "loss": 1.4402, - "step": 918000 - }, - { - "epoch": 5.972429936926979, - "grad_norm": 0.77734375, - "learning_rate": 0.001761102802522921, - "loss": 1.4474, - "step": 918500 - }, - { - "epoch": 5.9756811236101175, - "grad_norm": 0.7890625, - "learning_rate": 0.0017609727550555955, - "loss": 1.4411, - "step": 919000 - }, - { - "epoch": 5.978932310293257, - "grad_norm": 1.5390625, - "learning_rate": 0.0017608427075882698, - "loss": 1.4379, - "step": 919500 - }, - { - "epoch": 5.982183496976396, - "grad_norm": 0.828125, - "learning_rate": 0.0017607126601209442, - "loss": 1.4487, - "step": 920000 - }, - { - "epoch": 5.985434683659536, - "grad_norm": 0.66015625, - "learning_rate": 0.0017605826126536187, - "loss": 1.4472, - "step": 920500 - }, - { - "epoch": 5.988685870342675, - "grad_norm": 1.015625, - "learning_rate": 0.001760452565186293, - "loss": 1.4468, - "step": 921000 - }, - { - "epoch": 5.991937057025814, - "grad_norm": 0.9296875, - "learning_rate": 0.0017603225177189675, - "loss": 1.443, - "step": 921500 - }, - { - "epoch": 5.995188243708954, - "grad_norm": 0.5859375, - "learning_rate": 0.001760192470251642, - "loss": 1.4388, - "step": 922000 - }, - { - "epoch": 5.998439430392093, - "grad_norm": 1.296875, - "learning_rate": 0.0017600624227843162, - "loss": 1.443, - "step": 922500 - }, - { - "epoch": 6.0, - "eval_loss": 1.4233994483947754, - "eval_runtime": 0.5293, - "eval_samples_per_second": 1889.415, - "eval_steps_per_second": 30.231, - "step": 922740 - }, - { - "epoch": 6.001690617075233, - "grad_norm": 0.6171875, - "learning_rate": 0.0017599323753169907, - "loss": 1.4447, - "step": 923000 - }, - { - "epoch": 6.0049418037583715, - "grad_norm": 0.89453125, - "learning_rate": 0.0017598023278496651, - "loss": 1.4429, - "step": 923500 - }, - { - "epoch": 6.008192990441511, - "grad_norm": 3.890625, - "learning_rate": 0.0017596722803823394, - "loss": 1.4379, - "step": 924000 - }, - { - "epoch": 6.01144417712465, - "grad_norm": 3.109375, - "learning_rate": 0.0017595422329150139, - "loss": 1.4337, - "step": 924500 - }, - { - "epoch": 6.01469536380779, - "grad_norm": 2.984375, - "learning_rate": 0.0017594121854476886, - "loss": 1.4372, - "step": 925000 - }, - { - "epoch": 6.0179465504909295, - "grad_norm": 0.94921875, - "learning_rate": 0.001759282137980363, - "loss": 1.4368, - "step": 925500 - }, - { - "epoch": 6.021197737174068, - "grad_norm": 0.921875, - "learning_rate": 0.0017591520905130373, - "loss": 1.4414, - "step": 926000 - }, - { - "epoch": 
6.024448923857208, - "grad_norm": 0.7265625, - "learning_rate": 0.0017590220430457118, - "loss": 1.4377, - "step": 926500 - }, - { - "epoch": 6.027700110540347, - "grad_norm": 0.73046875, - "learning_rate": 0.0017588919955783863, - "loss": 1.4385, - "step": 927000 - }, - { - "epoch": 6.030951297223487, - "grad_norm": 0.8828125, - "learning_rate": 0.0017587619481110605, - "loss": 1.437, - "step": 927500 - }, - { - "epoch": 6.0342024839066255, - "grad_norm": 0.72265625, - "learning_rate": 0.001758631900643735, - "loss": 1.4363, - "step": 928000 - }, - { - "epoch": 6.037453670589765, - "grad_norm": 0.734375, - "learning_rate": 0.0017585018531764095, - "loss": 1.4425, - "step": 928500 - }, - { - "epoch": 6.040704857272905, - "grad_norm": 0.7890625, - "learning_rate": 0.0017583718057090838, - "loss": 1.4346, - "step": 929000 - }, - { - "epoch": 6.043956043956044, - "grad_norm": 1.0546875, - "learning_rate": 0.0017582417582417582, - "loss": 1.4351, - "step": 929500 - }, - { - "epoch": 6.0472072306391835, - "grad_norm": 0.734375, - "learning_rate": 0.0017581117107744327, - "loss": 1.4381, - "step": 930000 - }, - { - "epoch": 6.050458417322322, - "grad_norm": 1.2890625, - "learning_rate": 0.001757981663307107, - "loss": 1.4369, - "step": 930500 - }, - { - "epoch": 6.053709604005462, - "grad_norm": 0.68359375, - "learning_rate": 0.0017578516158397815, - "loss": 1.4339, - "step": 931000 - }, - { - "epoch": 6.056960790688601, - "grad_norm": 1.5234375, - "learning_rate": 0.001757721568372456, - "loss": 1.432, - "step": 931500 - }, - { - "epoch": 6.060211977371741, - "grad_norm": 0.64453125, - "learning_rate": 0.0017575915209051302, - "loss": 1.429, - "step": 932000 - }, - { - "epoch": 6.06346316405488, - "grad_norm": 0.72265625, - "learning_rate": 0.001757461473437805, - "loss": 1.4366, - "step": 932500 - }, - { - "epoch": 6.066714350738019, - "grad_norm": 1.3828125, - "learning_rate": 0.0017573314259704794, - "loss": 1.4358, - "step": 933000 - }, - { - "epoch": 6.069965537421159, - "grad_norm": 0.8984375, - "learning_rate": 0.0017572013785031539, - "loss": 1.4321, - "step": 933500 - }, - { - "epoch": 6.073216724104298, - "grad_norm": 0.6796875, - "learning_rate": 0.0017570713310358281, - "loss": 1.4307, - "step": 934000 - }, - { - "epoch": 6.0764679107874375, - "grad_norm": 0.734375, - "learning_rate": 0.0017569412835685026, - "loss": 1.4316, - "step": 934500 - }, - { - "epoch": 6.079719097470576, - "grad_norm": 0.5859375, - "learning_rate": 0.001756811236101177, - "loss": 1.4334, - "step": 935000 - }, - { - "epoch": 6.082970284153716, - "grad_norm": 15.6875, - "learning_rate": 0.0017566811886338513, - "loss": 1.4387, - "step": 935500 - }, - { - "epoch": 6.086221470836856, - "grad_norm": 1.28125, - "learning_rate": 0.0017565511411665258, - "loss": 1.4378, - "step": 936000 - }, - { - "epoch": 6.089472657519995, - "grad_norm": 1.3203125, - "learning_rate": 0.0017564210936992003, - "loss": 1.438, - "step": 936500 - }, - { - "epoch": 6.092723844203134, - "grad_norm": 0.94921875, - "learning_rate": 0.0017562910462318746, - "loss": 1.4442, - "step": 937000 - }, - { - "epoch": 6.095975030886273, - "grad_norm": 0.6953125, - "learning_rate": 0.001756160998764549, - "loss": 1.4395, - "step": 937500 - }, - { - "epoch": 6.099226217569413, - "grad_norm": 0.69140625, - "learning_rate": 0.0017560309512972235, - "loss": 1.4356, - "step": 938000 - }, - { - "epoch": 6.102477404252552, - "grad_norm": 0.8125, - "learning_rate": 0.0017559009038298978, - "loss": 1.4354, - "step": 938500 - }, - { - "epoch": 
6.1057285909356915, - "grad_norm": 1.59375, - "learning_rate": 0.0017557708563625722, - "loss": 1.4383, - "step": 939000 - }, - { - "epoch": 6.108979777618831, - "grad_norm": 0.78515625, - "learning_rate": 0.001755640808895247, - "loss": 1.4348, - "step": 939500 - }, - { - "epoch": 6.11223096430197, - "grad_norm": 0.82421875, - "learning_rate": 0.0017555107614279214, - "loss": 1.4367, - "step": 940000 - }, - { - "epoch": 6.11548215098511, - "grad_norm": 1.3828125, - "learning_rate": 0.0017553807139605957, - "loss": 1.4381, - "step": 940500 - }, - { - "epoch": 6.118733337668249, - "grad_norm": 0.89453125, - "learning_rate": 0.0017552506664932702, - "loss": 1.4336, - "step": 941000 - }, - { - "epoch": 6.121984524351388, - "grad_norm": 0.9140625, - "learning_rate": 0.0017551206190259446, - "loss": 1.4387, - "step": 941500 - }, - { - "epoch": 6.125235711034527, - "grad_norm": 0.640625, - "learning_rate": 0.001754990571558619, - "loss": 1.4386, - "step": 942000 - }, - { - "epoch": 6.128486897717667, - "grad_norm": 0.96875, - "learning_rate": 0.0017548605240912934, - "loss": 1.4418, - "step": 942500 - }, - { - "epoch": 6.131738084400807, - "grad_norm": 0.87109375, - "learning_rate": 0.0017547304766239679, - "loss": 1.4401, - "step": 943000 - }, - { - "epoch": 6.1349892710839455, - "grad_norm": 4.8125, - "learning_rate": 0.0017546004291566421, - "loss": 1.4466, - "step": 943500 - }, - { - "epoch": 6.138240457767085, - "grad_norm": 0.8203125, - "learning_rate": 0.0017544703816893166, - "loss": 1.4419, - "step": 944000 - }, - { - "epoch": 6.141491644450224, - "grad_norm": 0.69140625, - "learning_rate": 0.001754340334221991, - "loss": 1.4366, - "step": 944500 - }, - { - "epoch": 6.144742831133364, - "grad_norm": 0.99609375, - "learning_rate": 0.0017542102867546653, - "loss": 1.4378, - "step": 945000 - }, - { - "epoch": 6.147994017816503, - "grad_norm": 2.484375, - "learning_rate": 0.0017540802392873398, - "loss": 1.4372, - "step": 945500 - }, - { - "epoch": 6.151245204499642, - "grad_norm": 0.92578125, - "learning_rate": 0.0017539501918200143, - "loss": 1.4418, - "step": 946000 - }, - { - "epoch": 6.154496391182782, - "grad_norm": 0.70703125, - "learning_rate": 0.0017538201443526886, - "loss": 1.4448, - "step": 946500 - }, - { - "epoch": 6.157747577865921, - "grad_norm": 0.82421875, - "learning_rate": 0.0017536900968853633, - "loss": 1.4402, - "step": 947000 - }, - { - "epoch": 6.160998764549061, - "grad_norm": 0.73046875, - "learning_rate": 0.0017535600494180377, - "loss": 1.44, - "step": 947500 - }, - { - "epoch": 6.1642499512321995, - "grad_norm": 0.7421875, - "learning_rate": 0.0017534300019507122, - "loss": 1.4443, - "step": 948000 - }, - { - "epoch": 6.167501137915339, - "grad_norm": 0.59765625, - "learning_rate": 0.0017532999544833865, - "loss": 1.4419, - "step": 948500 - }, - { - "epoch": 6.170752324598478, - "grad_norm": 1.5234375, - "learning_rate": 0.001753169907016061, - "loss": 1.4433, - "step": 949000 - }, - { - "epoch": 6.174003511281618, - "grad_norm": 0.6796875, - "learning_rate": 0.0017530398595487354, - "loss": 1.4432, - "step": 949500 - }, - { - "epoch": 6.177254697964758, - "grad_norm": 0.83203125, - "learning_rate": 0.0017529098120814097, - "loss": 1.4434, - "step": 950000 - }, - { - "epoch": 6.180505884647896, - "grad_norm": 0.8828125, - "learning_rate": 0.0017527797646140842, - "loss": 1.4415, - "step": 950500 - }, - { - "epoch": 6.183757071331036, - "grad_norm": 1.4296875, - "learning_rate": 0.0017526497171467586, - "loss": 1.4471, - "step": 951000 - }, - { - "epoch": 
6.187008258014175, - "grad_norm": 0.875, - "learning_rate": 0.001752519669679433, - "loss": 1.4471, - "step": 951500 - }, - { - "epoch": 6.190259444697315, - "grad_norm": 0.796875, - "learning_rate": 0.0017523896222121074, - "loss": 1.4398, - "step": 952000 - }, - { - "epoch": 6.1935106313804535, - "grad_norm": 0.6328125, - "learning_rate": 0.0017522595747447819, - "loss": 1.4425, - "step": 952500 - }, - { - "epoch": 6.196761818063593, - "grad_norm": 0.73046875, - "learning_rate": 0.0017521295272774561, - "loss": 1.4445, - "step": 953000 - }, - { - "epoch": 6.200013004746733, - "grad_norm": 0.609375, - "learning_rate": 0.0017519994798101306, - "loss": 1.447, - "step": 953500 - }, - { - "epoch": 6.203264191429872, - "grad_norm": 0.65234375, - "learning_rate": 0.0017518694323428053, - "loss": 1.4354, - "step": 954000 - }, - { - "epoch": 6.206515378113012, - "grad_norm": 1.0234375, - "learning_rate": 0.0017517393848754798, - "loss": 1.436, - "step": 954500 - }, - { - "epoch": 6.20976656479615, - "grad_norm": 1.140625, - "learning_rate": 0.001751609337408154, - "loss": 1.4396, - "step": 955000 - }, - { - "epoch": 6.21301775147929, - "grad_norm": 0.6171875, - "learning_rate": 0.0017514792899408285, - "loss": 1.4427, - "step": 955500 - }, - { - "epoch": 6.216268938162429, - "grad_norm": 0.8203125, - "learning_rate": 0.001751349242473503, - "loss": 1.4362, - "step": 956000 - }, - { - "epoch": 6.219520124845569, - "grad_norm": 0.7890625, - "learning_rate": 0.0017512191950061773, - "loss": 1.4419, - "step": 956500 - }, - { - "epoch": 6.2227713115287075, - "grad_norm": 0.7578125, - "learning_rate": 0.0017510891475388517, - "loss": 1.4488, - "step": 957000 - }, - { - "epoch": 6.226022498211847, - "grad_norm": 0.76171875, - "learning_rate": 0.0017509591000715262, - "loss": 1.4408, - "step": 957500 - }, - { - "epoch": 6.229273684894987, - "grad_norm": 0.796875, - "learning_rate": 0.0017508290526042005, - "loss": 1.4444, - "step": 958000 - }, - { - "epoch": 6.232524871578126, - "grad_norm": 0.90234375, - "learning_rate": 0.001750699005136875, - "loss": 1.448, - "step": 958500 - }, - { - "epoch": 6.235776058261266, - "grad_norm": 2.6875, - "learning_rate": 0.0017505689576695494, - "loss": 1.4426, - "step": 959000 - }, - { - "epoch": 6.239027244944404, - "grad_norm": 0.7734375, - "learning_rate": 0.0017504389102022237, - "loss": 1.4409, - "step": 959500 - }, - { - "epoch": 6.242278431627544, - "grad_norm": 0.625, - "learning_rate": 0.0017503088627348982, - "loss": 1.4349, - "step": 960000 - }, - { - "epoch": 6.245529618310683, - "grad_norm": 16.875, - "learning_rate": 0.0017501788152675727, - "loss": 1.4433, - "step": 960500 - }, - { - "epoch": 6.248780804993823, - "grad_norm": 1.5390625, - "learning_rate": 0.001750048767800247, - "loss": 1.4417, - "step": 961000 - }, - { - "epoch": 6.252031991676962, - "grad_norm": 0.79296875, - "learning_rate": 0.0017499187203329216, - "loss": 1.4414, - "step": 961500 - }, - { - "epoch": 6.255283178360101, - "grad_norm": 0.69921875, - "learning_rate": 0.001749788672865596, - "loss": 1.4424, - "step": 962000 - }, - { - "epoch": 6.258534365043241, - "grad_norm": 0.83984375, - "learning_rate": 0.0017496586253982706, - "loss": 1.4417, - "step": 962500 - }, - { - "epoch": 6.26178555172638, - "grad_norm": 0.65234375, - "learning_rate": 0.0017495285779309448, - "loss": 1.4416, - "step": 963000 - }, - { - "epoch": 6.26503673840952, - "grad_norm": 1.7421875, - "learning_rate": 0.0017493985304636193, - "loss": 1.4448, - "step": 963500 - }, - { - "epoch": 6.268287925092658, - 
"grad_norm": 1.1484375, - "learning_rate": 0.0017492684829962938, - "loss": 1.4438, - "step": 964000 - }, - { - "epoch": 6.271539111775798, - "grad_norm": 1.0625, - "learning_rate": 0.001749138435528968, - "loss": 1.443, - "step": 964500 - }, - { - "epoch": 6.274790298458938, - "grad_norm": 0.92578125, - "learning_rate": 0.0017490083880616425, - "loss": 1.4432, - "step": 965000 - }, - { - "epoch": 6.278041485142077, - "grad_norm": 1.8046875, - "learning_rate": 0.001748878340594317, - "loss": 1.4418, - "step": 965500 - }, - { - "epoch": 6.2812926718252164, - "grad_norm": 0.78515625, - "learning_rate": 0.0017487482931269913, - "loss": 1.4437, - "step": 966000 - }, - { - "epoch": 6.284543858508355, - "grad_norm": 0.7109375, - "learning_rate": 0.0017486182456596657, - "loss": 1.4367, - "step": 966500 - }, - { - "epoch": 6.287795045191495, - "grad_norm": 0.9375, - "learning_rate": 0.0017484881981923402, - "loss": 1.4362, - "step": 967000 - }, - { - "epoch": 6.291046231874634, - "grad_norm": 0.765625, - "learning_rate": 0.0017483581507250145, - "loss": 1.443, - "step": 967500 - }, - { - "epoch": 6.294297418557774, - "grad_norm": 0.8046875, - "learning_rate": 0.001748228103257689, - "loss": 1.4402, - "step": 968000 - }, - { - "epoch": 6.297548605240913, - "grad_norm": 0.8203125, - "learning_rate": 0.0017480980557903637, - "loss": 1.4419, - "step": 968500 - }, - { - "epoch": 6.300799791924052, - "grad_norm": 1.0390625, - "learning_rate": 0.0017479680083230381, - "loss": 1.4365, - "step": 969000 - }, - { - "epoch": 6.304050978607192, - "grad_norm": 1.203125, - "learning_rate": 0.0017478379608557124, - "loss": 1.436, - "step": 969500 - }, - { - "epoch": 6.307302165290331, - "grad_norm": 0.71484375, - "learning_rate": 0.0017477079133883869, - "loss": 1.4406, - "step": 970000 - }, - { - "epoch": 6.3105533519734704, - "grad_norm": 0.9375, - "learning_rate": 0.0017475778659210614, - "loss": 1.4384, - "step": 970500 - }, - { - "epoch": 6.313804538656609, - "grad_norm": 0.671875, - "learning_rate": 0.0017474478184537356, - "loss": 1.4433, - "step": 971000 - }, - { - "epoch": 6.317055725339749, - "grad_norm": 3.265625, - "learning_rate": 0.00174731777098641, - "loss": 1.4363, - "step": 971500 - }, - { - "epoch": 6.320306912022889, - "grad_norm": 0.71484375, - "learning_rate": 0.0017471877235190846, - "loss": 1.4395, - "step": 972000 - }, - { - "epoch": 6.323558098706028, - "grad_norm": 1.4453125, - "learning_rate": 0.0017470576760517588, - "loss": 1.4361, - "step": 972500 - }, - { - "epoch": 6.326809285389167, - "grad_norm": 0.68359375, - "learning_rate": 0.0017469276285844333, - "loss": 1.4345, - "step": 973000 - }, - { - "epoch": 6.330060472072306, - "grad_norm": 0.6484375, - "learning_rate": 0.0017467975811171078, - "loss": 1.4347, - "step": 973500 - }, - { - "epoch": 6.333311658755446, - "grad_norm": 0.9921875, - "learning_rate": 0.001746667533649782, - "loss": 1.4364, - "step": 974000 - }, - { - "epoch": 6.336562845438585, - "grad_norm": 1.1953125, - "learning_rate": 0.0017465374861824565, - "loss": 1.4367, - "step": 974500 - }, - { - "epoch": 6.3398140321217245, - "grad_norm": 5.71875, - "learning_rate": 0.001746407438715131, - "loss": 1.4332, - "step": 975000 - }, - { - "epoch": 6.343065218804864, - "grad_norm": 0.9921875, - "learning_rate": 0.0017462773912478053, - "loss": 1.4422, - "step": 975500 - }, - { - "epoch": 6.346316405488003, - "grad_norm": 0.62109375, - "learning_rate": 0.00174614734378048, - "loss": 1.4358, - "step": 976000 - }, - { - "epoch": 6.349567592171143, - "grad_norm": 
1.5234375, - "learning_rate": 0.0017460172963131545, - "loss": 1.4357, - "step": 976500 - }, - { - "epoch": 6.352818778854282, - "grad_norm": 1.15625, - "learning_rate": 0.001745887248845829, - "loss": 1.4352, - "step": 977000 - }, - { - "epoch": 6.356069965537421, - "grad_norm": 0.98046875, - "learning_rate": 0.0017457572013785032, - "loss": 1.4334, - "step": 977500 - }, - { - "epoch": 6.35932115222056, - "grad_norm": 0.796875, - "learning_rate": 0.0017456271539111777, - "loss": 1.4317, - "step": 978000 - }, - { - "epoch": 6.3625723389037, - "grad_norm": 0.86328125, - "learning_rate": 0.0017454971064438521, - "loss": 1.4379, - "step": 978500 - }, - { - "epoch": 6.365823525586839, - "grad_norm": 0.7265625, - "learning_rate": 0.0017453670589765264, - "loss": 1.4336, - "step": 979000 - }, - { - "epoch": 6.3690747122699785, - "grad_norm": 0.72265625, - "learning_rate": 0.0017452370115092009, - "loss": 1.4379, - "step": 979500 - }, - { - "epoch": 6.372325898953118, - "grad_norm": 0.71484375, - "learning_rate": 0.0017451069640418754, - "loss": 1.4391, - "step": 980000 - }, - { - "epoch": 6.375577085636257, - "grad_norm": 0.72265625, - "learning_rate": 0.0017449769165745496, - "loss": 1.4391, - "step": 980500 - }, - { - "epoch": 6.378828272319397, - "grad_norm": 4.0625, - "learning_rate": 0.001744846869107224, - "loss": 1.4431, - "step": 981000 - }, - { - "epoch": 6.382079459002536, - "grad_norm": 0.95703125, - "learning_rate": 0.0017447168216398986, - "loss": 1.4351, - "step": 981500 - }, - { - "epoch": 6.385330645685675, - "grad_norm": 0.67578125, - "learning_rate": 0.0017445867741725728, - "loss": 1.4387, - "step": 982000 - }, - { - "epoch": 6.388581832368814, - "grad_norm": 0.67578125, - "learning_rate": 0.0017444567267052473, - "loss": 1.4328, - "step": 982500 - }, - { - "epoch": 6.391833019051954, - "grad_norm": 0.6328125, - "learning_rate": 0.001744326679237922, - "loss": 1.4343, - "step": 983000 - }, - { - "epoch": 6.395084205735094, - "grad_norm": 0.73828125, - "learning_rate": 0.0017441966317705965, - "loss": 1.4335, - "step": 983500 - }, - { - "epoch": 6.3983353924182325, - "grad_norm": 0.8359375, - "learning_rate": 0.0017440665843032708, - "loss": 1.435, - "step": 984000 - }, - { - "epoch": 6.401586579101372, - "grad_norm": 0.6484375, - "learning_rate": 0.0017439365368359452, - "loss": 1.4415, - "step": 984500 - }, - { - "epoch": 6.404837765784511, - "grad_norm": 0.69921875, - "learning_rate": 0.0017438064893686197, - "loss": 1.443, - "step": 985000 - }, - { - "epoch": 6.408088952467651, - "grad_norm": 1.71875, - "learning_rate": 0.001743676441901294, - "loss": 1.4426, - "step": 985500 - }, - { - "epoch": 6.41134013915079, - "grad_norm": 15.4375, - "learning_rate": 0.0017435463944339685, - "loss": 1.4405, - "step": 986000 - }, - { - "epoch": 6.414591325833929, - "grad_norm": 0.7734375, - "learning_rate": 0.001743416346966643, - "loss": 1.4408, - "step": 986500 - }, - { - "epoch": 6.417842512517069, - "grad_norm": 0.6875, - "learning_rate": 0.0017432862994993172, - "loss": 1.4509, - "step": 987000 - }, - { - "epoch": 6.421093699200208, - "grad_norm": 0.95703125, - "learning_rate": 0.0017431562520319917, - "loss": 1.4435, - "step": 987500 - }, - { - "epoch": 6.424344885883348, - "grad_norm": 1.0234375, - "learning_rate": 0.0017430262045646662, - "loss": 1.4417, - "step": 988000 - }, - { - "epoch": 6.4275960725664865, - "grad_norm": 0.71875, - "learning_rate": 0.0017428961570973404, - "loss": 1.4566, - "step": 988500 - }, - { - "epoch": 6.430847259249626, - "grad_norm": 0.66015625, - 
"learning_rate": 0.001742766109630015, - "loss": 1.4679, - "step": 989000 - }, - { - "epoch": 6.434098445932765, - "grad_norm": 0.65234375, - "learning_rate": 0.0017426360621626894, - "loss": 1.4754, - "step": 989500 - }, - { - "epoch": 6.437349632615905, - "grad_norm": 0.68359375, - "learning_rate": 0.0017425060146953636, - "loss": 1.4804, - "step": 990000 - }, - { - "epoch": 6.4406008192990445, - "grad_norm": 0.87109375, - "learning_rate": 0.0017423759672280383, - "loss": 1.4847, - "step": 990500 - }, - { - "epoch": 6.443852005982183, - "grad_norm": 1.765625, - "learning_rate": 0.0017422459197607128, - "loss": 1.4757, - "step": 991000 - }, - { - "epoch": 6.447103192665323, - "grad_norm": 0.91796875, - "learning_rate": 0.0017421158722933873, - "loss": 1.4749, - "step": 991500 - }, - { - "epoch": 6.450354379348462, - "grad_norm": 0.8515625, - "learning_rate": 0.0017419858248260615, - "loss": 1.4674, - "step": 992000 - }, - { - "epoch": 6.453605566031602, - "grad_norm": 1.21875, - "learning_rate": 0.001741855777358736, - "loss": 1.4648, - "step": 992500 - }, - { - "epoch": 6.4568567527147405, - "grad_norm": 0.69921875, - "learning_rate": 0.0017417257298914105, - "loss": 1.453, - "step": 993000 - }, - { - "epoch": 6.46010793939788, - "grad_norm": 3.25, - "learning_rate": 0.0017415956824240848, - "loss": 1.4638, - "step": 993500 - }, - { - "epoch": 6.46335912608102, - "grad_norm": 0.8828125, - "learning_rate": 0.0017414656349567592, - "loss": 1.4796, - "step": 994000 - }, - { - "epoch": 6.466610312764159, - "grad_norm": 0.75, - "learning_rate": 0.0017413355874894337, - "loss": 1.4722, - "step": 994500 - }, - { - "epoch": 6.4698614994472985, - "grad_norm": 0.671875, - "learning_rate": 0.001741205540022108, - "loss": 1.474, - "step": 995000 - }, - { - "epoch": 6.473112686130437, - "grad_norm": 0.65625, - "learning_rate": 0.0017410754925547825, - "loss": 1.4831, - "step": 995500 - }, - { - "epoch": 6.476363872813577, - "grad_norm": 0.71875, - "learning_rate": 0.001740945445087457, - "loss": 1.4832, - "step": 996000 - }, - { - "epoch": 6.479615059496716, - "grad_norm": 0.93359375, - "learning_rate": 0.0017408153976201312, - "loss": 1.4926, - "step": 996500 - }, - { - "epoch": 6.482866246179856, - "grad_norm": 0.77734375, - "learning_rate": 0.0017406853501528057, - "loss": 1.489, - "step": 997000 - }, - { - "epoch": 6.486117432862995, - "grad_norm": 0.71484375, - "learning_rate": 0.0017405553026854804, - "loss": 1.494, - "step": 997500 - }, - { - "epoch": 6.489368619546134, - "grad_norm": 0.65625, - "learning_rate": 0.0017404252552181549, - "loss": 1.4693, - "step": 998000 - }, - { - "epoch": 6.492619806229274, - "grad_norm": 1.3828125, - "learning_rate": 0.0017402952077508291, - "loss": 1.457, - "step": 998500 - }, - { - "epoch": 6.495870992912413, - "grad_norm": 0.76953125, - "learning_rate": 0.0017401651602835036, - "loss": 1.4526, - "step": 999000 - }, - { - "epoch": 6.4991221795955525, - "grad_norm": 2.296875, - "learning_rate": 0.001740035112816178, - "loss": 1.4506, - "step": 999500 - }, - { - "epoch": 6.502373366278691, - "grad_norm": 19.5, - "learning_rate": 0.0017399050653488523, - "loss": 1.4474, - "step": 1000000 - }, - { - "epoch": 6.505624552961831, - "grad_norm": 0.6875, - "learning_rate": 0.0017397750178815268, - "loss": 1.4499, - "step": 1000500 - }, - { - "epoch": 6.508875739644971, - "grad_norm": 0.8984375, - "learning_rate": 0.0017396449704142013, - "loss": 1.4509, - "step": 1001000 - }, - { - "epoch": 6.51212692632811, - "grad_norm": 0.6484375, - "learning_rate": 
0.0017395149229468756, - "loss": 1.4505, - "step": 1001500 - }, - { - "epoch": 6.515378113011249, - "grad_norm": 0.89453125, - "learning_rate": 0.00173938487547955, - "loss": 1.4481, - "step": 1002000 - }, - { - "epoch": 6.518629299694388, - "grad_norm": 0.76953125, - "learning_rate": 0.0017392548280122245, - "loss": 1.4461, - "step": 1002500 - }, - { - "epoch": 6.521880486377528, - "grad_norm": 0.89453125, - "learning_rate": 0.0017391247805448988, - "loss": 1.4411, - "step": 1003000 - }, - { - "epoch": 6.525131673060667, - "grad_norm": 0.76171875, - "learning_rate": 0.0017389947330775733, - "loss": 1.4491, - "step": 1003500 - }, - { - "epoch": 6.5283828597438065, - "grad_norm": 1.0703125, - "learning_rate": 0.0017388646856102477, - "loss": 1.4502, - "step": 1004000 - }, - { - "epoch": 6.531634046426946, - "grad_norm": 0.81640625, - "learning_rate": 0.001738734638142922, - "loss": 1.4517, - "step": 1004500 - }, - { - "epoch": 6.534885233110085, - "grad_norm": 1.3125, - "learning_rate": 0.0017386045906755967, - "loss": 1.4522, - "step": 1005000 - }, - { - "epoch": 6.538136419793225, - "grad_norm": 0.70703125, - "learning_rate": 0.0017384745432082712, - "loss": 1.4466, - "step": 1005500 - }, - { - "epoch": 6.541387606476364, - "grad_norm": 0.92578125, - "learning_rate": 0.0017383444957409456, - "loss": 1.4474, - "step": 1006000 - }, - { - "epoch": 6.544638793159503, - "grad_norm": 0.8046875, - "learning_rate": 0.00173821444827362, - "loss": 1.4479, - "step": 1006500 - }, - { - "epoch": 6.547889979842642, - "grad_norm": 0.703125, - "learning_rate": 0.0017380844008062944, - "loss": 1.4444, - "step": 1007000 - }, - { - "epoch": 6.551141166525782, - "grad_norm": 0.87890625, - "learning_rate": 0.0017379543533389689, - "loss": 1.4431, - "step": 1007500 - }, - { - "epoch": 6.554392353208922, - "grad_norm": 0.66015625, - "learning_rate": 0.0017378243058716431, - "loss": 1.4487, - "step": 1008000 - }, - { - "epoch": 6.5576435398920605, - "grad_norm": 0.75, - "learning_rate": 0.0017376942584043176, - "loss": 1.4422, - "step": 1008500 - }, - { - "epoch": 6.5608947265752, - "grad_norm": 0.68359375, - "learning_rate": 0.001737564210936992, - "loss": 1.44, - "step": 1009000 - }, - { - "epoch": 6.564145913258339, - "grad_norm": 0.9921875, - "learning_rate": 0.0017374341634696663, - "loss": 1.4404, - "step": 1009500 - }, - { - "epoch": 6.567397099941479, - "grad_norm": 0.73828125, - "learning_rate": 0.0017373041160023408, - "loss": 1.4407, - "step": 1010000 - }, - { - "epoch": 6.570648286624618, - "grad_norm": 1.6484375, - "learning_rate": 0.0017371740685350153, - "loss": 1.446, - "step": 1010500 - }, - { - "epoch": 6.573899473307757, - "grad_norm": 0.81640625, - "learning_rate": 0.0017370440210676896, - "loss": 1.449, - "step": 1011000 - }, - { - "epoch": 6.577150659990897, - "grad_norm": 0.7890625, - "learning_rate": 0.001736913973600364, - "loss": 1.449, - "step": 1011500 - }, - { - "epoch": 6.580401846674036, - "grad_norm": 0.90625, - "learning_rate": 0.0017367839261330387, - "loss": 1.4324, - "step": 1012000 - }, - { - "epoch": 6.583653033357176, - "grad_norm": 0.6875, - "learning_rate": 0.0017366538786657132, - "loss": 1.4414, - "step": 1012500 - }, - { - "epoch": 6.5869042200403145, - "grad_norm": 1.8984375, - "learning_rate": 0.0017365238311983875, - "loss": 1.4451, - "step": 1013000 - }, - { - "epoch": 6.590155406723454, - "grad_norm": 0.83984375, - "learning_rate": 0.001736393783731062, - "loss": 1.4398, - "step": 1013500 - }, - { - "epoch": 6.593406593406593, - "grad_norm": 0.7578125, - 
"learning_rate": 0.0017362637362637364, - "loss": 1.446, - "step": 1014000 - }, - { - "epoch": 6.596657780089733, - "grad_norm": 0.7578125, - "learning_rate": 0.0017361336887964107, - "loss": 1.4452, - "step": 1014500 - }, - { - "epoch": 6.5999089667728725, - "grad_norm": 0.66015625, - "learning_rate": 0.0017360036413290852, - "loss": 1.4497, - "step": 1015000 - }, - { - "epoch": 6.603160153456011, - "grad_norm": 0.6875, - "learning_rate": 0.0017358735938617597, - "loss": 1.4545, - "step": 1015500 - }, - { - "epoch": 6.606411340139151, - "grad_norm": 0.6484375, - "learning_rate": 0.001735743546394434, - "loss": 1.4642, - "step": 1016000 - }, - { - "epoch": 6.60966252682229, - "grad_norm": 0.79296875, - "learning_rate": 0.0017356134989271084, - "loss": 1.4545, - "step": 1016500 - }, - { - "epoch": 6.61291371350543, - "grad_norm": 0.6640625, - "learning_rate": 0.0017354834514597829, - "loss": 1.4551, - "step": 1017000 - }, - { - "epoch": 6.6161649001885685, - "grad_norm": 1.484375, - "learning_rate": 0.0017353534039924571, - "loss": 1.4558, - "step": 1017500 - }, - { - "epoch": 6.619416086871708, - "grad_norm": 1.1484375, - "learning_rate": 0.0017352233565251316, - "loss": 1.4509, - "step": 1018000 - }, - { - "epoch": 6.622667273554848, - "grad_norm": 0.64453125, - "learning_rate": 0.001735093309057806, - "loss": 1.4542, - "step": 1018500 - }, - { - "epoch": 6.625918460237987, - "grad_norm": 1.125, - "learning_rate": 0.0017349632615904804, - "loss": 1.448, - "step": 1019000 - }, - { - "epoch": 6.6291696469211265, - "grad_norm": 2.78125, - "learning_rate": 0.001734833214123155, - "loss": 1.4441, - "step": 1019500 - }, - { - "epoch": 6.632420833604265, - "grad_norm": 0.7265625, - "learning_rate": 0.0017347031666558295, - "loss": 1.4479, - "step": 1020000 - }, - { - "epoch": 6.635672020287405, - "grad_norm": 0.67578125, - "learning_rate": 0.001734573119188504, - "loss": 1.4445, - "step": 1020500 - }, - { - "epoch": 6.638923206970544, - "grad_norm": 0.734375, - "learning_rate": 0.0017344430717211783, - "loss": 1.4484, - "step": 1021000 - }, - { - "epoch": 6.642174393653684, - "grad_norm": 0.9609375, - "learning_rate": 0.0017343130242538527, - "loss": 1.4464, - "step": 1021500 - }, - { - "epoch": 6.645425580336823, - "grad_norm": 1.71875, - "learning_rate": 0.0017341829767865272, - "loss": 1.4434, - "step": 1022000 - }, - { - "epoch": 6.648676767019962, - "grad_norm": 0.87109375, - "learning_rate": 0.0017340529293192015, - "loss": 1.4531, - "step": 1022500 - }, - { - "epoch": 6.651927953703102, - "grad_norm": 1.46875, - "learning_rate": 0.001733922881851876, - "loss": 1.454, - "step": 1023000 - }, - { - "epoch": 6.655179140386241, - "grad_norm": 0.734375, - "learning_rate": 0.0017337928343845504, - "loss": 1.4707, - "step": 1023500 - }, - { - "epoch": 6.6584303270693805, - "grad_norm": 0.78125, - "learning_rate": 0.0017336627869172247, - "loss": 1.4892, - "step": 1024000 - }, - { - "epoch": 6.661681513752519, - "grad_norm": 0.6640625, - "learning_rate": 0.0017335327394498992, - "loss": 1.5023, - "step": 1024500 - }, - { - "epoch": 6.664932700435659, - "grad_norm": 0.57421875, - "learning_rate": 0.0017334026919825737, - "loss": 1.5322, - "step": 1025000 - }, - { - "epoch": 6.668183887118799, - "grad_norm": 0.6328125, - "learning_rate": 0.001733272644515248, - "loss": 1.5527, - "step": 1025500 - }, - { - "epoch": 6.671435073801938, - "grad_norm": 0.59375, - "learning_rate": 0.0017331425970479224, - "loss": 1.523, - "step": 1026000 - }, - { - "epoch": 6.674686260485077, - "grad_norm": 0.98046875, - 
"learning_rate": 0.001733012549580597, - "loss": 1.5219, - "step": 1026500 - }, - { - "epoch": 6.677937447168216, - "grad_norm": 1.375, - "learning_rate": 0.0017328825021132716, - "loss": 1.5343, - "step": 1027000 - }, - { - "epoch": 6.681188633851356, - "grad_norm": 0.73046875, - "learning_rate": 0.0017327524546459458, - "loss": 1.5245, - "step": 1027500 - }, - { - "epoch": 6.684439820534495, - "grad_norm": 0.66015625, - "learning_rate": 0.0017326224071786203, - "loss": 1.5177, - "step": 1028000 - }, - { - "epoch": 6.6876910072176345, - "grad_norm": 0.765625, - "learning_rate": 0.0017324923597112948, - "loss": 1.513, - "step": 1028500 - }, - { - "epoch": 6.690942193900774, - "grad_norm": 0.75, - "learning_rate": 0.001732362312243969, - "loss": 1.5196, - "step": 1029000 - }, - { - "epoch": 6.694193380583913, - "grad_norm": 0.66796875, - "learning_rate": 0.0017322322647766435, - "loss": 1.5089, - "step": 1029500 - }, - { - "epoch": 6.697444567267053, - "grad_norm": 0.72265625, - "learning_rate": 0.001732102217309318, - "loss": 1.5089, - "step": 1030000 - }, - { - "epoch": 6.700695753950192, - "grad_norm": 0.671875, - "learning_rate": 0.0017319721698419923, - "loss": 1.5037, - "step": 1030500 - }, - { - "epoch": 6.703946940633331, - "grad_norm": 1.6796875, - "learning_rate": 0.0017318421223746668, - "loss": 1.4995, - "step": 1031000 - }, - { - "epoch": 6.70719812731647, - "grad_norm": 0.7734375, - "learning_rate": 0.0017317120749073412, - "loss": 1.4928, - "step": 1031500 - }, - { - "epoch": 6.71044931399961, - "grad_norm": 0.78125, - "learning_rate": 0.0017315820274400155, - "loss": 1.4962, - "step": 1032000 - }, - { - "epoch": 6.713700500682749, - "grad_norm": 0.6171875, - "learning_rate": 0.00173145197997269, - "loss": 1.4943, - "step": 1032500 - }, - { - "epoch": 6.7169516873658885, - "grad_norm": 0.7421875, - "learning_rate": 0.0017313219325053645, - "loss": 1.4925, - "step": 1033000 - }, - { - "epoch": 6.720202874049028, - "grad_norm": 0.75, - "learning_rate": 0.0017311918850380387, - "loss": 1.4849, - "step": 1033500 - }, - { - "epoch": 6.723454060732167, - "grad_norm": 0.71875, - "learning_rate": 0.0017310618375707134, - "loss": 1.479, - "step": 1034000 - }, - { - "epoch": 6.726705247415307, - "grad_norm": 0.875, - "learning_rate": 0.0017309317901033879, - "loss": 1.4838, - "step": 1034500 - }, - { - "epoch": 6.729956434098446, - "grad_norm": 1.09375, - "learning_rate": 0.0017308017426360624, - "loss": 1.4778, - "step": 1035000 - }, - { - "epoch": 6.733207620781585, - "grad_norm": 1.046875, - "learning_rate": 0.0017306716951687366, - "loss": 1.4825, - "step": 1035500 - }, - { - "epoch": 6.736458807464724, - "grad_norm": 0.890625, - "learning_rate": 0.001730541647701411, - "loss": 1.4769, - "step": 1036000 - }, - { - "epoch": 6.739709994147864, - "grad_norm": 1.046875, - "learning_rate": 0.0017304116002340856, - "loss": 1.4705, - "step": 1036500 - }, - { - "epoch": 6.742961180831003, - "grad_norm": 2.203125, - "learning_rate": 0.0017302815527667598, - "loss": 1.4729, - "step": 1037000 - }, - { - "epoch": 6.7462123675141425, - "grad_norm": 1.0, - "learning_rate": 0.0017301515052994343, - "loss": 1.4738, - "step": 1037500 - }, - { - "epoch": 6.749463554197282, - "grad_norm": 0.64453125, - "learning_rate": 0.0017300214578321088, - "loss": 1.4775, - "step": 1038000 - }, - { - "epoch": 6.752714740880421, - "grad_norm": 0.81640625, - "learning_rate": 0.001729891410364783, - "loss": 1.4787, - "step": 1038500 - }, - { - "epoch": 6.755965927563561, - "grad_norm": 1.796875, - "learning_rate": 
0.0017297613628974575, - "loss": 1.4751, - "step": 1039000 - }, - { - "epoch": 6.7592171142467, - "grad_norm": 0.92578125, - "learning_rate": 0.001729631315430132, - "loss": 1.4772, - "step": 1039500 - }, - { - "epoch": 6.762468300929839, - "grad_norm": 0.89453125, - "learning_rate": 0.0017295012679628063, - "loss": 1.4735, - "step": 1040000 - }, - { - "epoch": 6.765719487612978, - "grad_norm": 0.87109375, - "learning_rate": 0.0017293712204954808, - "loss": 1.4709, - "step": 1040500 - }, - { - "epoch": 6.768970674296118, - "grad_norm": 0.7421875, - "learning_rate": 0.0017292411730281555, - "loss": 1.4708, - "step": 1041000 - }, - { - "epoch": 6.772221860979258, - "grad_norm": 0.828125, - "learning_rate": 0.00172911112556083, - "loss": 1.4705, - "step": 1041500 - }, - { - "epoch": 6.7754730476623966, - "grad_norm": 0.62109375, - "learning_rate": 0.0017289810780935042, - "loss": 1.4681, - "step": 1042000 - }, - { - "epoch": 6.778724234345536, - "grad_norm": 0.69140625, - "learning_rate": 0.0017288510306261787, - "loss": 1.4618, - "step": 1042500 - }, - { - "epoch": 6.781975421028675, - "grad_norm": 0.90625, - "learning_rate": 0.0017287209831588532, - "loss": 1.4675, - "step": 1043000 - }, - { - "epoch": 6.785226607711815, - "grad_norm": 0.6875, - "learning_rate": 0.0017285909356915274, - "loss": 1.472, - "step": 1043500 - }, - { - "epoch": 6.788477794394954, - "grad_norm": 0.66796875, - "learning_rate": 0.001728460888224202, - "loss": 1.4722, - "step": 1044000 - }, - { - "epoch": 6.791728981078093, - "grad_norm": 0.60546875, - "learning_rate": 0.0017283308407568764, - "loss": 1.4638, - "step": 1044500 - }, - { - "epoch": 6.794980167761233, - "grad_norm": 0.97265625, - "learning_rate": 0.0017282007932895506, - "loss": 1.4743, - "step": 1045000 - }, - { - "epoch": 6.798231354444372, - "grad_norm": 0.65625, - "learning_rate": 0.0017280707458222251, - "loss": 1.472, - "step": 1045500 - }, - { - "epoch": 6.801482541127512, - "grad_norm": 1.5390625, - "learning_rate": 0.0017279406983548996, - "loss": 1.4667, - "step": 1046000 - }, - { - "epoch": 6.804733727810651, - "grad_norm": 0.95703125, - "learning_rate": 0.0017278106508875739, - "loss": 1.4613, - "step": 1046500 - }, - { - "epoch": 6.80798491449379, - "grad_norm": 0.76171875, - "learning_rate": 0.0017276806034202483, - "loss": 1.4652, - "step": 1047000 - }, - { - "epoch": 6.811236101176929, - "grad_norm": 1.9609375, - "learning_rate": 0.0017275505559529228, - "loss": 1.4641, - "step": 1047500 - }, - { - "epoch": 6.814487287860069, - "grad_norm": 1.7109375, - "learning_rate": 0.001727420508485597, - "loss": 1.4573, - "step": 1048000 - }, - { - "epoch": 6.817738474543209, - "grad_norm": 0.765625, - "learning_rate": 0.0017272904610182718, - "loss": 1.4655, - "step": 1048500 - }, - { - "epoch": 6.820989661226347, - "grad_norm": 0.83984375, - "learning_rate": 0.0017271604135509462, - "loss": 1.4632, - "step": 1049000 - }, - { - "epoch": 6.824240847909487, - "grad_norm": 0.65625, - "learning_rate": 0.0017270303660836207, - "loss": 1.461, - "step": 1049500 - }, - { - "epoch": 6.827492034592626, - "grad_norm": 1.296875, - "learning_rate": 0.001726900318616295, - "loss": 1.4637, - "step": 1050000 - }, - { - "epoch": 6.830743221275766, - "grad_norm": 0.6953125, - "learning_rate": 0.0017267702711489695, - "loss": 1.4603, - "step": 1050500 - }, - { - "epoch": 6.833994407958905, - "grad_norm": 0.72265625, - "learning_rate": 0.001726640223681644, - "loss": 1.4622, - "step": 1051000 - }, - { - "epoch": 6.837245594642044, - "grad_norm": 0.75, - 
"learning_rate": 0.0017265101762143182, - "loss": 1.4617, - "step": 1051500 - }, - { - "epoch": 6.840496781325184, - "grad_norm": 0.9296875, - "learning_rate": 0.0017263801287469927, - "loss": 1.4625, - "step": 1052000 - }, - { - "epoch": 6.843747968008323, - "grad_norm": 0.80859375, - "learning_rate": 0.0017262500812796672, - "loss": 1.459, - "step": 1052500 - }, - { - "epoch": 6.846999154691463, - "grad_norm": 0.69140625, - "learning_rate": 0.0017261200338123414, - "loss": 1.4576, - "step": 1053000 - }, - { - "epoch": 6.850250341374601, - "grad_norm": 1.265625, - "learning_rate": 0.001725989986345016, - "loss": 1.4597, - "step": 1053500 - }, - { - "epoch": 6.853501528057741, - "grad_norm": 0.91796875, - "learning_rate": 0.0017258599388776904, - "loss": 1.4643, - "step": 1054000 - }, - { - "epoch": 6.85675271474088, - "grad_norm": 0.71484375, - "learning_rate": 0.0017257298914103646, - "loss": 1.4575, - "step": 1054500 - }, - { - "epoch": 6.86000390142402, - "grad_norm": 0.6953125, - "learning_rate": 0.0017255998439430391, - "loss": 1.4588, - "step": 1055000 - }, - { - "epoch": 6.8632550881071595, - "grad_norm": 0.609375, - "learning_rate": 0.0017254697964757138, - "loss": 1.4612, - "step": 1055500 - }, - { - "epoch": 6.866506274790298, - "grad_norm": 0.63671875, - "learning_rate": 0.0017253397490083883, - "loss": 1.4574, - "step": 1056000 - }, - { - "epoch": 6.869757461473438, - "grad_norm": 0.71484375, - "learning_rate": 0.0017252097015410626, - "loss": 1.4544, - "step": 1056500 - }, - { - "epoch": 6.873008648156577, - "grad_norm": 0.76171875, - "learning_rate": 0.001725079654073737, - "loss": 1.4566, - "step": 1057000 - }, - { - "epoch": 6.876259834839717, - "grad_norm": 0.78515625, - "learning_rate": 0.0017249496066064115, - "loss": 1.4514, - "step": 1057500 - }, - { - "epoch": 6.879511021522855, - "grad_norm": 1.9453125, - "learning_rate": 0.0017248195591390858, - "loss": 1.4564, - "step": 1058000 - }, - { - "epoch": 6.882762208205995, - "grad_norm": 0.78515625, - "learning_rate": 0.0017246895116717603, - "loss": 1.452, - "step": 1058500 - }, - { - "epoch": 6.886013394889135, - "grad_norm": 1.0234375, - "learning_rate": 0.0017245594642044347, - "loss": 1.4514, - "step": 1059000 - }, - { - "epoch": 6.889264581572274, - "grad_norm": 0.6796875, - "learning_rate": 0.001724429416737109, - "loss": 1.4594, - "step": 1059500 - }, - { - "epoch": 6.8925157682554135, - "grad_norm": 0.9609375, - "learning_rate": 0.0017242993692697835, - "loss": 1.4545, - "step": 1060000 - }, - { - "epoch": 6.895766954938552, - "grad_norm": 0.625, - "learning_rate": 0.001724169321802458, - "loss": 1.4563, - "step": 1060500 - }, - { - "epoch": 6.899018141621692, - "grad_norm": 1.375, - "learning_rate": 0.0017240392743351322, - "loss": 1.4481, - "step": 1061000 - }, - { - "epoch": 6.902269328304831, - "grad_norm": 0.87109375, - "learning_rate": 0.0017239092268678067, - "loss": 1.4557, - "step": 1061500 - }, - { - "epoch": 6.905520514987971, - "grad_norm": 0.6484375, - "learning_rate": 0.0017237791794004812, - "loss": 1.4574, - "step": 1062000 - }, - { - "epoch": 6.90877170167111, - "grad_norm": 1.0, - "learning_rate": 0.0017236491319331554, - "loss": 1.4529, - "step": 1062500 - }, - { - "epoch": 6.912022888354249, - "grad_norm": 1.1875, - "learning_rate": 0.0017235190844658301, - "loss": 1.4563, - "step": 1063000 - }, - { - "epoch": 6.915274075037389, - "grad_norm": 0.69921875, - "learning_rate": 0.0017233890369985046, - "loss": 1.4571, - "step": 1063500 - }, - { - "epoch": 6.918525261720528, - "grad_norm": 
0.80859375, - "learning_rate": 0.001723258989531179, - "loss": 1.4568, - "step": 1064000 - }, - { - "epoch": 6.9217764484036675, - "grad_norm": 1.421875, - "learning_rate": 0.0017231289420638533, - "loss": 1.4533, - "step": 1064500 - }, - { - "epoch": 6.925027635086806, - "grad_norm": 0.8984375, - "learning_rate": 0.0017229988945965278, - "loss": 1.4548, - "step": 1065000 - }, - { - "epoch": 6.928278821769946, - "grad_norm": 0.87109375, - "learning_rate": 0.0017228688471292023, - "loss": 1.4596, - "step": 1065500 - }, - { - "epoch": 6.931530008453086, - "grad_norm": 0.9375, - "learning_rate": 0.0017227387996618766, - "loss": 1.4569, - "step": 1066000 - }, - { - "epoch": 6.934781195136225, - "grad_norm": 1.3828125, - "learning_rate": 0.001722608752194551, - "loss": 1.4575, - "step": 1066500 - }, - { - "epoch": 6.938032381819364, - "grad_norm": 0.75, - "learning_rate": 0.0017224787047272255, - "loss": 1.4544, - "step": 1067000 - }, - { - "epoch": 6.941283568502503, - "grad_norm": 0.78515625, - "learning_rate": 0.0017223486572598998, - "loss": 1.4564, - "step": 1067500 - }, - { - "epoch": 6.944534755185643, - "grad_norm": 0.84765625, - "learning_rate": 0.0017222186097925743, - "loss": 1.4541, - "step": 1068000 - }, - { - "epoch": 6.947785941868782, - "grad_norm": 0.80859375, - "learning_rate": 0.0017220885623252487, - "loss": 1.4547, - "step": 1068500 - }, - { - "epoch": 6.9510371285519215, - "grad_norm": 0.92578125, - "learning_rate": 0.001721958514857923, - "loss": 1.4605, - "step": 1069000 - }, - { - "epoch": 6.954288315235061, - "grad_norm": 2.390625, - "learning_rate": 0.0017218284673905975, - "loss": 1.4535, - "step": 1069500 - }, - { - "epoch": 6.9575395019182, - "grad_norm": 1.0703125, - "learning_rate": 0.001721698419923272, - "loss": 1.4535, - "step": 1070000 - }, - { - "epoch": 6.96079068860134, - "grad_norm": 0.62890625, - "learning_rate": 0.0017215683724559467, - "loss": 1.4582, - "step": 1070500 - }, - { - "epoch": 6.964041875284479, - "grad_norm": 0.734375, - "learning_rate": 0.001721438324988621, - "loss": 1.4554, - "step": 1071000 - }, - { - "epoch": 6.967293061967618, - "grad_norm": 0.97265625, - "learning_rate": 0.0017213082775212954, - "loss": 1.4507, - "step": 1071500 - }, - { - "epoch": 6.970544248650757, - "grad_norm": 0.75390625, - "learning_rate": 0.0017211782300539699, - "loss": 1.449, - "step": 1072000 - }, - { - "epoch": 6.973795435333897, - "grad_norm": 0.625, - "learning_rate": 0.0017210481825866441, - "loss": 1.4487, - "step": 1072500 - }, - { - "epoch": 6.977046622017037, - "grad_norm": 1.2265625, - "learning_rate": 0.0017209181351193186, - "loss": 1.4501, - "step": 1073000 - }, - { - "epoch": 6.9802978087001755, - "grad_norm": 1.40625, - "learning_rate": 0.001720788087651993, - "loss": 1.447, - "step": 1073500 - }, - { - "epoch": 6.983548995383315, - "grad_norm": 3.203125, - "learning_rate": 0.0017206580401846674, - "loss": 1.4483, - "step": 1074000 - }, - { - "epoch": 6.986800182066454, - "grad_norm": 1.0, - "learning_rate": 0.0017205279927173418, - "loss": 1.4508, - "step": 1074500 - }, - { - "epoch": 6.990051368749594, - "grad_norm": 0.80859375, - "learning_rate": 0.0017203979452500163, - "loss": 1.4445, - "step": 1075000 - }, - { - "epoch": 6.993302555432733, - "grad_norm": 0.69140625, - "learning_rate": 0.0017202678977826906, - "loss": 1.4496, - "step": 1075500 - }, - { - "epoch": 6.996553742115872, - "grad_norm": 0.734375, - "learning_rate": 0.001720137850315365, - "loss": 1.4451, - "step": 1076000 - }, - { - "epoch": 6.999804928799012, - "grad_norm": 
0.69140625, - "learning_rate": 0.0017200078028480395, - "loss": 1.4514, - "step": 1076500 - }, - { - "epoch": 7.0, - "eval_loss": 1.4298810958862305, - "eval_runtime": 0.5331, - "eval_samples_per_second": 1875.719, - "eval_steps_per_second": 30.011, - "step": 1076530 + "epoch": 24.0, + "eval_loss": 1.305156946182251, + "eval_runtime": 2.3203, + "eval_samples_per_second": 430.983, + "eval_steps_per_second": 0.431, + "step": 230688 } ], "logging_steps": 500, - "max_steps": 7689500, + "max_steps": 480600, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, - "total_flos": 1.3852790029150102e+19, - "train_batch_size": 64, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 3, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.304114629940989e+19, + "train_batch_size": 1024, "trial_name": null, "trial_params": null }