diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,15148 +1,3461 @@ { - "best_metric": 1.4233994483947754, - "best_model_checkpoint": "./results/models/checkpoint-922740", - "epoch": 7.0, + "best_metric": 1.305156946182251, + "best_model_checkpoint": "./results/models/checkpoint-230688", + "epoch": 24.0, "eval_steps": 500, - "global_step": 1076530, + "global_step": 230688, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.003251186683139346, - "grad_norm": 1.3671875, - "learning_rate": 0.0019998699525326743, - "loss": 2.8127, + "epoch": 0.05201831044527674, + "grad_norm": 0.34375, + "learning_rate": 0.001997919267582189, + "loss": 2.3383, "step": 500 }, { - "epoch": 0.006502373366278692, - "grad_norm": 67.0, - "learning_rate": 0.001999739905065349, - "loss": 2.4852, + "epoch": 0.10403662089055347, + "grad_norm": 0.93359375, + "learning_rate": 0.001995838535164378, + "loss": 1.9394, "step": 1000 }, { - "epoch": 0.009753560049418037, - "grad_norm": 0.7890625, - "learning_rate": 0.0019996098575980233, - "loss": 2.3924, + "epoch": 0.1560549313358302, + "grad_norm": 0.5078125, + "learning_rate": 0.001993757802746567, + "loss": 1.8509, "step": 1500 }, { - "epoch": 0.013004746732557384, - "grad_norm": 0.7578125, - "learning_rate": 0.0019994798101306975, - "loss": 2.3235, + "epoch": 0.20807324178110695, + "grad_norm": 0.341796875, + "learning_rate": 0.0019916770703287557, + "loss": 1.8119, "step": 2000 }, { - "epoch": 0.01625593341569673, - "grad_norm": 0.85546875, - "learning_rate": 0.0019993497626633722, - "loss": 2.2647, + "epoch": 0.2600915522263837, + "grad_norm": 0.30859375, + "learning_rate": 0.0019895963379109446, + "loss": 1.746, "step": 2500 }, { - "epoch": 0.019507120098836074, - "grad_norm": 0.67578125, - "learning_rate": 0.0019992197151960465, - "loss": 2.2077, + "epoch": 0.3121098626716604, + "grad_norm": 0.4921875, + "learning_rate": 0.0019875156054931335, + "loss": 1.7113, "step": 3000 }, { - "epoch": 0.02275830678197542, - "grad_norm": 0.7890625, - "learning_rate": 0.0019990896677287207, - "loss": 2.1543, + "epoch": 0.3641281731169372, + "grad_norm": 0.37109375, + "learning_rate": 0.0019854348730753224, + "loss": 1.6861, "step": 3500 }, { - "epoch": 0.026009493465114768, - "grad_norm": 0.80078125, - "learning_rate": 0.0019989596202613954, - "loss": 2.1291, + "epoch": 0.4161464835622139, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019833541406575114, + "loss": 1.6518, "step": 4000 }, { - "epoch": 0.02926068014825411, - "grad_norm": 0.92578125, - "learning_rate": 0.00199882957279407, - "loss": 2.1703, + "epoch": 0.4681647940074906, + "grad_norm": 1.2578125, + "learning_rate": 0.0019812734082397003, + "loss": 1.6257, "step": 4500 }, { - "epoch": 0.03251186683139346, - "grad_norm": 0.859375, - "learning_rate": 0.0019986995253267444, - "loss": 2.1796, + "epoch": 0.5201831044527674, + "grad_norm": 0.322265625, + "learning_rate": 0.0019791926758218896, + "loss": 1.6184, "step": 5000 }, { - "epoch": 0.0357630535145328, - "grad_norm": 0.96875, - "learning_rate": 0.0019985694778594187, - "loss": 2.1274, + "epoch": 0.5722014148980441, + "grad_norm": 0.2177734375, + "learning_rate": 0.001977111943404078, + "loss": 1.6034, "step": 5500 }, { - "epoch": 0.03901424019767215, - "grad_norm": 0.890625, - "learning_rate": 0.0019984394303920934, - "loss": 2.0801, + "epoch": 0.6242197253433208, + "grad_norm": 0.46875, + "learning_rate": 
0.001975031210986267, + "loss": 1.5798, "step": 6000 }, { - "epoch": 0.042265426880811495, - "grad_norm": 0.9296875, - "learning_rate": 0.0019983093829247676, - "loss": 2.0708, + "epoch": 0.6762380357885975, + "grad_norm": 0.349609375, + "learning_rate": 0.0019729504785684564, + "loss": 1.6023, "step": 6500 }, { - "epoch": 0.04551661356395084, - "grad_norm": 1.0, - "learning_rate": 0.001998179335457442, - "loss": 2.0459, + "epoch": 0.7282563462338744, + "grad_norm": 0.314453125, + "learning_rate": 0.0019708697461506453, + "loss": 1.6354, "step": 7000 }, { - "epoch": 0.04876780024709019, - "grad_norm": 0.67578125, - "learning_rate": 0.0019980492879901166, - "loss": 2.0342, + "epoch": 0.7802746566791511, + "grad_norm": 0.259765625, + "learning_rate": 0.0019687890137328337, + "loss": 1.6039, "step": 7500 }, { - "epoch": 0.052018986930229535, - "grad_norm": 0.90234375, - "learning_rate": 0.001997919240522791, - "loss": 2.0241, + "epoch": 0.8322929671244278, + "grad_norm": 0.48046875, + "learning_rate": 0.001966708281315023, + "loss": 1.5863, "step": 8000 }, { - "epoch": 0.05527017361336888, - "grad_norm": 1.0, - "learning_rate": 0.001997789193055465, - "loss": 2.0065, + "epoch": 0.8843112775697045, + "grad_norm": 0.99609375, + "learning_rate": 0.001964627548897212, + "loss": 1.5758, "step": 8500 }, { - "epoch": 0.05852136029650822, - "grad_norm": 0.671875, - "learning_rate": 0.00199765914558814, - "loss": 1.992, + "epoch": 0.9363295880149812, + "grad_norm": 0.2333984375, + "learning_rate": 0.0019625468164794005, + "loss": 1.5658, "step": 9000 }, { - "epoch": 0.06177254697964757, - "grad_norm": 1.1328125, - "learning_rate": 0.001997529098120814, - "loss": 1.9743, + "epoch": 0.9883478984602581, + "grad_norm": 0.296875, + "learning_rate": 0.00196046608406159, + "loss": 1.5664, "step": 9500 }, { - "epoch": 0.06502373366278692, - "grad_norm": 0.8671875, - "learning_rate": 0.0019973990506534883, - "loss": 1.9478, + "epoch": 1.0, + "eval_loss": 1.6215704679489136, + "eval_runtime": 1.5075, + "eval_samples_per_second": 663.37, + "eval_steps_per_second": 0.663, + "step": 9612 + }, + { + "epoch": 1.0403662089055348, + "grad_norm": 0.28515625, + "learning_rate": 0.0019583853516437788, + "loss": 1.5684, "step": 10000 }, { - "epoch": 0.06827492034592626, - "grad_norm": 0.6796875, - "learning_rate": 0.001997269003186163, - "loss": 1.943, + "epoch": 1.0923845193508115, + "grad_norm": 0.75, + "learning_rate": 0.0019563046192259677, + "loss": 1.5536, "step": 10500 }, { - "epoch": 0.0715261070290656, - "grad_norm": 0.90625, - "learning_rate": 0.0019971389557188377, - "loss": 1.92, + "epoch": 1.1444028297960882, + "grad_norm": 0.29296875, + "learning_rate": 0.0019542238868081566, + "loss": 1.5495, "step": 11000 }, { - "epoch": 0.07477729371220496, - "grad_norm": 0.71484375, - "learning_rate": 0.001997008908251512, - "loss": 1.9189, + "epoch": 1.196421140241365, + "grad_norm": 0.25, + "learning_rate": 0.0019521431543903455, + "loss": 1.529, "step": 11500 }, { - "epoch": 0.0780284803953443, - "grad_norm": 1.015625, - "learning_rate": 0.0019968788607841862, - "loss": 1.9228, + "epoch": 1.2484394506866416, + "grad_norm": 0.38671875, + "learning_rate": 0.0019500624219725344, + "loss": 1.5307, "step": 12000 }, { - "epoch": 0.08127966707848365, - "grad_norm": 0.65234375, - "learning_rate": 0.001996748813316861, - "loss": 1.9151, + "epoch": 1.3004577611319184, + "grad_norm": 0.21875, + "learning_rate": 0.0019479816895547233, + "loss": 1.5422, "step": 12500 }, { - "epoch": 0.08453085376162299, - "grad_norm": 0.9453125, - 
"learning_rate": 0.001996618765849535, - "loss": 1.9197, + "epoch": 1.352476071577195, + "grad_norm": 0.34375, + "learning_rate": 0.0019459009571369122, + "loss": 1.5281, "step": 13000 }, { - "epoch": 0.08778204044476234, - "grad_norm": 0.94140625, - "learning_rate": 0.0019964887183822094, - "loss": 1.9055, + "epoch": 1.404494382022472, + "grad_norm": 0.2421875, + "learning_rate": 0.0019438202247191011, + "loss": 1.5232, "step": 13500 }, { - "epoch": 0.09103322712790168, - "grad_norm": 0.64453125, - "learning_rate": 0.001996358670914884, - "loss": 1.8883, + "epoch": 1.4565126924677487, + "grad_norm": 0.98828125, + "learning_rate": 0.00194173949230129, + "loss": 1.5286, "step": 14000 }, { - "epoch": 0.09428441381104102, - "grad_norm": 0.9453125, - "learning_rate": 0.0019962286234475584, - "loss": 1.89, + "epoch": 1.5085310029130254, + "grad_norm": 0.72265625, + "learning_rate": 0.001939658759883479, + "loss": 1.5286, "step": 14500 }, { - "epoch": 0.09753560049418038, - "grad_norm": 0.71875, - "learning_rate": 0.0019960985759802327, - "loss": 1.8763, + "epoch": 1.5605493133583022, + "grad_norm": 0.341796875, + "learning_rate": 0.001937578027465668, + "loss": 1.5173, "step": 15000 }, { - "epoch": 0.10078678717731972, - "grad_norm": 0.7265625, - "learning_rate": 0.0019959685285129074, - "loss": 1.8751, + "epoch": 1.6125676238035789, + "grad_norm": 0.2451171875, + "learning_rate": 0.0019354972950478568, + "loss": 1.5055, "step": 15500 }, { - "epoch": 0.10403797386045907, - "grad_norm": 0.9453125, - "learning_rate": 0.0019958384810455816, - "loss": 1.8804, + "epoch": 1.6645859342488556, + "grad_norm": 0.2255859375, + "learning_rate": 0.0019334165626300457, + "loss": 1.5071, "step": 16000 }, { - "epoch": 0.10728916054359841, - "grad_norm": 0.6953125, - "learning_rate": 0.001995708433578256, - "loss": 1.8864, + "epoch": 1.7166042446941323, + "grad_norm": 0.37890625, + "learning_rate": 0.0019313358302122348, + "loss": 1.5071, "step": 16500 }, { - "epoch": 0.11054034722673776, - "grad_norm": 0.70703125, - "learning_rate": 0.0019955783861109306, - "loss": 1.8726, + "epoch": 1.768622555139409, + "grad_norm": 0.451171875, + "learning_rate": 0.0019292550977944235, + "loss": 1.5211, "step": 17000 }, { - "epoch": 0.1137915339098771, - "grad_norm": 0.88671875, - "learning_rate": 0.001995448338643605, - "loss": 1.8611, + "epoch": 1.8206408655846857, + "grad_norm": 0.25, + "learning_rate": 0.0019271743653766125, + "loss": 1.5211, "step": 17500 }, { - "epoch": 0.11704272059301644, - "grad_norm": 0.7734375, - "learning_rate": 0.001995318291176279, - "loss": 1.8544, + "epoch": 1.8726591760299627, + "grad_norm": 0.41796875, + "learning_rate": 0.0019250936329588016, + "loss": 1.5159, "step": 18000 }, { - "epoch": 0.1202939072761558, - "grad_norm": 2.34375, - "learning_rate": 0.001995188243708954, - "loss": 1.8469, + "epoch": 1.9246774864752392, + "grad_norm": 0.466796875, + "learning_rate": 0.0019230129005409905, + "loss": 1.5006, "step": 18500 }, { - "epoch": 0.12354509395929514, - "grad_norm": 0.671875, - "learning_rate": 0.0019950581962416285, - "loss": 1.8493, + "epoch": 1.9766957969205161, + "grad_norm": 0.451171875, + "learning_rate": 0.0019209321681231794, + "loss": 1.5082, "step": 19000 }, { - "epoch": 0.12679628064243448, - "grad_norm": 2.40625, - "learning_rate": 0.0019949281487743028, - "loss": 1.8449, + "epoch": 2.0, + "eval_loss": 1.5491766929626465, + "eval_runtime": 1.6608, + "eval_samples_per_second": 602.119, + "eval_steps_per_second": 0.602, + "step": 19224 + }, + { + "epoch": 
2.0287141073657926, + "grad_norm": 0.296875, + "learning_rate": 0.0019188514357053683, + "loss": 1.5182, "step": 19500 }, { - "epoch": 0.13004746732557385, - "grad_norm": 0.921875, - "learning_rate": 0.001994798101306977, - "loss": 1.8341, + "epoch": 2.0807324178110695, + "grad_norm": 0.28125, + "learning_rate": 0.0019167707032875572, + "loss": 1.5172, "step": 20000 }, { - "epoch": 0.13329865400871319, - "grad_norm": 1.0546875, - "learning_rate": 0.0019946680538396517, - "loss": 1.8258, + "epoch": 2.132750728256346, + "grad_norm": 0.37109375, + "learning_rate": 0.0019146899708697464, + "loss": 1.5101, "step": 20500 }, { - "epoch": 0.13654984069185253, - "grad_norm": 0.66015625, - "learning_rate": 0.001994538006372326, - "loss": 1.8294, + "epoch": 2.184769038701623, + "grad_norm": 0.2392578125, + "learning_rate": 0.001912609238451935, + "loss": 1.5086, "step": 21000 }, { - "epoch": 0.13980102737499187, - "grad_norm": 0.890625, - "learning_rate": 0.0019944079589050002, - "loss": 1.8346, + "epoch": 2.2367873491468995, + "grad_norm": 0.2421875, + "learning_rate": 0.001910528506034124, + "loss": 1.4943, "step": 21500 }, { - "epoch": 0.1430522140581312, - "grad_norm": 0.828125, - "learning_rate": 0.001994277911437675, - "loss": 1.8136, + "epoch": 2.2888056595921764, + "grad_norm": 0.296875, + "learning_rate": 0.0019084477736163131, + "loss": 1.4848, "step": 22000 }, { - "epoch": 0.14630340074127057, - "grad_norm": 0.94921875, - "learning_rate": 0.001994147863970349, - "loss": 1.8161, + "epoch": 2.3408239700374533, + "grad_norm": 0.306640625, + "learning_rate": 0.0019063670411985018, + "loss": 1.4823, "step": 22500 }, { - "epoch": 0.1495545874244099, - "grad_norm": 0.97265625, - "learning_rate": 0.0019940178165030235, - "loss": 1.8165, + "epoch": 2.39284228048273, + "grad_norm": 0.26953125, + "learning_rate": 0.0019042863087806907, + "loss": 1.4702, "step": 23000 }, { - "epoch": 0.15280577410754925, - "grad_norm": 0.65625, - "learning_rate": 0.001993887769035698, - "loss": 1.8134, + "epoch": 2.444860590928007, + "grad_norm": 1.296875, + "learning_rate": 0.0019022055763628799, + "loss": 1.4673, "step": 23500 }, { - "epoch": 0.1560569607906886, - "grad_norm": 0.7421875, - "learning_rate": 0.0019937577215683724, - "loss": 1.8177, + "epoch": 2.4968789013732833, + "grad_norm": 0.26953125, + "learning_rate": 0.0019001248439450688, + "loss": 1.4706, "step": 24000 }, { - "epoch": 0.15930814747382796, - "grad_norm": 0.62109375, - "learning_rate": 0.0019936276741010467, - "loss": 1.8106, + "epoch": 2.54889721181856, + "grad_norm": 0.75, + "learning_rate": 0.0018980441115272575, + "loss": 1.4635, "step": 24500 }, { - "epoch": 0.1625593341569673, - "grad_norm": 0.91796875, - "learning_rate": 0.0019934976266337214, - "loss": 1.808, + "epoch": 2.6009155222638367, + "grad_norm": 0.201171875, + "learning_rate": 0.0018959633791094466, + "loss": 1.4499, "step": 25000 }, { - "epoch": 0.16581052084010664, - "grad_norm": 0.74609375, - "learning_rate": 0.001993367579166396, - "loss": 1.804, + "epoch": 2.6529338327091136, + "grad_norm": 0.3125, + "learning_rate": 0.0018938826466916355, + "loss": 1.4453, "step": 25500 }, { - "epoch": 0.16906170752324598, - "grad_norm": 8.6875, - "learning_rate": 0.0019932375316990703, - "loss": 1.7985, + "epoch": 2.70495214315439, + "grad_norm": 0.27734375, + "learning_rate": 0.0018918019142738244, + "loss": 1.4463, "step": 26000 }, { - "epoch": 0.17231289420638532, - "grad_norm": 0.64453125, - "learning_rate": 0.0019931074842317446, - "loss": 1.7876, + "epoch": 2.756970453599667, + 
"grad_norm": 0.283203125, + "learning_rate": 0.0018897211818560133, + "loss": 1.452, "step": 26500 }, { - "epoch": 0.1755640808895247, - "grad_norm": 0.82421875, - "learning_rate": 0.0019929774367644193, - "loss": 1.787, + "epoch": 2.808988764044944, + "grad_norm": 0.2236328125, + "learning_rate": 0.0018876404494382023, + "loss": 1.448, "step": 27000 }, { - "epoch": 0.17881526757266403, - "grad_norm": 0.9140625, - "learning_rate": 0.0019928473892970935, - "loss": 1.7812, + "epoch": 2.8610070744902205, + "grad_norm": 0.244140625, + "learning_rate": 0.0018855597170203914, + "loss": 1.4525, "step": 27500 }, { - "epoch": 0.18206645425580337, - "grad_norm": 0.7578125, - "learning_rate": 0.001992717341829768, - "loss": 1.785, + "epoch": 2.9130253849354975, + "grad_norm": 0.2412109375, + "learning_rate": 0.00188347898460258, + "loss": 1.4457, "step": 28000 }, { - "epoch": 0.1853176409389427, - "grad_norm": 0.640625, - "learning_rate": 0.0019925872943624425, - "loss": 1.7763, + "epoch": 2.965043695380774, + "grad_norm": 0.60546875, + "learning_rate": 0.001881398252184769, + "loss": 1.4468, "step": 28500 }, { - "epoch": 0.18856882762208205, - "grad_norm": 0.8125, - "learning_rate": 0.0019924572468951168, - "loss": 1.8051, + "epoch": 3.0, + "eval_loss": 1.4870332479476929, + "eval_runtime": 1.4668, + "eval_samples_per_second": 681.76, + "eval_steps_per_second": 0.682, + "step": 28836 + }, + { + "epoch": 3.017062005826051, + "grad_norm": 0.349609375, + "learning_rate": 0.0018793175197669581, + "loss": 1.4453, "step": 29000 }, { - "epoch": 0.19182001430522141, - "grad_norm": 0.765625, - "learning_rate": 0.001992327199427791, - "loss": 1.7849, + "epoch": 3.0690803162713274, + "grad_norm": 1.4453125, + "learning_rate": 0.001877236787349147, + "loss": 1.4455, "step": 29500 }, { - "epoch": 0.19507120098836075, - "grad_norm": 0.69140625, - "learning_rate": 0.0019921971519604657, - "loss": 1.7797, + "epoch": 3.1210986267166043, + "grad_norm": 0.2314453125, + "learning_rate": 0.0018751560549313357, + "loss": 1.4378, "step": 30000 }, { - "epoch": 0.1983223876715001, - "grad_norm": 0.7890625, - "learning_rate": 0.00199206710449314, - "loss": 1.7877, + "epoch": 3.173116937161881, + "grad_norm": 0.326171875, + "learning_rate": 0.0018730753225135249, + "loss": 1.4342, "step": 30500 }, { - "epoch": 0.20157357435463943, - "grad_norm": 0.71484375, - "learning_rate": 0.0019919370570258142, - "loss": 1.78, + "epoch": 3.2251352476071578, + "grad_norm": 5.09375, + "learning_rate": 0.0018709945900957138, + "loss": 1.4401, "step": 31000 }, { - "epoch": 0.2048247610377788, - "grad_norm": 0.7734375, - "learning_rate": 0.001991807009558489, - "loss": 1.7848, + "epoch": 3.2771535580524347, + "grad_norm": 0.31640625, + "learning_rate": 0.0018689138576779025, + "loss": 1.4317, "step": 31500 }, { - "epoch": 0.20807594772091814, - "grad_norm": 0.7734375, - "learning_rate": 0.001991676962091163, - "loss": 1.7725, + "epoch": 3.329171868497711, + "grad_norm": 0.291015625, + "learning_rate": 0.0018668331252600916, + "loss": 1.4252, "step": 32000 }, { - "epoch": 0.21132713440405748, - "grad_norm": 0.796875, - "learning_rate": 0.0019915469146238375, - "loss": 1.7705, + "epoch": 3.381190178942988, + "grad_norm": 0.435546875, + "learning_rate": 0.0018647523928422805, + "loss": 1.427, "step": 32500 }, { - "epoch": 0.21457832108719682, - "grad_norm": 0.69921875, - "learning_rate": 0.001991416867156512, - "loss": 1.7645, + "epoch": 3.4332084893882646, + "grad_norm": 0.56640625, + "learning_rate": 0.0018626716604244697, + "loss": 1.4207, 
"step": 33000 }, { - "epoch": 0.21782950777033616, - "grad_norm": 1.59375, - "learning_rate": 0.001991286819689187, - "loss": 1.7673, + "epoch": 3.4852267998335416, + "grad_norm": 0.31640625, + "learning_rate": 0.0018605909280066584, + "loss": 1.4209, "step": 33500 }, { - "epoch": 0.22108069445347553, - "grad_norm": 0.77734375, - "learning_rate": 0.001991156772221861, - "loss": 1.7758, + "epoch": 3.537245110278818, + "grad_norm": 0.255859375, + "learning_rate": 0.0018585101955888473, + "loss": 1.418, "step": 34000 }, { - "epoch": 0.22433188113661487, - "grad_norm": 2.09375, - "learning_rate": 0.0019910267247545354, - "loss": 1.7593, + "epoch": 3.589263420724095, + "grad_norm": 0.33984375, + "learning_rate": 0.0018564294631710364, + "loss": 1.4153, "step": 34500 }, { - "epoch": 0.2275830678197542, - "grad_norm": 0.9140625, - "learning_rate": 0.00199089667728721, - "loss": 1.756, + "epoch": 3.6412817311693715, + "grad_norm": 0.404296875, + "learning_rate": 0.001854348730753225, + "loss": 1.4171, "step": 35000 }, { - "epoch": 0.23083425450289355, - "grad_norm": 1.28125, - "learning_rate": 0.0019907666298198843, - "loss": 1.7467, + "epoch": 3.6933000416146484, + "grad_norm": 0.7421875, + "learning_rate": 0.001852267998335414, + "loss": 1.4203, "step": 35500 }, { - "epoch": 0.2340854411860329, - "grad_norm": 0.83203125, - "learning_rate": 0.0019906365823525586, - "loss": 1.7501, + "epoch": 3.7453183520599254, + "grad_norm": 0.265625, + "learning_rate": 0.0018501872659176031, + "loss": 1.4189, "step": 36000 }, { - "epoch": 0.23733662786917226, - "grad_norm": 0.6328125, - "learning_rate": 0.0019905065348852333, - "loss": 1.7459, + "epoch": 3.797336662505202, + "grad_norm": 0.53125, + "learning_rate": 0.001848106533499792, + "loss": 1.4212, "step": 36500 }, { - "epoch": 0.2405878145523116, - "grad_norm": 0.78515625, - "learning_rate": 0.0019903764874179075, - "loss": 1.7398, + "epoch": 3.8493549729504783, + "grad_norm": 0.2158203125, + "learning_rate": 0.0018460258010819808, + "loss": 1.4151, "step": 37000 }, { - "epoch": 0.24383900123545094, - "grad_norm": 0.69921875, - "learning_rate": 0.001990246439950582, - "loss": 1.7416, + "epoch": 3.9013732833957553, + "grad_norm": 0.349609375, + "learning_rate": 0.0018439450686641699, + "loss": 1.4087, "step": 37500 }, { - "epoch": 0.24709018791859028, - "grad_norm": 0.7578125, - "learning_rate": 0.0019901163924832565, - "loss": 1.7476, + "epoch": 3.9533915938410322, + "grad_norm": 0.228515625, + "learning_rate": 0.0018418643362463588, + "loss": 1.4038, "step": 38000 }, { - "epoch": 0.2503413746017296, - "grad_norm": 0.78515625, - "learning_rate": 0.0019899863450159308, - "loss": 1.7383, + "epoch": 4.0, + "eval_loss": 1.4296818971633911, + "eval_runtime": 1.3293, + "eval_samples_per_second": 752.251, + "eval_steps_per_second": 0.752, + "step": 38448 + }, + { + "epoch": 4.005409904286309, + "grad_norm": 2.53125, + "learning_rate": 0.0018397836038285475, + "loss": 1.4037, "step": 38500 }, { - "epoch": 0.25359256128486896, - "grad_norm": 0.578125, - "learning_rate": 0.001989856297548605, - "loss": 1.7379, + "epoch": 4.057428214731585, + "grad_norm": 1.703125, + "learning_rate": 0.0018377028714107366, + "loss": 1.402, "step": 39000 }, { - "epoch": 0.2568437479680083, - "grad_norm": 0.65625, - "learning_rate": 0.0019897262500812797, - "loss": 1.7357, + "epoch": 4.109446525176862, + "grad_norm": 0.2138671875, + "learning_rate": 0.0018356221389929255, + "loss": 1.3972, "step": 39500 }, { - "epoch": 0.2600949346511477, - "grad_norm": 0.7421875, - "learning_rate": 
0.0019895962026139544, - "loss": 1.7301, + "epoch": 4.161464835622139, + "grad_norm": 0.2001953125, + "learning_rate": 0.0018335414065751145, + "loss": 1.3996, "step": 40000 }, { - "epoch": 0.26334612133428703, - "grad_norm": 0.62890625, - "learning_rate": 0.0019894661551466287, - "loss": 1.7312, + "epoch": 4.213483146067416, + "grad_norm": 0.455078125, + "learning_rate": 0.0018314606741573034, + "loss": 1.3989, "step": 40500 }, { - "epoch": 0.26659730801742637, - "grad_norm": 1.703125, - "learning_rate": 0.001989336107679303, - "loss": 1.7451, + "epoch": 4.265501456512692, + "grad_norm": 0.2041015625, + "learning_rate": 0.0018293799417394923, + "loss": 1.3945, "step": 41000 }, { - "epoch": 0.2698484947005657, - "grad_norm": 4.5, - "learning_rate": 0.0019892060602119776, - "loss": 1.7367, + "epoch": 4.317519766957969, + "grad_norm": 0.267578125, + "learning_rate": 0.0018272992093216814, + "loss": 1.3936, "step": 41500 }, { - "epoch": 0.27309968138370505, - "grad_norm": 0.78125, - "learning_rate": 0.001989076012744652, - "loss": 1.7231, + "epoch": 4.369538077403246, + "grad_norm": 0.384765625, + "learning_rate": 0.0018252184769038703, + "loss": 1.3906, "step": 42000 }, { - "epoch": 0.2763508680668444, - "grad_norm": 0.82421875, - "learning_rate": 0.001988945965277326, - "loss": 1.7225, + "epoch": 4.421556387848523, + "grad_norm": 0.380859375, + "learning_rate": 0.001823137744486059, + "loss": 1.3946, "step": 42500 }, { - "epoch": 0.27960205474998373, - "grad_norm": 0.73046875, - "learning_rate": 0.001988815917810001, - "loss": 1.7198, + "epoch": 4.473574698293799, + "grad_norm": 0.353515625, + "learning_rate": 0.0018210570120682482, + "loss": 1.4069, "step": 43000 }, { - "epoch": 0.28285324143312307, - "grad_norm": 0.765625, - "learning_rate": 0.001988685870342675, - "loss": 1.7255, + "epoch": 4.525593008739076, + "grad_norm": 0.373046875, + "learning_rate": 0.001818976279650437, + "loss": 1.4049, "step": 43500 }, { - "epoch": 0.2861044281162624, - "grad_norm": 0.7890625, - "learning_rate": 0.0019885558228753494, - "loss": 1.7231, + "epoch": 4.577611319184353, + "grad_norm": 0.263671875, + "learning_rate": 0.0018168955472326258, + "loss": 1.3995, "step": 44000 }, { - "epoch": 0.2893556147994018, - "grad_norm": 0.82421875, - "learning_rate": 0.001988425775408024, - "loss": 1.7365, + "epoch": 4.62962962962963, + "grad_norm": 0.392578125, + "learning_rate": 0.001814814814814815, + "loss": 1.4053, "step": 44500 }, { - "epoch": 0.29260680148254115, - "grad_norm": 0.67578125, - "learning_rate": 0.0019882957279406983, - "loss": 1.7237, + "epoch": 4.681647940074907, + "grad_norm": 0.5078125, + "learning_rate": 0.0018127340823970038, + "loss": 1.4, "step": 45000 }, { - "epoch": 0.2958579881656805, - "grad_norm": 0.859375, - "learning_rate": 0.0019881656804733726, - "loss": 1.7336, + "epoch": 4.733666250520183, + "grad_norm": 0.255859375, + "learning_rate": 0.0018106533499791927, + "loss": 1.3946, "step": 45500 }, { - "epoch": 0.2991091748488198, - "grad_norm": 0.984375, - "learning_rate": 0.0019880356330060473, - "loss": 1.7488, + "epoch": 4.78568456096546, + "grad_norm": 0.498046875, + "learning_rate": 0.0018085726175613816, + "loss": 1.3914, "step": 46000 }, { - "epoch": 0.30236036153195917, - "grad_norm": 1.0, - "learning_rate": 0.0019879055855387216, - "loss": 1.7362, + "epoch": 4.837702871410737, + "grad_norm": 1.0703125, + "learning_rate": 0.0018064918851435705, + "loss": 1.3882, "step": 46500 }, { - "epoch": 0.3056115482150985, - "grad_norm": 0.58984375, - "learning_rate": 
0.001987775538071396, - "loss": 1.7358, + "epoch": 4.889721181856014, + "grad_norm": 0.30859375, + "learning_rate": 0.0018044111527257595, + "loss": 1.3905, "step": 47000 }, { - "epoch": 0.30886273489823785, - "grad_norm": 0.7265625, - "learning_rate": 0.0019876454906040705, - "loss": 1.7274, + "epoch": 4.94173949230129, + "grad_norm": 0.2197265625, + "learning_rate": 0.0018023304203079484, + "loss": 1.3927, "step": 47500 }, { - "epoch": 0.3121139215813772, - "grad_norm": 0.87890625, - "learning_rate": 0.001987515443136745, - "loss": 1.7265, + "epoch": 4.9937578027465666, + "grad_norm": 0.23828125, + "learning_rate": 0.0018002496878901373, + "loss": 1.3883, "step": 48000 }, { - "epoch": 0.3153651082645165, - "grad_norm": 0.70703125, - "learning_rate": 0.0019873853956694195, - "loss": 1.7121, + "epoch": 5.0, + "eval_loss": 1.4223600625991821, + "eval_runtime": 1.6852, + "eval_samples_per_second": 593.419, + "eval_steps_per_second": 0.593, + "step": 48060 + }, + { + "epoch": 5.0457761131918435, + "grad_norm": 0.29296875, + "learning_rate": 0.0017981689554723264, + "loss": 1.3873, "step": 48500 }, { - "epoch": 0.3186162949476559, - "grad_norm": 1.1328125, - "learning_rate": 0.0019872553482020937, - "loss": 1.7082, + "epoch": 5.09779442363712, + "grad_norm": 0.203125, + "learning_rate": 0.0017960882230545153, + "loss": 1.3831, "step": 49000 }, { - "epoch": 0.32186748163079526, - "grad_norm": 0.71875, - "learning_rate": 0.0019871253007347684, - "loss": 1.7064, + "epoch": 5.149812734082397, + "grad_norm": 0.251953125, + "learning_rate": 0.001794007490636704, + "loss": 1.3784, "step": 49500 }, { - "epoch": 0.3251186683139346, - "grad_norm": 0.79296875, - "learning_rate": 0.0019869952532674427, - "loss": 1.7093, + "epoch": 5.201831044527673, + "grad_norm": 0.271484375, + "learning_rate": 0.0017919267582188932, + "loss": 1.3821, "step": 50000 }, { - "epoch": 0.32836985499707394, - "grad_norm": 0.92578125, - "learning_rate": 0.001986865205800117, - "loss": 1.7092, + "epoch": 5.25384935497295, + "grad_norm": 0.451171875, + "learning_rate": 0.001789846025801082, + "loss": 1.3781, "step": 50500 }, { - "epoch": 0.3316210416802133, - "grad_norm": 0.71484375, - "learning_rate": 0.0019867351583327916, - "loss": 1.7031, + "epoch": 5.305867665418227, + "grad_norm": 0.33203125, + "learning_rate": 0.0017877652933832708, + "loss": 1.381, "step": 51000 }, { - "epoch": 0.3348722283633526, - "grad_norm": 0.7890625, - "learning_rate": 0.001986605110865466, - "loss": 1.7065, + "epoch": 5.357885975863504, + "grad_norm": 0.40625, + "learning_rate": 0.00178568456096546, + "loss": 1.381, "step": 51500 }, { - "epoch": 0.33812341504649196, - "grad_norm": 5.84375, - "learning_rate": 0.00198647506339814, - "loss": 1.7007, + "epoch": 5.40990428630878, + "grad_norm": 0.263671875, + "learning_rate": 0.0017836038285476488, + "loss": 1.375, "step": 52000 }, { - "epoch": 0.3413746017296313, - "grad_norm": 0.62890625, - "learning_rate": 0.001986345015930815, - "loss": 1.7033, + "epoch": 5.461922596754057, + "grad_norm": 0.314453125, + "learning_rate": 0.0017815230961298377, + "loss": 1.3776, "step": 52500 }, { - "epoch": 0.34462578841277064, - "grad_norm": 0.56640625, - "learning_rate": 0.001986214968463489, - "loss": 1.6962, + "epoch": 5.513940907199334, + "grad_norm": 0.30078125, + "learning_rate": 0.0017794423637120266, + "loss": 1.3773, "step": 53000 }, { - "epoch": 0.34787697509591003, - "grad_norm": 0.58984375, - "learning_rate": 0.0019860849209961634, - "loss": 1.6886, + "epoch": 5.565959217644611, + "grad_norm": 
0.228515625, + "learning_rate": 0.0017773616312942156, + "loss": 1.3809, "step": 53500 }, { - "epoch": 0.3511281617790494, - "grad_norm": 0.77734375, - "learning_rate": 0.001985954873528838, - "loss": 1.6864, + "epoch": 5.617977528089888, + "grad_norm": 0.8671875, + "learning_rate": 0.0017752808988764045, + "loss": 1.3786, "step": 54000 }, { - "epoch": 0.3543793484621887, - "grad_norm": 0.8203125, - "learning_rate": 0.0019858248260615128, - "loss": 1.6905, + "epoch": 5.669995838535164, + "grad_norm": 0.275390625, + "learning_rate": 0.0017732001664585936, + "loss": 1.3762, "step": 54500 }, { - "epoch": 0.35763053514532805, - "grad_norm": 1.0546875, - "learning_rate": 0.001985694778594187, - "loss": 1.696, + "epoch": 5.722014148980441, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017711194340407823, + "loss": 1.3741, "step": 55000 }, { - "epoch": 0.3608817218284674, - "grad_norm": 0.66015625, - "learning_rate": 0.0019855647311268613, - "loss": 1.6931, + "epoch": 5.774032459425718, + "grad_norm": 0.224609375, + "learning_rate": 0.0017690387016229714, + "loss": 1.3719, "step": 55500 }, { - "epoch": 0.36413290851160673, - "grad_norm": 1.2578125, - "learning_rate": 0.001985434683659536, - "loss": 1.6919, + "epoch": 5.826050769870995, + "grad_norm": 0.208984375, + "learning_rate": 0.0017669579692051603, + "loss": 1.3712, "step": 56000 }, { - "epoch": 0.3673840951947461, - "grad_norm": 0.9453125, - "learning_rate": 0.0019853046361922103, - "loss": 1.7009, + "epoch": 5.878069080316271, + "grad_norm": 0.26171875, + "learning_rate": 0.001764877236787349, + "loss": 1.3716, "step": 56500 }, { - "epoch": 0.3706352818778854, - "grad_norm": 1.1953125, - "learning_rate": 0.0019851745887248845, - "loss": 1.7066, + "epoch": 5.930087390761548, + "grad_norm": 0.2373046875, + "learning_rate": 0.0017627965043695382, + "loss": 1.3739, "step": 57000 }, { - "epoch": 0.37388646856102475, - "grad_norm": 0.828125, - "learning_rate": 0.001985044541257559, - "loss": 1.6974, + "epoch": 5.982105701206825, + "grad_norm": 0.30859375, + "learning_rate": 0.001760715771951727, + "loss": 1.3744, "step": 57500 }, { - "epoch": 0.3771376552441641, - "grad_norm": 1.0546875, - "learning_rate": 0.0019849144937902335, - "loss": 1.695, + "epoch": 6.0, + "eval_loss": 1.4039781093597412, + "eval_runtime": 1.6711, + "eval_samples_per_second": 598.397, + "eval_steps_per_second": 0.598, + "step": 57672 + }, + { + "epoch": 6.034124011652102, + "grad_norm": 0.2451171875, + "learning_rate": 0.001758635039533916, + "loss": 1.3701, "step": 58000 }, { - "epoch": 0.3803888419273035, - "grad_norm": 1.7421875, - "learning_rate": 0.0019847844463229077, - "loss": 1.7002, + "epoch": 6.086142322097379, + "grad_norm": 0.2197265625, + "learning_rate": 0.001756554307116105, + "loss": 1.3629, "step": 58500 }, { - "epoch": 0.38364002861044283, - "grad_norm": 0.59765625, - "learning_rate": 0.0019846543988555824, - "loss": 1.6998, + "epoch": 6.138160632542655, + "grad_norm": 0.2451171875, + "learning_rate": 0.0017544735746982938, + "loss": 1.3656, "step": 59000 }, { - "epoch": 0.38689121529358217, - "grad_norm": 0.640625, - "learning_rate": 0.0019845243513882567, - "loss": 1.7108, + "epoch": 6.190178942987932, + "grad_norm": 0.23046875, + "learning_rate": 0.0017523928422804827, + "loss": 1.3673, "step": 59500 }, { - "epoch": 0.3901424019767215, - "grad_norm": 0.578125, - "learning_rate": 0.001984394303920931, - "loss": 1.6777, + "epoch": 6.242197253433209, + "grad_norm": 0.2080078125, + "learning_rate": 0.0017503121098626717, + "loss": 1.363, "step": 
60000 }, { - "epoch": 0.39339358865986085, - "grad_norm": 0.75390625, - "learning_rate": 0.0019842642564536057, - "loss": 1.6866, + "epoch": 6.294215563878486, + "grad_norm": 1.046875, + "learning_rate": 0.0017482313774448606, + "loss": 1.3632, "step": 60500 }, { - "epoch": 0.3966447753430002, - "grad_norm": 1.890625, - "learning_rate": 0.00198413420898628, - "loss": 1.6838, + "epoch": 6.346233874323762, + "grad_norm": 0.359375, + "learning_rate": 0.0017461506450270495, + "loss": 1.36, "step": 61000 }, { - "epoch": 0.39989596202613953, - "grad_norm": 0.7421875, - "learning_rate": 0.001984004161518954, - "loss": 1.6834, + "epoch": 6.398252184769039, + "grad_norm": 0.58203125, + "learning_rate": 0.0017440699126092386, + "loss": 1.3578, "step": 61500 }, { - "epoch": 0.40314714870927887, - "grad_norm": 0.6875, - "learning_rate": 0.001983874114051629, - "loss": 1.6837, + "epoch": 6.4502704952143155, + "grad_norm": 0.2421875, + "learning_rate": 0.0017419891801914273, + "loss": 1.3622, "step": 62000 }, { - "epoch": 0.4063983353924182, - "grad_norm": 0.80859375, - "learning_rate": 0.0019837440665843036, - "loss": 1.6869, + "epoch": 6.502288805659592, + "grad_norm": 0.21484375, + "learning_rate": 0.0017399084477736164, + "loss": 1.3607, "step": 62500 }, { - "epoch": 0.4096495220755576, - "grad_norm": 0.87890625, - "learning_rate": 0.001983614019116978, - "loss": 1.7305, + "epoch": 6.554307116104869, + "grad_norm": 0.39453125, + "learning_rate": 0.0017378277153558054, + "loss": 1.3552, "step": 63000 }, { - "epoch": 0.41290070875869694, - "grad_norm": 1.0703125, - "learning_rate": 0.001983483971649652, - "loss": 1.746, + "epoch": 6.606325426550145, + "grad_norm": 0.322265625, + "learning_rate": 0.001735746982937994, + "loss": 1.3518, "step": 63500 }, { - "epoch": 0.4161518954418363, - "grad_norm": 0.73046875, - "learning_rate": 0.001983353924182327, - "loss": 1.7387, + "epoch": 6.658343736995422, + "grad_norm": 0.310546875, + "learning_rate": 0.0017336662505201832, + "loss": 1.3498, "step": 64000 }, { - "epoch": 0.4194030821249756, - "grad_norm": 0.90234375, - "learning_rate": 0.001983223876715001, - "loss": 1.7394, + "epoch": 6.710362047440699, + "grad_norm": 1.1484375, + "learning_rate": 0.001731585518102372, + "loss": 1.3528, "step": 64500 }, { - "epoch": 0.42265426880811496, - "grad_norm": 1.203125, - "learning_rate": 0.0019830938292476753, - "loss": 1.7914, + "epoch": 6.762380357885976, + "grad_norm": 0.216796875, + "learning_rate": 0.001729504785684561, + "loss": 1.3528, "step": 65000 }, { - "epoch": 0.4259054554912543, - "grad_norm": 1.1484375, - "learning_rate": 0.00198296378178035, - "loss": 1.7304, + "epoch": 6.814398668331252, + "grad_norm": 0.234375, + "learning_rate": 0.00172742405326675, + "loss": 1.3514, "step": 65500 }, { - "epoch": 0.42915664217439364, - "grad_norm": 0.67578125, - "learning_rate": 0.0019828337343130243, - "loss": 1.7173, + "epoch": 6.866416978776529, + "grad_norm": 0.44921875, + "learning_rate": 0.0017253433208489388, + "loss": 1.3519, "step": 66000 }, { - "epoch": 0.432407828857533, - "grad_norm": 0.69140625, - "learning_rate": 0.0019827036868456985, - "loss": 1.7409, + "epoch": 6.918435289221806, + "grad_norm": 0.2412109375, + "learning_rate": 0.0017232625884311278, + "loss": 1.3475, "step": 66500 }, { - "epoch": 0.4356590155406723, - "grad_norm": 0.734375, - "learning_rate": 0.0019825736393783732, - "loss": 1.7136, + "epoch": 6.970453599667083, + "grad_norm": 0.51171875, + "learning_rate": 0.0017211818560133169, + "loss": 1.3498, "step": 67000 }, { - "epoch": 
0.4389102022238117, - "grad_norm": 0.80078125, - "learning_rate": 0.0019824435919110475, - "loss": 1.7297, + "epoch": 7.0, + "eval_loss": 1.3733755350112915, + "eval_runtime": 1.5013, + "eval_samples_per_second": 666.111, + "eval_steps_per_second": 0.666, + "step": 67284 + }, + { + "epoch": 7.022471910112359, + "grad_norm": 0.2373046875, + "learning_rate": 0.0017191011235955056, + "loss": 1.348, "step": 67500 }, { - "epoch": 0.44216138890695106, - "grad_norm": 0.79296875, - "learning_rate": 0.0019823135444437217, - "loss": 1.7409, + "epoch": 7.074490220557636, + "grad_norm": 0.2333984375, + "learning_rate": 0.0017170203911776945, + "loss": 1.3467, "step": 68000 }, { - "epoch": 0.4454125755900904, - "grad_norm": 1.765625, - "learning_rate": 0.0019821834969763964, - "loss": 1.7818, + "epoch": 7.126508531002913, + "grad_norm": 0.26953125, + "learning_rate": 0.0017149396587598836, + "loss": 1.3484, "step": 68500 }, { - "epoch": 0.44866376227322974, - "grad_norm": 0.70703125, - "learning_rate": 0.001982053449509071, - "loss": 1.7189, + "epoch": 7.17852684144819, + "grad_norm": 0.216796875, + "learning_rate": 0.0017128589263420723, + "loss": 1.3504, "step": 69000 }, { - "epoch": 0.4519149489563691, - "grad_norm": 6.21875, - "learning_rate": 0.0019819234020417454, - "loss": 1.7128, + "epoch": 7.230545151893467, + "grad_norm": 0.27734375, + "learning_rate": 0.0017107781939242615, + "loss": 1.3474, "step": 69500 }, { - "epoch": 0.4551661356395084, - "grad_norm": 0.75, - "learning_rate": 0.0019817933545744197, - "loss": 1.6918, + "epoch": 7.282563462338743, + "grad_norm": 0.4296875, + "learning_rate": 0.0017086974615064504, + "loss": 1.3457, "step": 70000 }, { - "epoch": 0.45841732232264776, - "grad_norm": 0.81640625, - "learning_rate": 0.0019816633071070944, - "loss": 1.6941, + "epoch": 7.33458177278402, + "grad_norm": 0.494140625, + "learning_rate": 0.0017066167290886393, + "loss": 1.344, "step": 70500 }, { - "epoch": 0.4616685090057871, - "grad_norm": 1.0546875, - "learning_rate": 0.0019815332596397686, - "loss": 1.6888, + "epoch": 7.386600083229297, + "grad_norm": 0.259765625, + "learning_rate": 0.0017045359966708282, + "loss": 1.3409, "step": 71000 }, { - "epoch": 0.46491969568892644, - "grad_norm": 0.88671875, - "learning_rate": 0.001981403212172443, - "loss": 1.689, + "epoch": 7.438618393674574, + "grad_norm": 0.267578125, + "learning_rate": 0.001702455264253017, + "loss": 1.3427, "step": 71500 }, { - "epoch": 0.4681708823720658, - "grad_norm": 4.75, - "learning_rate": 0.0019812731647051176, - "loss": 1.69, + "epoch": 7.49063670411985, + "grad_norm": 0.2236328125, + "learning_rate": 0.001700374531835206, + "loss": 1.3446, "step": 72000 }, { - "epoch": 0.4714220690552052, - "grad_norm": 1.0078125, - "learning_rate": 0.001981143117237792, - "loss": 1.6951, + "epoch": 7.542655014565127, + "grad_norm": 0.26953125, + "learning_rate": 0.001698293799417395, + "loss": 1.3427, "step": 72500 }, { - "epoch": 0.4746732557383445, - "grad_norm": 0.86328125, - "learning_rate": 0.001981013069770466, - "loss": 1.6849, + "epoch": 7.594673325010404, + "grad_norm": 0.251953125, + "learning_rate": 0.0016962130669995838, + "loss": 1.3451, "step": 73000 }, { - "epoch": 0.47792444242148385, - "grad_norm": 0.70703125, - "learning_rate": 0.001980883022303141, - "loss": 1.7031, + "epoch": 7.646691635455681, + "grad_norm": 0.236328125, + "learning_rate": 0.0016941323345817728, + "loss": 1.3473, "step": 73500 }, { - "epoch": 0.4811756291046232, - "grad_norm": 0.6328125, - "learning_rate": 0.001980752974835815, - "loss": 
1.6962, + "epoch": 7.698709945900957, + "grad_norm": 0.2490234375, + "learning_rate": 0.001692051602163962, + "loss": 1.3487, "step": 74000 }, { - "epoch": 0.48442681578776253, - "grad_norm": 0.76171875, - "learning_rate": 0.0019806229273684893, - "loss": 1.7377, + "epoch": 7.750728256346234, + "grad_norm": 0.349609375, + "learning_rate": 0.0016899708697461506, + "loss": 1.3565, "step": 74500 }, { - "epoch": 0.4876780024709019, - "grad_norm": 0.77734375, - "learning_rate": 0.001980492879901164, - "loss": 1.7273, + "epoch": 7.802746566791511, + "grad_norm": 0.291015625, + "learning_rate": 0.0016878901373283395, + "loss": 1.348, "step": 75000 }, { - "epoch": 0.4909291891540412, - "grad_norm": 0.8515625, - "learning_rate": 0.0019803628324338383, - "loss": 1.703, + "epoch": 7.8547648772367875, + "grad_norm": 0.2021484375, + "learning_rate": 0.0016858094049105286, + "loss": 1.3478, "step": 75500 }, { - "epoch": 0.49418037583718055, - "grad_norm": 0.83984375, - "learning_rate": 0.0019802327849665125, - "loss": 1.7018, + "epoch": 7.9067831876820645, + "grad_norm": 0.259765625, + "learning_rate": 0.0016837286724927173, + "loss": 1.3484, "step": 76000 }, { - "epoch": 0.4974315625203199, - "grad_norm": 0.6015625, - "learning_rate": 0.0019801027374991872, - "loss": 1.7036, + "epoch": 7.9588014981273405, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016816479400749065, + "loss": 1.3457, "step": 76500 }, { - "epoch": 0.5006827492034592, - "grad_norm": 1.25, - "learning_rate": 0.001979972690031862, - "loss": 1.7149, + "epoch": 8.0, + "eval_loss": 1.3691484928131104, + "eval_runtime": 1.5204, + "eval_samples_per_second": 657.725, + "eval_steps_per_second": 0.658, + "step": 76896 + }, + { + "epoch": 8.010819808572618, + "grad_norm": 0.244140625, + "learning_rate": 0.0016795672076570954, + "loss": 1.3419, "step": 77000 }, { - "epoch": 0.5039339358865986, - "grad_norm": 1.109375, - "learning_rate": 0.001979842642564536, - "loss": 1.7483, + "epoch": 8.062838119017893, + "grad_norm": 0.271484375, + "learning_rate": 0.0016774864752392843, + "loss": 1.3375, "step": 77500 }, { - "epoch": 0.5071851225697379, - "grad_norm": 0.9453125, - "learning_rate": 0.0019797125950972105, - "loss": 1.731, + "epoch": 8.11485642946317, + "grad_norm": 0.19921875, + "learning_rate": 0.0016754057428214732, + "loss": 1.3368, "step": 78000 }, { - "epoch": 0.5104363092528773, - "grad_norm": 0.859375, - "learning_rate": 0.001979582547629885, - "loss": 1.7228, + "epoch": 8.166874739908447, + "grad_norm": 0.349609375, + "learning_rate": 0.0016733250104036621, + "loss": 1.3385, "step": 78500 }, { - "epoch": 0.5136874959360166, - "grad_norm": 1.03125, - "learning_rate": 0.0019794525001625594, - "loss": 1.7164, + "epoch": 8.218893050353724, + "grad_norm": 0.28125, + "learning_rate": 0.001671244277985851, + "loss": 1.3329, "step": 79000 }, { - "epoch": 0.516938682619156, - "grad_norm": 6.75, - "learning_rate": 0.0019793224526952337, - "loss": 1.7132, + "epoch": 8.270911360799001, + "grad_norm": 0.462890625, + "learning_rate": 0.0016691635455680402, + "loss": 1.3346, "step": 79500 }, { - "epoch": 0.5201898693022954, - "grad_norm": 1.0078125, - "learning_rate": 0.0019791924052279084, - "loss": 1.7004, + "epoch": 8.322929671244278, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016670828131502289, + "loss": 1.3342, "step": 80000 }, { - "epoch": 0.5234410559854347, - "grad_norm": 0.73828125, - "learning_rate": 0.0019790623577605826, - "loss": 1.7019, + "epoch": 8.374947981689555, + "grad_norm": 0.2099609375, + "learning_rate": 
0.0016650020807324178, + "loss": 1.3313, "step": 80500 }, { - "epoch": 0.5266922426685741, - "grad_norm": 0.5859375, - "learning_rate": 0.001978932310293257, - "loss": 1.7002, + "epoch": 8.426966292134832, + "grad_norm": 0.265625, + "learning_rate": 0.001662921348314607, + "loss": 1.33, "step": 81000 }, { - "epoch": 0.5299434293517133, - "grad_norm": 0.72265625, - "learning_rate": 0.0019788022628259316, - "loss": 1.6903, + "epoch": 8.478984602580109, + "grad_norm": 0.244140625, + "learning_rate": 0.0016608406158967956, + "loss": 1.3321, "step": 81500 }, { - "epoch": 0.5331946160348527, - "grad_norm": 1.2578125, - "learning_rate": 0.001978672215358606, - "loss": 1.6837, + "epoch": 8.531002913025384, + "grad_norm": 0.2373046875, + "learning_rate": 0.0016587598834789845, + "loss": 1.3322, "step": 82000 }, { - "epoch": 0.536445802717992, - "grad_norm": 0.7890625, - "learning_rate": 0.00197854216789128, - "loss": 1.6843, + "epoch": 8.583021223470661, + "grad_norm": 0.2412109375, + "learning_rate": 0.0016566791510611736, + "loss": 1.3354, "step": 82500 }, { - "epoch": 0.5396969894011314, - "grad_norm": 0.9375, - "learning_rate": 0.001978412120423955, - "loss": 1.6899, + "epoch": 8.635039533915938, + "grad_norm": 0.26171875, + "learning_rate": 0.0016545984186433626, + "loss": 1.3358, "step": 83000 }, { - "epoch": 0.5429481760842707, - "grad_norm": 0.61328125, - "learning_rate": 0.0019782820729566295, - "loss": 1.685, + "epoch": 8.687057844361215, + "grad_norm": 0.2109375, + "learning_rate": 0.0016525176862255513, + "loss": 1.3303, "step": 83500 }, { - "epoch": 0.5461993627674101, - "grad_norm": 0.73828125, - "learning_rate": 0.0019781520254893038, - "loss": 1.6823, + "epoch": 8.739076154806492, + "grad_norm": 0.1865234375, + "learning_rate": 0.0016504369538077404, + "loss": 1.3332, "step": 84000 }, { - "epoch": 0.5494505494505495, - "grad_norm": 1.7421875, - "learning_rate": 0.001978021978021978, - "loss": 1.6779, + "epoch": 8.791094465251769, + "grad_norm": 0.201171875, + "learning_rate": 0.0016483562213899293, + "loss": 1.3337, "step": 84500 }, { - "epoch": 0.5527017361336888, - "grad_norm": 0.69921875, - "learning_rate": 0.0019778919305546527, - "loss": 1.6857, + "epoch": 8.843112775697046, + "grad_norm": 0.6015625, + "learning_rate": 0.0016462754889721182, + "loss": 1.3321, "step": 85000 }, { - "epoch": 0.5559529228168282, - "grad_norm": 0.6953125, - "learning_rate": 0.001977761883087327, - "loss": 1.6778, + "epoch": 8.895131086142323, + "grad_norm": 0.205078125, + "learning_rate": 0.0016441947565543071, + "loss": 1.3283, "step": 85500 }, { - "epoch": 0.5592041094999675, - "grad_norm": 0.9765625, - "learning_rate": 0.0019776318356200012, - "loss": 1.6724, + "epoch": 8.947149396587598, + "grad_norm": 0.376953125, + "learning_rate": 0.001642114024136496, + "loss": 1.3306, "step": 86000 }, { - "epoch": 0.5624552961831069, - "grad_norm": 0.94921875, - "learning_rate": 0.001977501788152676, - "loss": 1.6823, + "epoch": 8.999167707032875, + "grad_norm": 0.2099609375, + "learning_rate": 0.0016400332917186852, + "loss": 1.3315, "step": 86500 }, { - "epoch": 0.5657064828662461, - "grad_norm": 0.85546875, - "learning_rate": 0.00197737174068535, - "loss": 1.6812, + "epoch": 9.0, + "eval_loss": 1.3568580150604248, + "eval_runtime": 1.6522, + "eval_samples_per_second": 605.266, + "eval_steps_per_second": 0.605, + "step": 86508 + }, + { + "epoch": 9.051186017478152, + "grad_norm": 0.77734375, + "learning_rate": 0.0016379525593008739, + "loss": 1.3232, "step": 87000 }, { - "epoch": 0.5689576695493855, - 
"grad_norm": 0.6328125, - "learning_rate": 0.0019772416932180245, - "loss": 1.6774, + "epoch": 9.103204327923429, + "grad_norm": 0.2265625, + "learning_rate": 0.0016358718268830628, + "loss": 1.3251, "step": 87500 }, { - "epoch": 0.5722088562325248, - "grad_norm": 0.8515625, - "learning_rate": 0.001977111645750699, - "loss": 1.6614, + "epoch": 9.155222638368706, + "grad_norm": 0.21484375, + "learning_rate": 0.001633791094465252, + "loss": 1.3295, "step": 88000 }, { - "epoch": 0.5754600429156642, - "grad_norm": 1.5625, - "learning_rate": 0.0019769815982833734, - "loss": 1.657, + "epoch": 9.207240948813983, + "grad_norm": 0.25, + "learning_rate": 0.0016317103620474408, + "loss": 1.3261, "step": 88500 }, { - "epoch": 0.5787112295988036, - "grad_norm": 0.625, - "learning_rate": 0.0019768515508160477, - "loss": 1.6616, + "epoch": 9.25925925925926, + "grad_norm": 0.171875, + "learning_rate": 0.0016296296296296295, + "loss": 1.3288, "step": 89000 }, { - "epoch": 0.5819624162819429, - "grad_norm": 1.421875, - "learning_rate": 0.0019767215033487224, - "loss": 1.6561, + "epoch": 9.311277569704536, + "grad_norm": 0.1962890625, + "learning_rate": 0.0016275488972118187, + "loss": 1.3276, "step": 89500 }, { - "epoch": 0.5852136029650823, - "grad_norm": 1.3046875, - "learning_rate": 0.0019765914558813966, - "loss": 1.6612, + "epoch": 9.363295880149813, + "grad_norm": 0.22265625, + "learning_rate": 0.0016254681647940076, + "loss": 1.3257, "step": 90000 }, { - "epoch": 0.5884647896482216, - "grad_norm": 0.65234375, - "learning_rate": 0.001976461408414071, - "loss": 1.6658, + "epoch": 9.41531419059509, + "grad_norm": 0.26171875, + "learning_rate": 0.0016233874323761963, + "loss": 1.3219, "step": 90500 }, { - "epoch": 0.591715976331361, - "grad_norm": 0.71875, - "learning_rate": 0.0019763313609467456, - "loss": 1.6678, + "epoch": 9.467332501040365, + "grad_norm": 0.2470703125, + "learning_rate": 0.0016213066999583854, + "loss": 1.3219, "step": 91000 }, { - "epoch": 0.5949671630145003, - "grad_norm": 2.4375, - "learning_rate": 0.0019762013134794203, - "loss": 1.6657, + "epoch": 9.519350811485642, + "grad_norm": 0.2109375, + "learning_rate": 0.0016192259675405743, + "loss": 1.3216, "step": 91500 }, { - "epoch": 0.5982183496976397, - "grad_norm": 0.67578125, - "learning_rate": 0.0019760712660120945, - "loss": 1.6659, + "epoch": 9.57136912193092, + "grad_norm": 0.189453125, + "learning_rate": 0.0016171452351227634, + "loss": 1.324, "step": 92000 }, { - "epoch": 0.6014695363807789, - "grad_norm": 0.765625, - "learning_rate": 0.001975941218544769, - "loss": 1.6694, + "epoch": 9.623387432376196, + "grad_norm": 0.1943359375, + "learning_rate": 0.0016150645027049521, + "loss": 1.3212, "step": 92500 }, { - "epoch": 0.6047207230639183, - "grad_norm": 1.5859375, - "learning_rate": 0.0019758111710774435, - "loss": 1.6644, + "epoch": 9.675405742821473, + "grad_norm": 0.2041015625, + "learning_rate": 0.001612983770287141, + "loss": 1.3217, "step": 93000 }, { - "epoch": 0.6079719097470577, - "grad_norm": 0.6796875, - "learning_rate": 0.0019756811236101178, - "loss": 1.6526, + "epoch": 9.72742405326675, + "grad_norm": 0.22265625, + "learning_rate": 0.0016109030378693302, + "loss": 1.3219, "step": 93500 }, { - "epoch": 0.611223096430197, - "grad_norm": 0.84375, - "learning_rate": 0.001975551076142792, - "loss": 1.6593, + "epoch": 9.779442363712027, + "grad_norm": 0.2470703125, + "learning_rate": 0.0016088223054515189, + "loss": 1.3219, "step": 94000 }, { - "epoch": 0.6144742831133364, - "grad_norm": 0.83984375, - 
"learning_rate": 0.0019754210286754667, - "loss": 1.6524, + "epoch": 9.831460674157304, + "grad_norm": 0.2080078125, + "learning_rate": 0.0016067415730337078, + "loss": 1.3188, "step": 94500 }, { - "epoch": 0.6177254697964757, - "grad_norm": 1.203125, - "learning_rate": 0.001975290981208141, - "loss": 1.6447, + "epoch": 9.88347898460258, + "grad_norm": 0.2392578125, + "learning_rate": 0.001604660840615897, + "loss": 1.3205, "step": 95000 }, { - "epoch": 0.6209766564796151, - "grad_norm": 0.8125, - "learning_rate": 0.0019751609337408152, - "loss": 1.6478, + "epoch": 9.935497295047856, + "grad_norm": 1.828125, + "learning_rate": 0.0016025801081980858, + "loss": 1.3238, "step": 95500 }, { - "epoch": 0.6242278431627544, - "grad_norm": 0.59765625, - "learning_rate": 0.00197503088627349, - "loss": 1.6488, + "epoch": 9.987515605493133, + "grad_norm": 0.3984375, + "learning_rate": 0.0016004993757802745, + "loss": 1.3224, "step": 96000 }, { - "epoch": 0.6274790298458938, - "grad_norm": 0.70703125, - "learning_rate": 0.001974900838806164, - "loss": 1.6573, + "epoch": 10.0, + "eval_loss": 1.3528562784194946, + "eval_runtime": 1.936, + "eval_samples_per_second": 516.533, + "eval_steps_per_second": 0.517, + "step": 96120 + }, + { + "epoch": 10.03953391593841, + "grad_norm": 0.2490234375, + "learning_rate": 0.0015984186433624637, + "loss": 1.3174, "step": 96500 }, { - "epoch": 0.630730216529033, - "grad_norm": 1.4765625, - "learning_rate": 0.0019747707913388385, - "loss": 1.6593, + "epoch": 10.091552226383687, + "grad_norm": 0.404296875, + "learning_rate": 0.0015963379109446526, + "loss": 1.3199, "step": 97000 }, { - "epoch": 0.6339814032121724, - "grad_norm": 0.73828125, - "learning_rate": 0.001974640743871513, - "loss": 1.6694, + "epoch": 10.143570536828964, + "grad_norm": 0.1982421875, + "learning_rate": 0.0015942571785268413, + "loss": 1.3187, "step": 97500 }, { - "epoch": 0.6372325898953118, - "grad_norm": 0.80078125, - "learning_rate": 0.001974510696404188, - "loss": 1.6712, + "epoch": 10.19558884727424, + "grad_norm": 0.27734375, + "learning_rate": 0.0015921764461090304, + "loss": 1.3205, "step": 98000 }, { - "epoch": 0.6404837765784511, - "grad_norm": 1.890625, - "learning_rate": 0.001974380648936862, - "loss": 1.6678, + "epoch": 10.247607157719518, + "grad_norm": 0.259765625, + "learning_rate": 0.0015900957136912193, + "loss": 1.3201, "step": 98500 }, { - "epoch": 0.6437349632615905, - "grad_norm": 0.8359375, - "learning_rate": 0.0019742506014695364, - "loss": 1.662, + "epoch": 10.299625468164795, + "grad_norm": 0.65625, + "learning_rate": 0.0015880149812734085, + "loss": 1.3193, "step": 99000 }, { - "epoch": 0.6469861499447298, - "grad_norm": 1.1640625, - "learning_rate": 0.001974120554002211, - "loss": 1.6648, + "epoch": 10.35164377861007, + "grad_norm": 0.23828125, + "learning_rate": 0.0015859342488555972, + "loss": 1.3181, "step": 99500 }, { - "epoch": 0.6502373366278692, - "grad_norm": 0.66796875, - "learning_rate": 0.0019739905065348853, - "loss": 1.6518, + "epoch": 10.403662089055347, + "grad_norm": 0.23828125, + "learning_rate": 0.001583853516437786, + "loss": 1.3169, "step": 100000 }, { - "epoch": 0.6534885233110085, - "grad_norm": 1.6484375, - "learning_rate": 0.0019738604590675596, - "loss": 1.6528, + "epoch": 10.455680399500624, + "grad_norm": 0.298828125, + "learning_rate": 0.0015817727840199752, + "loss": 1.3162, "step": 100500 }, { - "epoch": 0.6567397099941479, - "grad_norm": 0.84375, - "learning_rate": 0.0019737304116002343, - "loss": 1.6754, + "epoch": 10.5076987099459, + 
"grad_norm": 0.458984375, + "learning_rate": 0.0015796920516021641, + "loss": 1.3217, "step": 101000 }, { - "epoch": 0.6599908966772872, - "grad_norm": 2.1875, - "learning_rate": 0.0019736003641329086, - "loss": 1.68, + "epoch": 10.559717020391178, + "grad_norm": 0.337890625, + "learning_rate": 0.0015776113191843528, + "loss": 1.3245, "step": 101500 }, { - "epoch": 0.6632420833604266, - "grad_norm": 1.1015625, - "learning_rate": 0.001973470316665583, - "loss": 1.6636, + "epoch": 10.611735330836455, + "grad_norm": 0.369140625, + "learning_rate": 0.001575530586766542, + "loss": 1.3233, "step": 102000 }, { - "epoch": 0.666493270043566, - "grad_norm": 0.76171875, - "learning_rate": 0.0019733402691982575, - "loss": 1.6543, + "epoch": 10.663753641281732, + "grad_norm": 0.31640625, + "learning_rate": 0.0015734498543487309, + "loss": 1.3182, "step": 102500 }, { - "epoch": 0.6697444567267052, - "grad_norm": 0.93359375, - "learning_rate": 0.0019732102217309318, - "loss": 1.6503, + "epoch": 10.715771951727008, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015713691219309195, + "loss": 1.3163, "step": 103000 }, { - "epoch": 0.6729956434098446, - "grad_norm": 1.3203125, - "learning_rate": 0.001973080174263606, - "loss": 1.6522, + "epoch": 10.767790262172285, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015692883895131087, + "loss": 1.3181, "step": 103500 }, { - "epoch": 0.6762468300929839, - "grad_norm": 1.0546875, - "learning_rate": 0.0019729501267962807, - "loss": 1.6545, + "epoch": 10.81980857261756, + "grad_norm": 0.25390625, + "learning_rate": 0.0015672076570952976, + "loss": 1.3171, "step": 104000 }, { - "epoch": 0.6794980167761233, - "grad_norm": 0.90234375, - "learning_rate": 0.001972820079328955, - "loss": 1.651, + "epoch": 10.871826883062838, + "grad_norm": 0.255859375, + "learning_rate": 0.0015651269246774865, + "loss": 1.3177, "step": 104500 }, { - "epoch": 0.6827492034592626, - "grad_norm": 0.75, - "learning_rate": 0.0019726900318616293, - "loss": 1.6601, + "epoch": 10.923845193508114, + "grad_norm": 0.306640625, + "learning_rate": 0.0015630461922596754, + "loss": 1.3211, "step": 105000 }, { - "epoch": 0.686000390142402, - "grad_norm": 0.64453125, - "learning_rate": 0.001972559984394304, - "loss": 1.6394, + "epoch": 10.975863503953391, + "grad_norm": 0.34765625, + "learning_rate": 0.0015609654598418643, + "loss": 1.3183, "step": 105500 }, { - "epoch": 0.6892515768255413, - "grad_norm": 0.68359375, - "learning_rate": 0.0019724299369269786, - "loss": 1.639, + "epoch": 11.0, + "eval_loss": 1.347601056098938, + "eval_runtime": 1.5374, + "eval_samples_per_second": 650.453, + "eval_steps_per_second": 0.65, + "step": 105732 + }, + { + "epoch": 11.027881814398668, + "grad_norm": 0.21484375, + "learning_rate": 0.0015588847274240535, + "loss": 1.315, "step": 106000 }, { - "epoch": 0.6925027635086807, - "grad_norm": 0.6328125, - "learning_rate": 0.001972299889459653, - "loss": 1.629, + "epoch": 11.079900124843945, + "grad_norm": 0.2138671875, + "learning_rate": 0.0015568039950062422, + "loss": 1.3125, "step": 106500 }, { - "epoch": 0.6957539501918201, - "grad_norm": 1.234375, - "learning_rate": 0.001972169841992327, - "loss": 1.645, + "epoch": 11.131918435289222, + "grad_norm": 0.1865234375, + "learning_rate": 0.001554723262588431, + "loss": 1.3114, "step": 107000 }, { - "epoch": 0.6990051368749594, - "grad_norm": 1.0390625, - "learning_rate": 0.001972039794525002, - "loss": 1.6692, + "epoch": 11.1839367457345, + "grad_norm": 0.2412109375, + "learning_rate": 0.0015526425301706202, + 
"loss": 1.3104, "step": 107500 }, { - "epoch": 0.7022563235580987, - "grad_norm": 0.71875, - "learning_rate": 0.001971909747057676, - "loss": 1.6579, + "epoch": 11.235955056179776, + "grad_norm": 0.412109375, + "learning_rate": 0.0015505617977528091, + "loss": 1.3109, "step": 108000 }, { - "epoch": 0.705507510241238, - "grad_norm": 1.359375, - "learning_rate": 0.0019717796995903504, - "loss": 1.6385, + "epoch": 11.287973366625051, + "grad_norm": 0.2265625, + "learning_rate": 0.0015484810653349978, + "loss": 1.3119, "step": 108500 }, { - "epoch": 0.7087586969243774, - "grad_norm": 0.734375, - "learning_rate": 0.001971649652123025, - "loss": 1.648, + "epoch": 11.339991677070328, + "grad_norm": 0.310546875, + "learning_rate": 0.001546400332917187, + "loss": 1.31, "step": 109000 }, { - "epoch": 0.7120098836075167, - "grad_norm": 1.6015625, - "learning_rate": 0.0019715196046556993, - "loss": 1.6436, + "epoch": 11.392009987515605, + "grad_norm": 0.50390625, + "learning_rate": 0.0015443196004993759, + "loss": 1.3114, "step": 109500 }, { - "epoch": 0.7152610702906561, - "grad_norm": 1.0234375, - "learning_rate": 0.0019713895571883736, - "loss": 1.6362, + "epoch": 11.444028297960882, + "grad_norm": 0.40234375, + "learning_rate": 0.0015422388680815646, + "loss": 1.3133, "step": 110000 }, { - "epoch": 0.7185122569737954, - "grad_norm": 0.71484375, - "learning_rate": 0.0019712595097210483, - "loss": 1.645, + "epoch": 11.496046608406159, + "grad_norm": 0.2216796875, + "learning_rate": 0.0015401581356637537, + "loss": 1.3129, "step": 110500 }, { - "epoch": 0.7217634436569348, - "grad_norm": 0.5234375, - "learning_rate": 0.0019711294622537226, - "loss": 1.642, + "epoch": 11.548064918851436, + "grad_norm": 0.267578125, + "learning_rate": 0.0015380774032459426, + "loss": 1.3125, "step": 111000 }, { - "epoch": 0.7250146303400741, - "grad_norm": 0.734375, - "learning_rate": 0.001970999414786397, - "loss": 1.6355, + "epoch": 11.600083229296713, + "grad_norm": 0.212890625, + "learning_rate": 0.0015359966708281315, + "loss": 1.312, "step": 111500 }, { - "epoch": 0.7282658170232135, - "grad_norm": 0.64453125, - "learning_rate": 0.0019708693673190715, - "loss": 1.6339, + "epoch": 11.65210153974199, + "grad_norm": 0.19140625, + "learning_rate": 0.0015339159384103204, + "loss": 1.3107, "step": 112000 }, { - "epoch": 0.7315170037063529, - "grad_norm": 0.98046875, - "learning_rate": 0.001970739319851746, - "loss": 1.6294, + "epoch": 11.704119850187267, + "grad_norm": 0.251953125, + "learning_rate": 0.0015318352059925093, + "loss": 1.3116, "step": 112500 }, { - "epoch": 0.7347681903894921, - "grad_norm": 3.84375, - "learning_rate": 0.0019706092723844205, - "loss": 1.6307, + "epoch": 11.756138160632542, + "grad_norm": 0.189453125, + "learning_rate": 0.0015297544735746985, + "loss": 1.3104, "step": 113000 }, { - "epoch": 0.7380193770726315, - "grad_norm": 0.75, - "learning_rate": 0.0019704792249170947, - "loss": 1.6373, + "epoch": 11.808156471077819, + "grad_norm": 0.353515625, + "learning_rate": 0.0015276737411568874, + "loss": 1.3102, "step": 113500 }, { - "epoch": 0.7412705637557708, - "grad_norm": 0.88671875, - "learning_rate": 0.0019703491774497694, - "loss": 1.6368, + "epoch": 11.860174781523096, + "grad_norm": 0.2314453125, + "learning_rate": 0.001525593008739076, + "loss": 1.309, "step": 114000 }, { - "epoch": 0.7445217504389102, - "grad_norm": 42.75, - "learning_rate": 0.0019702191299824437, - "loss": 1.6278, + "epoch": 11.912193091968373, + "grad_norm": 0.2431640625, + "learning_rate": 0.0015235122763212652, + 
"loss": 1.3096, "step": 114500 }, { - "epoch": 0.7477729371220495, - "grad_norm": 3.53125, - "learning_rate": 0.001970089082515118, - "loss": 1.6316, + "epoch": 11.96421140241365, + "grad_norm": 0.9296875, + "learning_rate": 0.0015214315439034541, + "loss": 1.307, "step": 115000 }, { - "epoch": 0.7510241238051889, - "grad_norm": 1.3203125, - "learning_rate": 0.0019699590350477927, - "loss": 1.6202, + "epoch": 12.0, + "eval_loss": 1.3371446132659912, + "eval_runtime": 1.4263, + "eval_samples_per_second": 701.11, + "eval_steps_per_second": 0.701, + "step": 115344 + }, + { + "epoch": 12.016229712858927, + "grad_norm": 0.185546875, + "learning_rate": 0.0015193508114856428, + "loss": 1.3041, "step": 115500 }, { - "epoch": 0.7542753104883282, - "grad_norm": 0.83984375, - "learning_rate": 0.001969828987580467, - "loss": 1.6194, + "epoch": 12.068248023304204, + "grad_norm": 0.345703125, + "learning_rate": 0.001517270079067832, + "loss": 1.3025, "step": 116000 }, { - "epoch": 0.7575264971714676, - "grad_norm": 0.77734375, - "learning_rate": 0.001969698940113141, - "loss": 1.6219, + "epoch": 12.12026633374948, + "grad_norm": 0.232421875, + "learning_rate": 0.0015151893466500209, + "loss": 1.3041, "step": 116500 }, { - "epoch": 0.760777683854607, - "grad_norm": 1.5703125, - "learning_rate": 0.001969568892645816, - "loss": 1.6229, + "epoch": 12.172284644194757, + "grad_norm": 0.20703125, + "learning_rate": 0.0015131086142322098, + "loss": 1.3063, "step": 117000 }, { - "epoch": 0.7640288705377463, - "grad_norm": 0.7578125, - "learning_rate": 0.00196943884517849, - "loss": 1.6253, + "epoch": 12.224302954640033, + "grad_norm": 0.2119140625, + "learning_rate": 0.0015110278818143987, + "loss": 1.3037, "step": 117500 }, { - "epoch": 0.7672800572208857, - "grad_norm": 0.8984375, - "learning_rate": 0.0019693087977111644, - "loss": 1.6203, + "epoch": 12.27632126508531, + "grad_norm": 0.2099609375, + "learning_rate": 0.0015089471493965876, + "loss": 1.3049, "step": 118000 }, { - "epoch": 0.7705312439040249, - "grad_norm": 2.546875, - "learning_rate": 0.001969178750243839, - "loss": 1.6222, + "epoch": 12.328339575530586, + "grad_norm": 0.32421875, + "learning_rate": 0.0015068664169787765, + "loss": 1.3033, "step": 118500 }, { - "epoch": 0.7737824305871643, - "grad_norm": 0.89453125, - "learning_rate": 0.0019690487027765134, - "loss": 1.6241, + "epoch": 12.380357885975863, + "grad_norm": 0.1884765625, + "learning_rate": 0.0015047856845609654, + "loss": 1.3032, "step": 119000 }, { - "epoch": 0.7770336172703036, - "grad_norm": 0.6484375, - "learning_rate": 0.0019689186553091876, - "loss": 1.6173, + "epoch": 12.43237619642114, + "grad_norm": 0.3125, + "learning_rate": 0.0015027049521431544, + "loss": 1.3031, "step": 119500 }, { - "epoch": 0.780284803953443, - "grad_norm": 1.4921875, - "learning_rate": 0.0019687886078418623, - "loss": 1.6136, + "epoch": 12.484394506866417, + "grad_norm": 0.291015625, + "learning_rate": 0.0015006242197253433, + "loss": 1.3031, "step": 120000 }, { - "epoch": 0.7835359906365823, - "grad_norm": 0.6484375, - "learning_rate": 0.001968658560374537, - "loss": 1.6079, + "epoch": 12.536412817311694, + "grad_norm": 0.90625, + "learning_rate": 0.0014985434873075324, + "loss": 1.304, "step": 120500 }, { - "epoch": 0.7867871773197217, - "grad_norm": 0.9296875, - "learning_rate": 0.0019685285129072113, - "loss": 1.6035, + "epoch": 12.588431127756971, + "grad_norm": 0.2216796875, + "learning_rate": 0.001496462754889721, + "loss": 1.3042, "step": 121000 }, { - "epoch": 0.7900383640028611, - 
"grad_norm": 0.71484375, - "learning_rate": 0.0019683984654398855, - "loss": 1.6098, + "epoch": 12.640449438202246, + "grad_norm": 0.283203125, + "learning_rate": 0.0014943820224719102, + "loss": 1.3055, "step": 121500 }, { - "epoch": 0.7932895506860004, - "grad_norm": 0.74609375, - "learning_rate": 0.0019682684179725602, - "loss": 1.6112, + "epoch": 12.692467748647523, + "grad_norm": 0.20703125, + "learning_rate": 0.0014923012900540991, + "loss": 1.3061, "step": 122000 }, { - "epoch": 0.7965407373691398, - "grad_norm": 0.83984375, - "learning_rate": 0.0019681383705052345, - "loss": 1.6075, + "epoch": 12.7444860590928, + "grad_norm": 0.390625, + "learning_rate": 0.0014902205576362878, + "loss": 1.304, "step": 122500 }, { - "epoch": 0.7997919240522791, - "grad_norm": 0.62109375, - "learning_rate": 0.0019680083230379087, - "loss": 1.6124, + "epoch": 12.796504369538077, + "grad_norm": 0.408203125, + "learning_rate": 0.001488139825218477, + "loss": 1.3048, "step": 123000 }, { - "epoch": 0.8030431107354185, - "grad_norm": 0.73046875, - "learning_rate": 0.0019678782755705834, - "loss": 1.6121, + "epoch": 12.848522679983354, + "grad_norm": 0.2099609375, + "learning_rate": 0.0014860590928006659, + "loss": 1.3038, "step": 123500 }, { - "epoch": 0.8062942974185577, - "grad_norm": 1.046875, - "learning_rate": 0.0019677482281032577, - "loss": 1.5979, + "epoch": 12.900540990428631, + "grad_norm": 0.24609375, + "learning_rate": 0.0014839783603828548, + "loss": 1.3029, "step": 124000 }, { - "epoch": 0.8095454841016971, - "grad_norm": 0.6328125, - "learning_rate": 0.001967618180635932, - "loss": 1.6022, + "epoch": 12.952559300873908, + "grad_norm": 0.2021484375, + "learning_rate": 0.0014818976279650437, + "loss": 1.302, "step": 124500 }, { - "epoch": 0.8127966707848364, - "grad_norm": 0.921875, - "learning_rate": 0.0019674881331686067, - "loss": 1.6035, + "epoch": 13.0, + "eval_loss": 1.336362600326538, + "eval_runtime": 1.5551, + "eval_samples_per_second": 643.05, + "eval_steps_per_second": 0.643, + "step": 124956 + }, + { + "epoch": 13.004577611319185, + "grad_norm": 0.27734375, + "learning_rate": 0.0014798168955472326, + "loss": 1.3067, "step": 125000 }, { - "epoch": 0.8160478574679758, - "grad_norm": 0.8203125, - "learning_rate": 0.001967358085701281, - "loss": 1.5999, + "epoch": 13.056595921764462, + "grad_norm": 0.3125, + "learning_rate": 0.0014777361631294215, + "loss": 1.3017, "step": 125500 }, { - "epoch": 0.8192990441511152, - "grad_norm": 0.5, - "learning_rate": 0.001967228038233955, - "loss": 1.6004, + "epoch": 13.108614232209737, + "grad_norm": 0.28515625, + "learning_rate": 0.0014756554307116107, + "loss": 1.3037, "step": 126000 }, { - "epoch": 0.8225502308342545, - "grad_norm": 0.64453125, - "learning_rate": 0.00196709799076663, - "loss": 1.5918, + "epoch": 13.160632542655014, + "grad_norm": 0.28515625, + "learning_rate": 0.0014735746982937994, + "loss": 1.3047, "step": 126500 }, { - "epoch": 0.8258014175173939, - "grad_norm": 0.75, - "learning_rate": 0.0019669679432993046, - "loss": 1.6031, + "epoch": 13.21265085310029, + "grad_norm": 0.1982421875, + "learning_rate": 0.0014714939658759883, + "loss": 1.3034, "step": 127000 }, { - "epoch": 0.8290526042005332, - "grad_norm": 0.7578125, - "learning_rate": 0.001966837895831979, - "loss": 1.6028, + "epoch": 13.264669163545568, + "grad_norm": 7.0, + "learning_rate": 0.0014694132334581774, + "loss": 1.303, "step": 127500 }, { - "epoch": 0.8323037908836726, - "grad_norm": 0.80859375, - "learning_rate": 0.001966707848364653, - "loss": 1.5984, + 
"epoch": 13.316687473990845, + "grad_norm": 0.2275390625, + "learning_rate": 0.0014673325010403661, + "loss": 1.304, "step": 128000 }, { - "epoch": 0.8355549775668119, - "grad_norm": 0.89453125, - "learning_rate": 0.001966577800897328, - "loss": 1.5988, + "epoch": 13.368705784436122, + "grad_norm": 0.240234375, + "learning_rate": 0.0014652517686225552, + "loss": 1.3033, "step": 128500 }, { - "epoch": 0.8388061642499512, - "grad_norm": 1.1171875, - "learning_rate": 0.001966447753430002, - "loss": 1.6013, + "epoch": 13.420724094881399, + "grad_norm": 0.349609375, + "learning_rate": 0.0014631710362047442, + "loss": 1.3014, "step": 129000 }, { - "epoch": 0.8420573509330905, - "grad_norm": 0.73046875, - "learning_rate": 0.0019663177059626763, - "loss": 1.6111, + "epoch": 13.472742405326676, + "grad_norm": 0.2392578125, + "learning_rate": 0.001461090303786933, + "loss": 1.3004, "step": 129500 }, { - "epoch": 0.8453085376162299, - "grad_norm": 0.76171875, - "learning_rate": 0.001966187658495351, - "loss": 1.6014, + "epoch": 13.524760715771952, + "grad_norm": 0.6875, + "learning_rate": 0.001459009571369122, + "loss": 1.3011, "step": 130000 }, { - "epoch": 0.8485597242993693, - "grad_norm": 0.96484375, - "learning_rate": 0.0019660576110280253, - "loss": 1.5989, + "epoch": 13.576779026217228, + "grad_norm": 0.3359375, + "learning_rate": 0.001456928838951311, + "loss": 1.3005, "step": 130500 }, { - "epoch": 0.8518109109825086, - "grad_norm": 5.5, - "learning_rate": 0.0019659275635606995, - "loss": 1.5957, + "epoch": 13.628797336662505, + "grad_norm": 0.1943359375, + "learning_rate": 0.0014548481065334998, + "loss": 1.3022, "step": 131000 }, { - "epoch": 0.855062097665648, - "grad_norm": 0.85546875, - "learning_rate": 0.0019657975160933742, - "loss": 1.5993, + "epoch": 13.680815647107782, + "grad_norm": 0.2197265625, + "learning_rate": 0.0014527673741156887, + "loss": 1.3013, "step": 131500 }, { - "epoch": 0.8583132843487873, - "grad_norm": 1.5625, - "learning_rate": 0.0019656674686260485, - "loss": 1.6048, + "epoch": 13.732833957553058, + "grad_norm": 0.1904296875, + "learning_rate": 0.0014506866416978776, + "loss": 1.3005, "step": 132000 }, { - "epoch": 0.8615644710319267, - "grad_norm": 0.79296875, - "learning_rate": 0.0019655374211587228, - "loss": 1.6039, + "epoch": 13.784852267998335, + "grad_norm": 0.2734375, + "learning_rate": 0.0014486059092800666, + "loss": 1.2997, "step": 132500 }, { - "epoch": 0.864815657715066, - "grad_norm": 0.640625, - "learning_rate": 0.0019654073736913974, - "loss": 1.6012, + "epoch": 13.836870578443612, + "grad_norm": 0.2412109375, + "learning_rate": 0.0014465251768622557, + "loss": 1.2998, "step": 133000 }, { - "epoch": 0.8680668443982054, - "grad_norm": 0.6328125, - "learning_rate": 0.0019652773262240717, - "loss": 1.5923, + "epoch": 13.88888888888889, + "grad_norm": 0.2421875, + "learning_rate": 0.0014444444444444444, + "loss": 1.3001, "step": 133500 }, { - "epoch": 0.8713180310813446, - "grad_norm": 1.3671875, - "learning_rate": 0.001965147278756746, - "loss": 1.585, + "epoch": 13.940907199334166, + "grad_norm": 0.2265625, + "learning_rate": 0.0014423637120266333, + "loss": 1.2998, "step": 134000 }, { - "epoch": 0.874569217764484, - "grad_norm": 0.69140625, - "learning_rate": 0.0019650172312894207, - "loss": 1.5838, + "epoch": 13.992925509779443, + "grad_norm": 0.455078125, + "learning_rate": 0.0014402829796088224, + "loss": 1.3013, "step": 134500 }, { - "epoch": 0.8778204044476234, - "grad_norm": 0.76953125, - "learning_rate": 0.0019648871838220954, - "loss": 
1.5818, + "epoch": 14.0, + "eval_loss": 1.3362102508544922, + "eval_runtime": 1.3748, + "eval_samples_per_second": 727.372, + "eval_steps_per_second": 0.727, + "step": 134568 + }, + { + "epoch": 14.044943820224718, + "grad_norm": 0.220703125, + "learning_rate": 0.0014382022471910111, + "loss": 1.2953, "step": 135000 }, { - "epoch": 0.8810715911307627, - "grad_norm": 0.8046875, - "learning_rate": 0.0019647571363547696, - "loss": 1.5889, + "epoch": 14.096962130669995, + "grad_norm": 0.2197265625, + "learning_rate": 0.0014361215147732003, + "loss": 1.2956, "step": 135500 }, { - "epoch": 0.8843227778139021, - "grad_norm": 0.69140625, - "learning_rate": 0.001964627088887444, - "loss": 1.587, + "epoch": 14.148980441115272, + "grad_norm": 0.3359375, + "learning_rate": 0.0014340407823553892, + "loss": 1.2953, "step": 136000 }, { - "epoch": 0.8875739644970414, - "grad_norm": 5.25, - "learning_rate": 0.0019644970414201186, - "loss": 1.5835, + "epoch": 14.20099875156055, + "grad_norm": 0.337890625, + "learning_rate": 0.001431960049937578, + "loss": 1.2959, "step": 136500 }, { - "epoch": 0.8908251511801808, - "grad_norm": 0.71875, - "learning_rate": 0.001964366993952793, - "loss": 1.5802, + "epoch": 14.253017062005826, + "grad_norm": 0.2158203125, + "learning_rate": 0.001429879317519767, + "loss": 1.2951, "step": 137000 }, { - "epoch": 0.8940763378633201, - "grad_norm": 0.76171875, - "learning_rate": 0.001964236946485467, - "loss": 1.5836, + "epoch": 14.305035372451103, + "grad_norm": 0.1953125, + "learning_rate": 0.001427798585101956, + "loss": 1.2948, "step": 137500 }, { - "epoch": 0.8973275245464595, - "grad_norm": 0.6484375, - "learning_rate": 0.001964106899018142, - "loss": 1.5861, + "epoch": 14.35705368289638, + "grad_norm": 0.412109375, + "learning_rate": 0.0014257178526841448, + "loss": 1.2962, "step": 138000 }, { - "epoch": 0.9005787112295988, - "grad_norm": 0.828125, - "learning_rate": 0.001963976851550816, - "loss": 1.5777, + "epoch": 14.409071993341657, + "grad_norm": 0.21484375, + "learning_rate": 0.001423637120266334, + "loss": 1.2941, "step": 138500 }, { - "epoch": 0.9038298979127382, - "grad_norm": 0.67578125, - "learning_rate": 0.0019638468040834903, - "loss": 1.5759, + "epoch": 14.461090303786934, + "grad_norm": 0.205078125, + "learning_rate": 0.0014215563878485226, + "loss": 1.2958, "step": 139000 }, { - "epoch": 0.9070810845958774, - "grad_norm": 2.625, - "learning_rate": 0.001963716756616165, - "loss": 1.5863, + "epoch": 14.513108614232209, + "grad_norm": 0.2255859375, + "learning_rate": 0.0014194756554307116, + "loss": 1.2949, "step": 139500 }, { - "epoch": 0.9103322712790168, - "grad_norm": 0.5390625, - "learning_rate": 0.0019635867091488393, - "loss": 1.5756, + "epoch": 14.565126924677486, + "grad_norm": 0.177734375, + "learning_rate": 0.0014173949230129007, + "loss": 1.2933, "step": 140000 }, { - "epoch": 0.9135834579621562, - "grad_norm": 0.7734375, - "learning_rate": 0.0019634566616815135, - "loss": 1.5774, + "epoch": 14.617145235122763, + "grad_norm": 0.291015625, + "learning_rate": 0.0014153141905950894, + "loss": 1.295, "step": 140500 }, { - "epoch": 0.9168346446452955, - "grad_norm": 2.828125, - "learning_rate": 0.0019633266142141882, - "loss": 1.5707, + "epoch": 14.66916354556804, + "grad_norm": 0.33984375, + "learning_rate": 0.0014132334581772783, + "loss": 1.2944, "step": 141000 }, { - "epoch": 0.9200858313284349, - "grad_norm": 0.640625, - "learning_rate": 0.001963196566746863, - "loss": 1.5849, + "epoch": 14.721181856013317, + "grad_norm": 0.19140625, + 
"learning_rate": 0.0014111527257594674, + "loss": 1.2923, "step": 141500 }, { - "epoch": 0.9233370180115742, - "grad_norm": 0.93359375, - "learning_rate": 0.001963066519279537, - "loss": 1.5774, + "epoch": 14.773200166458594, + "grad_norm": 0.271484375, + "learning_rate": 0.0014090719933416563, + "loss": 1.2925, "step": 142000 }, { - "epoch": 0.9265882046947136, - "grad_norm": 0.73828125, - "learning_rate": 0.0019629364718122115, - "loss": 1.5774, + "epoch": 14.82521847690387, + "grad_norm": 0.3828125, + "learning_rate": 0.0014069912609238453, + "loss": 1.2907, "step": 142500 }, { - "epoch": 0.9298393913778529, - "grad_norm": 0.7578125, - "learning_rate": 0.001962806424344886, - "loss": 1.5733, + "epoch": 14.877236787349148, + "grad_norm": 0.21875, + "learning_rate": 0.0014049105285060342, + "loss": 1.2936, "step": 143000 }, { - "epoch": 0.9330905780609923, - "grad_norm": 0.70703125, - "learning_rate": 0.0019626763768775604, - "loss": 1.5819, + "epoch": 14.929255097794425, + "grad_norm": 0.25390625, + "learning_rate": 0.001402829796088223, + "loss": 1.2927, "step": 143500 }, { - "epoch": 0.9363417647441316, - "grad_norm": 0.62890625, - "learning_rate": 0.0019625463294102347, - "loss": 1.5717, + "epoch": 14.9812734082397, + "grad_norm": 0.419921875, + "learning_rate": 0.001400749063670412, + "loss": 1.2911, "step": 144000 }, { - "epoch": 0.939592951427271, - "grad_norm": 0.63671875, - "learning_rate": 0.0019624162819429094, - "loss": 1.5737, + "epoch": 15.0, + "eval_loss": 1.3259565830230713, + "eval_runtime": 1.5089, + "eval_samples_per_second": 662.754, + "eval_steps_per_second": 0.663, + "step": 144180 + }, + { + "epoch": 15.033291718684977, + "grad_norm": 0.2216796875, + "learning_rate": 0.001398668331252601, + "loss": 1.2892, "step": 144500 }, { - "epoch": 0.9428441381104103, - "grad_norm": 1.3359375, - "learning_rate": 0.0019622862344755836, - "loss": 1.577, + "epoch": 15.085310029130254, + "grad_norm": 0.474609375, + "learning_rate": 0.0013965875988347898, + "loss": 1.2898, "step": 145000 }, { - "epoch": 0.9460953247935496, - "grad_norm": 0.96484375, - "learning_rate": 0.001962156187008258, - "loss": 1.5795, + "epoch": 15.13732833957553, + "grad_norm": 0.2119140625, + "learning_rate": 0.001394506866416979, + "loss": 1.2916, "step": 145500 }, { - "epoch": 0.949346511476689, - "grad_norm": 0.7578125, - "learning_rate": 0.0019620261395409326, - "loss": 1.5723, + "epoch": 15.189346650020807, + "grad_norm": 0.2373046875, + "learning_rate": 0.0013924261339991677, + "loss": 1.2906, "step": 146000 }, { - "epoch": 0.9525976981598283, - "grad_norm": 0.89453125, - "learning_rate": 0.001961896092073607, - "loss": 1.5775, + "epoch": 15.241364960466084, + "grad_norm": 0.294921875, + "learning_rate": 0.0013903454015813566, + "loss": 1.2905, "step": 146500 }, { - "epoch": 0.9558488848429677, - "grad_norm": 1.6875, - "learning_rate": 0.001961766044606281, - "loss": 1.5767, + "epoch": 15.293383270911361, + "grad_norm": 0.2734375, + "learning_rate": 0.0013882646691635457, + "loss": 1.2898, "step": 147000 }, { - "epoch": 0.959100071526107, - "grad_norm": 0.9765625, - "learning_rate": 0.001961635997138956, - "loss": 1.5762, + "epoch": 15.345401581356638, + "grad_norm": 0.2060546875, + "learning_rate": 0.0013861839367457346, + "loss": 1.2893, "step": 147500 }, { - "epoch": 0.9623512582092464, - "grad_norm": 0.6875, - "learning_rate": 0.00196150594967163, - "loss": 1.5697, + "epoch": 15.397419891801913, + "grad_norm": 0.2451171875, + "learning_rate": 0.0013841032043279233, + "loss": 1.2905, "step": 148000 
}, { - "epoch": 0.9656024448923857, - "grad_norm": 1.2265625, - "learning_rate": 0.0019613759022043043, - "loss": 1.5608, + "epoch": 15.44943820224719, + "grad_norm": 0.19140625, + "learning_rate": 0.0013820224719101124, + "loss": 1.2899, "step": 148500 }, { - "epoch": 0.9688536315755251, - "grad_norm": 0.7578125, - "learning_rate": 0.001961245854736979, - "loss": 1.5679, + "epoch": 15.501456512692467, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013799417394923014, + "loss": 1.2913, "step": 149000 }, { - "epoch": 0.9721048182586645, - "grad_norm": 0.6328125, - "learning_rate": 0.0019611158072696537, - "loss": 1.5627, + "epoch": 15.553474823137744, + "grad_norm": 0.2109375, + "learning_rate": 0.0013778610070744903, + "loss": 1.29, "step": 149500 }, { - "epoch": 0.9753560049418037, - "grad_norm": 1.046875, - "learning_rate": 0.001960985759802328, - "loss": 1.5678, + "epoch": 15.605493133583021, + "grad_norm": 0.279296875, + "learning_rate": 0.0013757802746566792, + "loss": 1.2906, "step": 150000 }, { - "epoch": 0.9786071916249431, - "grad_norm": 0.69921875, - "learning_rate": 0.0019608557123350022, - "loss": 1.5679, + "epoch": 15.657511444028298, + "grad_norm": 0.224609375, + "learning_rate": 0.001373699542238868, + "loss": 1.289, "step": 150500 }, { - "epoch": 0.9818583783080824, - "grad_norm": 0.69921875, - "learning_rate": 0.001960725664867677, - "loss": 1.5662, + "epoch": 15.709529754473575, + "grad_norm": 0.33984375, + "learning_rate": 0.0013716188098210572, + "loss": 1.2894, "step": 151000 }, { - "epoch": 0.9851095649912218, - "grad_norm": 0.65234375, - "learning_rate": 0.001960595617400351, - "loss": 1.5687, + "epoch": 15.761548064918852, + "grad_norm": 0.271484375, + "learning_rate": 0.001369538077403246, + "loss": 1.2876, "step": 151500 }, { - "epoch": 0.9883607516743611, - "grad_norm": 3.328125, - "learning_rate": 0.0019604655699330255, - "loss": 1.5623, + "epoch": 15.813566375364129, + "grad_norm": 0.201171875, + "learning_rate": 0.0013674573449854348, + "loss": 1.2886, "step": 152000 }, { - "epoch": 0.9916119383575005, - "grad_norm": 1.0, - "learning_rate": 0.0019603355224657, - "loss": 1.5668, + "epoch": 15.865584685809406, + "grad_norm": 0.2158203125, + "learning_rate": 0.001365376612567624, + "loss": 1.2875, "step": 152500 }, { - "epoch": 0.9948631250406398, - "grad_norm": 0.65625, - "learning_rate": 0.0019602054749983744, - "loss": 1.5668, + "epoch": 15.917602996254681, + "grad_norm": 0.1943359375, + "learning_rate": 0.0013632958801498127, + "loss": 1.2874, "step": 153000 }, { - "epoch": 0.9981143117237792, - "grad_norm": 1.03125, - "learning_rate": 0.0019600754275310487, - "loss": 1.5688, + "epoch": 15.969621306699958, + "grad_norm": 0.2138671875, + "learning_rate": 0.0013612151477320016, + "loss": 1.2876, "step": 153500 }, { - "epoch": 1.0, - "eval_loss": 1.5614691972732544, - "eval_runtime": 0.5382, - "eval_samples_per_second": 1857.967, - "eval_steps_per_second": 29.727, - "step": 153790 + "epoch": 16.0, + "eval_loss": 1.3189575672149658, + "eval_runtime": 1.5607, + "eval_samples_per_second": 640.73, + "eval_steps_per_second": 0.641, + "step": 153792 }, { - "epoch": 1.0013654984069185, - "grad_norm": 3.140625, - "learning_rate": 0.0019599453800637234, - "loss": 1.5728, + "epoch": 16.021639617145237, + "grad_norm": 0.20703125, + "learning_rate": 0.0013591344153141907, + "loss": 1.285, "step": 154000 }, { - "epoch": 1.004616685090058, - "grad_norm": 1.1484375, - "learning_rate": 0.0019598153325963976, - "loss": 1.5747, + "epoch": 16.073657927590514, + "grad_norm": 
0.1845703125, + "learning_rate": 0.0013570536828963796, + "loss": 1.2834, "step": 154500 }, { - "epoch": 1.0078678717731973, - "grad_norm": 0.828125, - "learning_rate": 0.001959685285129072, - "loss": 1.572, + "epoch": 16.125676238035787, + "grad_norm": 0.310546875, + "learning_rate": 0.0013549729504785683, + "loss": 1.2828, "step": 155000 }, { - "epoch": 1.0111190584563365, - "grad_norm": 0.7578125, - "learning_rate": 0.0019595552376617466, - "loss": 1.5646, + "epoch": 16.177694548481064, + "grad_norm": 0.291015625, + "learning_rate": 0.0013528922180607575, + "loss": 1.2826, "step": 155500 }, { - "epoch": 1.0143702451394758, - "grad_norm": 1.953125, - "learning_rate": 0.0019594251901944213, - "loss": 1.5695, + "epoch": 16.22971285892634, + "grad_norm": 0.208984375, + "learning_rate": 0.0013508114856429464, + "loss": 1.2825, "step": 156000 }, { - "epoch": 1.0176214318226153, - "grad_norm": 0.9296875, - "learning_rate": 0.0019592951427270956, - "loss": 1.5595, + "epoch": 16.281731169371618, + "grad_norm": 0.2578125, + "learning_rate": 0.001348730753225135, + "loss": 1.2836, "step": 156500 }, { - "epoch": 1.0208726185057546, - "grad_norm": 0.81640625, - "learning_rate": 0.00195916509525977, - "loss": 1.5574, + "epoch": 16.333749479816895, + "grad_norm": 0.291015625, + "learning_rate": 0.0013466500208073242, + "loss": 1.2853, "step": 157000 }, { - "epoch": 1.024123805188894, - "grad_norm": 1.1484375, - "learning_rate": 0.0019590350477924445, - "loss": 1.5624, + "epoch": 16.38576779026217, + "grad_norm": 0.208984375, + "learning_rate": 0.0013445692883895131, + "loss": 1.2859, "step": 157500 }, { - "epoch": 1.0273749918720332, - "grad_norm": 0.6953125, - "learning_rate": 0.0019589050003251188, - "loss": 1.5611, + "epoch": 16.43778610070745, + "grad_norm": 0.267578125, + "learning_rate": 0.0013424885559717022, + "loss": 1.2841, "step": 158000 }, { - "epoch": 1.0306261785551727, - "grad_norm": 2.828125, - "learning_rate": 0.001958774952857793, - "loss": 1.5626, + "epoch": 16.489804411152726, + "grad_norm": 0.259765625, + "learning_rate": 0.001340407823553891, + "loss": 1.2834, "step": 158500 }, { - "epoch": 1.033877365238312, - "grad_norm": 0.859375, - "learning_rate": 0.0019586449053904677, - "loss": 1.5657, + "epoch": 16.541822721598002, + "grad_norm": 0.197265625, + "learning_rate": 0.0013383270911360799, + "loss": 1.2834, "step": 159000 }, { - "epoch": 1.0371285519214513, - "grad_norm": 0.78125, - "learning_rate": 0.001958514857923142, - "loss": 1.5622, + "epoch": 16.59384103204328, + "grad_norm": 0.19140625, + "learning_rate": 0.001336246358718269, + "loss": 1.2834, "step": 159500 }, { - "epoch": 1.0403797386045908, - "grad_norm": 0.671875, - "learning_rate": 0.0019583848104558163, - "loss": 1.5667, + "epoch": 16.645859342488556, + "grad_norm": 0.29296875, + "learning_rate": 0.001334165626300458, + "loss": 1.2856, "step": 160000 }, { - "epoch": 1.04363092528773, - "grad_norm": 7.78125, - "learning_rate": 0.001958254762988491, - "loss": 1.5694, + "epoch": 16.697877652933833, + "grad_norm": 0.50390625, + "learning_rate": 0.0013320848938826466, + "loss": 1.2829, "step": 160500 }, { - "epoch": 1.0468821119708693, - "grad_norm": 1.0546875, - "learning_rate": 0.001958124715521165, - "loss": 1.5726, + "epoch": 16.74989596337911, + "grad_norm": 0.2119140625, + "learning_rate": 0.0013300041614648357, + "loss": 1.283, "step": 161000 }, { - "epoch": 1.0501332986540086, - "grad_norm": 0.96875, - "learning_rate": 0.0019579946680538395, - "loss": 1.5669, + "epoch": 16.801914273824387, + "grad_norm": 
0.208984375, + "learning_rate": 0.0013279234290470246, + "loss": 1.2848, "step": 161500 }, { - "epoch": 1.0533844853371481, - "grad_norm": 0.7265625, - "learning_rate": 0.001957864620586514, - "loss": 1.567, + "epoch": 16.853932584269664, + "grad_norm": 0.2216796875, + "learning_rate": 0.0013258426966292133, + "loss": 1.2837, "step": 162000 }, { - "epoch": 1.0566356720202874, - "grad_norm": 0.51953125, - "learning_rate": 0.0019577345731191884, - "loss": 1.5696, + "epoch": 16.90595089471494, + "grad_norm": 0.2109375, + "learning_rate": 0.0013237619642114025, + "loss": 1.2824, "step": 162500 }, { - "epoch": 1.0598868587034267, - "grad_norm": 0.6796875, - "learning_rate": 0.0019576045256518627, - "loss": 1.5617, + "epoch": 16.957969205160218, + "grad_norm": 0.2236328125, + "learning_rate": 0.0013216812317935914, + "loss": 1.284, "step": 163000 }, { - "epoch": 1.0631380453865662, - "grad_norm": 0.7890625, - "learning_rate": 0.0019574744781845374, - "loss": 1.5626, + "epoch": 17.0, + "eval_loss": 1.3203132152557373, + "eval_runtime": 1.432, + "eval_samples_per_second": 698.327, + "eval_steps_per_second": 0.698, + "step": 163404 + }, + { + "epoch": 17.00998751560549, + "grad_norm": 0.33984375, + "learning_rate": 0.0013196004993757803, + "loss": 1.2829, "step": 163500 }, { - "epoch": 1.0663892320697055, - "grad_norm": 0.62109375, - "learning_rate": 0.001957344430717212, - "loss": 1.5643, + "epoch": 17.06200582605077, + "grad_norm": 0.21875, + "learning_rate": 0.0013175197669579692, + "loss": 1.2788, "step": 164000 }, { - "epoch": 1.0696404187528448, - "grad_norm": 0.8125, - "learning_rate": 0.0019572143832498863, - "loss": 1.5572, + "epoch": 17.114024136496045, + "grad_norm": 0.25390625, + "learning_rate": 0.0013154390345401581, + "loss": 1.2805, "step": 164500 }, { - "epoch": 1.072891605435984, - "grad_norm": 3.140625, - "learning_rate": 0.0019570843357825606, - "loss": 1.5707, + "epoch": 17.166042446941322, + "grad_norm": 0.2275390625, + "learning_rate": 0.0013133583021223473, + "loss": 1.2815, "step": 165000 }, { - "epoch": 1.0761427921191236, - "grad_norm": 0.7421875, - "learning_rate": 0.0019569542883152353, - "loss": 1.5728, + "epoch": 17.2180607573866, + "grad_norm": 0.41015625, + "learning_rate": 0.001311277569704536, + "loss": 1.2817, "step": 165500 }, { - "epoch": 1.0793939788022628, - "grad_norm": 1.0390625, - "learning_rate": 0.0019568242408479096, - "loss": 1.5795, + "epoch": 17.270079067831876, + "grad_norm": 0.345703125, + "learning_rate": 0.0013091968372867249, + "loss": 1.2835, "step": 166000 }, { - "epoch": 1.0826451654854021, - "grad_norm": 0.640625, - "learning_rate": 0.001956694193380584, - "loss": 1.5778, + "epoch": 17.322097378277153, + "grad_norm": 0.2216796875, + "learning_rate": 0.001307116104868914, + "loss": 1.2823, "step": 166500 }, { - "epoch": 1.0858963521685414, - "grad_norm": 0.5625, - "learning_rate": 0.0019565641459132585, - "loss": 1.5702, + "epoch": 17.37411568872243, + "grad_norm": 0.2158203125, + "learning_rate": 0.001305035372451103, + "loss": 1.2831, "step": 167000 }, { - "epoch": 1.089147538851681, - "grad_norm": 0.91015625, - "learning_rate": 0.0019564340984459328, - "loss": 1.5654, + "epoch": 17.426133999167707, + "grad_norm": 0.2216796875, + "learning_rate": 0.0013029546400332916, + "loss": 1.283, "step": 167500 }, { - "epoch": 1.0923987255348202, - "grad_norm": 0.671875, - "learning_rate": 0.001956304050978607, - "loss": 1.5677, + "epoch": 17.478152309612984, + "grad_norm": 0.3984375, + "learning_rate": 0.0013008739076154807, + "loss": 1.2823, "step": 
168000 }, { - "epoch": 1.0956499122179595, - "grad_norm": 0.80078125, - "learning_rate": 0.0019561740035112817, - "loss": 1.5646, + "epoch": 17.53017062005826, + "grad_norm": 0.2333984375, + "learning_rate": 0.0012987931751976696, + "loss": 1.2826, "step": 168500 }, { - "epoch": 1.098901098901099, - "grad_norm": 1.5859375, - "learning_rate": 0.001956043956043956, - "loss": 1.583, + "epoch": 17.582188930503538, + "grad_norm": 0.255859375, + "learning_rate": 0.0012967124427798583, + "loss": 1.2823, "step": 169000 }, { - "epoch": 1.1021522855842383, - "grad_norm": 0.66796875, - "learning_rate": 0.0019559139085766303, - "loss": 1.5675, + "epoch": 17.634207240948815, + "grad_norm": 0.373046875, + "learning_rate": 0.0012946317103620475, + "loss": 1.2851, "step": 169500 }, { - "epoch": 1.1054034722673776, - "grad_norm": 0.7265625, - "learning_rate": 0.001955783861109305, - "loss": 1.5986, + "epoch": 17.68622555139409, + "grad_norm": 0.2138671875, + "learning_rate": 0.0012925509779442364, + "loss": 1.2828, "step": 170000 }, { - "epoch": 1.1086546589505168, - "grad_norm": 0.71484375, - "learning_rate": 0.0019556538136419797, - "loss": 1.6581, + "epoch": 17.73824386183937, + "grad_norm": 0.21875, + "learning_rate": 0.0012904702455264253, + "loss": 1.2842, "step": 170500 }, { - "epoch": 1.1119058456336564, - "grad_norm": 0.671875, - "learning_rate": 0.001955523766174654, - "loss": 1.6557, + "epoch": 17.790262172284645, + "grad_norm": 0.2080078125, + "learning_rate": 0.0012883895131086142, + "loss": 1.2839, "step": 171000 }, { - "epoch": 1.1151570323167956, - "grad_norm": 0.8359375, - "learning_rate": 0.001955393718707328, - "loss": 1.6413, + "epoch": 17.842280482729922, + "grad_norm": 0.25390625, + "learning_rate": 0.0012863087806908031, + "loss": 1.2848, "step": 171500 }, { - "epoch": 1.118408218999935, - "grad_norm": 0.859375, - "learning_rate": 0.001955263671240003, - "loss": 1.6369, + "epoch": 17.8942987931752, + "grad_norm": 1.0, + "learning_rate": 0.0012842280482729923, + "loss": 1.2834, "step": 172000 }, { - "epoch": 1.1216594056830744, - "grad_norm": 0.6328125, - "learning_rate": 0.001955133623772677, - "loss": 1.637, + "epoch": 17.946317103620473, + "grad_norm": 0.2099609375, + "learning_rate": 0.0012821473158551812, + "loss": 1.2839, "step": 172500 }, { - "epoch": 1.1249105923662137, - "grad_norm": 0.82421875, - "learning_rate": 0.0019550035763053514, - "loss": 1.6328, + "epoch": 17.99833541406575, + "grad_norm": 0.283203125, + "learning_rate": 0.0012800665834373699, + "loss": 1.2837, "step": 173000 }, { - "epoch": 1.128161779049353, - "grad_norm": 2.4375, - "learning_rate": 0.001954873528838026, - "loss": 1.6243, + "epoch": 18.0, + "eval_loss": 1.3176885843276978, + "eval_runtime": 1.6332, + "eval_samples_per_second": 612.278, + "eval_steps_per_second": 0.612, + "step": 173016 + }, + { + "epoch": 18.050353724511027, + "grad_norm": 0.7734375, + "learning_rate": 0.001277985851019559, + "loss": 1.2788, "step": 173500 }, { - "epoch": 1.1314129657324923, - "grad_norm": 3.828125, - "learning_rate": 0.0019547434813707004, - "loss": 1.6165, + "epoch": 18.102372034956304, + "grad_norm": 0.224609375, + "learning_rate": 0.001275905118601748, + "loss": 1.28, "step": 174000 }, { - "epoch": 1.1346641524156318, - "grad_norm": 0.83984375, - "learning_rate": 0.0019546134339033746, - "loss": 1.6187, + "epoch": 18.15439034540158, + "grad_norm": 0.251953125, + "learning_rate": 0.0012738243861839366, + "loss": 1.2797, "step": 174500 }, { - "epoch": 1.137915339098771, - "grad_norm": 0.6640625, - 
"learning_rate": 0.0019544833864360493, - "loss": 1.6138, + "epoch": 18.206408655846857, + "grad_norm": 0.2890625, + "learning_rate": 0.0012717436537661257, + "loss": 1.2813, "step": 175000 }, { - "epoch": 1.1411665257819104, - "grad_norm": 0.73046875, - "learning_rate": 0.0019543533389687236, - "loss": 1.6141, + "epoch": 18.258426966292134, + "grad_norm": 0.205078125, + "learning_rate": 0.0012696629213483147, + "loss": 1.2816, "step": 175500 }, { - "epoch": 1.1444177124650499, - "grad_norm": 1.3515625, - "learning_rate": 0.001954223291501398, - "loss": 1.6027, + "epoch": 18.31044527673741, + "grad_norm": 0.21484375, + "learning_rate": 0.0012675821889305036, + "loss": 1.282, "step": 176000 }, { - "epoch": 1.1476688991481891, - "grad_norm": 0.95703125, - "learning_rate": 0.0019540932440340725, - "loss": 1.6105, + "epoch": 18.36246358718269, + "grad_norm": 0.62109375, + "learning_rate": 0.0012655014565126925, + "loss": 1.2821, "step": 176500 }, { - "epoch": 1.1509200858313284, - "grad_norm": 0.61328125, - "learning_rate": 0.001953963196566747, - "loss": 1.6119, + "epoch": 18.414481897627965, + "grad_norm": 0.2353515625, + "learning_rate": 0.0012634207240948814, + "loss": 1.2827, "step": 177000 }, { - "epoch": 1.1541712725144677, - "grad_norm": 0.61328125, - "learning_rate": 0.001953833149099421, - "loss": 1.6046, + "epoch": 18.466500208073242, + "grad_norm": 0.2197265625, + "learning_rate": 0.0012613399916770703, + "loss": 1.2802, "step": 177500 }, { - "epoch": 1.1574224591976072, - "grad_norm": 1.046875, - "learning_rate": 0.0019537031016320957, - "loss": 1.6093, + "epoch": 18.51851851851852, + "grad_norm": 0.181640625, + "learning_rate": 0.0012592592592592592, + "loss": 1.2803, "step": 178000 }, { - "epoch": 1.1606736458807465, - "grad_norm": 0.8046875, - "learning_rate": 0.0019535730541647704, - "loss": 1.6047, + "epoch": 18.570536828963796, + "grad_norm": 0.353515625, + "learning_rate": 0.0012571785268414481, + "loss": 1.2808, "step": 178500 }, { - "epoch": 1.1639248325638858, - "grad_norm": 1.4140625, - "learning_rate": 0.0019534430066974447, - "loss": 1.5999, + "epoch": 18.622555139409073, + "grad_norm": 0.1865234375, + "learning_rate": 0.0012550977944236373, + "loss": 1.2795, "step": 179000 }, { - "epoch": 1.167176019247025, - "grad_norm": 0.8515625, - "learning_rate": 0.001953312959230119, - "loss": 1.5978, + "epoch": 18.67457344985435, + "grad_norm": 0.361328125, + "learning_rate": 0.0012530170620058262, + "loss": 1.2789, "step": 179500 }, { - "epoch": 1.1704272059301646, - "grad_norm": 1.1796875, - "learning_rate": 0.0019531829117627937, - "loss": 1.5949, + "epoch": 18.726591760299627, + "grad_norm": 0.2158203125, + "learning_rate": 0.0012509363295880149, + "loss": 1.2805, "step": 180000 }, { - "epoch": 1.1736783926133039, - "grad_norm": 0.8828125, - "learning_rate": 0.001953052864295468, - "loss": 1.5913, + "epoch": 18.778610070744904, + "grad_norm": 1.640625, + "learning_rate": 0.001248855597170204, + "loss": 1.2808, "step": 180500 }, { - "epoch": 1.1769295792964432, - "grad_norm": 0.69921875, - "learning_rate": 0.0019529228168281424, - "loss": 1.5873, + "epoch": 18.83062838119018, + "grad_norm": 0.25390625, + "learning_rate": 0.001246774864752393, + "loss": 1.2798, "step": 181000 }, { - "epoch": 1.1801807659795824, - "grad_norm": 0.609375, - "learning_rate": 0.0019527927693608167, - "loss": 1.5831, + "epoch": 18.882646691635454, + "grad_norm": 0.2490234375, + "learning_rate": 0.0012446941323345816, + "loss": 1.2794, "step": 181500 }, { - "epoch": 1.183431952662722, - 
"grad_norm": 0.71484375, - "learning_rate": 0.0019526627218934911, - "loss": 1.575, + "epoch": 18.93466500208073, + "grad_norm": 0.19140625, + "learning_rate": 0.0012426133999167708, + "loss": 1.2801, "step": 182000 }, { - "epoch": 1.1866831393458612, - "grad_norm": 0.890625, - "learning_rate": 0.0019525326744261656, - "loss": 1.583, + "epoch": 18.986683312526008, + "grad_norm": 0.181640625, + "learning_rate": 0.0012405326674989597, + "loss": 1.2823, "step": 182500 }, { - "epoch": 1.1899343260290005, - "grad_norm": 0.95703125, - "learning_rate": 0.0019524026269588399, - "loss": 1.5749, + "epoch": 19.0, + "eval_loss": 1.3176276683807373, + "eval_runtime": 1.3968, + "eval_samples_per_second": 715.946, + "eval_steps_per_second": 0.716, + "step": 182628 + }, + { + "epoch": 19.038701622971285, + "grad_norm": 0.203125, + "learning_rate": 0.0012384519350811486, + "loss": 1.2802, "step": 183000 }, { - "epoch": 1.19318551271214, - "grad_norm": 0.640625, - "learning_rate": 0.0019522725794915144, - "loss": 1.5779, + "epoch": 19.090719933416562, + "grad_norm": 0.326171875, + "learning_rate": 0.0012363712026633375, + "loss": 1.2782, "step": 183500 }, { - "epoch": 1.1964366993952793, - "grad_norm": 3.5625, - "learning_rate": 0.0019521425320241888, - "loss": 1.5732, + "epoch": 19.14273824386184, + "grad_norm": 0.2216796875, + "learning_rate": 0.0012342904702455264, + "loss": 1.2769, "step": 184000 }, { - "epoch": 1.1996878860784186, - "grad_norm": 0.578125, - "learning_rate": 0.001952012484556863, - "loss": 1.5771, + "epoch": 19.194756554307116, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012322097378277153, + "loss": 1.2789, "step": 184500 }, { - "epoch": 1.2029390727615579, - "grad_norm": 0.90625, - "learning_rate": 0.0019518824370895378, - "loss": 1.58, + "epoch": 19.246774864752393, + "grad_norm": 0.24609375, + "learning_rate": 0.0012301290054099045, + "loss": 1.279, "step": 185000 }, { - "epoch": 1.2061902594446974, - "grad_norm": 0.68359375, - "learning_rate": 0.0019517523896222123, - "loss": 1.5675, + "epoch": 19.29879317519767, + "grad_norm": 0.2080078125, + "learning_rate": 0.0012280482729920932, + "loss": 1.2813, "step": 185500 }, { - "epoch": 1.2094414461278367, - "grad_norm": 4.25, - "learning_rate": 0.0019516223421548868, - "loss": 1.563, + "epoch": 19.350811485642947, + "grad_norm": 0.1923828125, + "learning_rate": 0.0012259675405742823, + "loss": 1.2811, "step": 186000 }, { - "epoch": 1.212692632810976, - "grad_norm": 1.1015625, - "learning_rate": 0.001951492294687561, - "loss": 1.5731, + "epoch": 19.402829796088223, + "grad_norm": 0.2060546875, + "learning_rate": 0.0012238868081564712, + "loss": 1.2826, "step": 186500 }, { - "epoch": 1.2159438194941155, - "grad_norm": 0.70703125, - "learning_rate": 0.0019513622472202355, - "loss": 1.569, + "epoch": 19.4548481065335, + "grad_norm": 0.27734375, + "learning_rate": 0.00122180607573866, + "loss": 1.2811, "step": 187000 }, { - "epoch": 1.2191950061772547, - "grad_norm": 0.984375, - "learning_rate": 0.00195123219975291, - "loss": 1.5707, + "epoch": 19.506866416978777, + "grad_norm": 0.2236328125, + "learning_rate": 0.001219725343320849, + "loss": 1.2814, "step": 187500 }, { - "epoch": 1.222446192860394, - "grad_norm": 0.92578125, - "learning_rate": 0.0019511021522855842, - "loss": 1.5635, + "epoch": 19.558884727424054, + "grad_norm": 0.41015625, + "learning_rate": 0.001217644610903038, + "loss": 1.2794, "step": 188000 }, { - "epoch": 1.2256973795435333, - "grad_norm": 0.76953125, - "learning_rate": 0.0019509721048182587, - "loss": 1.5603, 
+ "epoch": 19.61090303786933, + "grad_norm": 0.2041015625, + "learning_rate": 0.0012155638784852269, + "loss": 1.2801, "step": 188500 }, { - "epoch": 1.2289485662266728, - "grad_norm": 0.80859375, - "learning_rate": 0.0019508420573509332, - "loss": 1.5645, + "epoch": 19.662921348314608, + "grad_norm": 0.435546875, + "learning_rate": 0.0012134831460674158, + "loss": 1.2796, "step": 189000 }, { - "epoch": 1.232199752909812, - "grad_norm": 0.640625, - "learning_rate": 0.0019507120098836074, - "loss": 1.5594, + "epoch": 19.714939658759885, + "grad_norm": 0.1962890625, + "learning_rate": 0.0012114024136496047, + "loss": 1.2785, "step": 189500 }, { - "epoch": 1.2354509395929514, - "grad_norm": 1.015625, - "learning_rate": 0.001950581962416282, - "loss": 1.5594, + "epoch": 19.76695796920516, + "grad_norm": 5.6875, + "learning_rate": 0.0012093216812317936, + "loss": 1.2783, "step": 190000 }, { - "epoch": 1.2387021262760909, - "grad_norm": 0.94140625, - "learning_rate": 0.0019504519149489564, - "loss": 1.5585, + "epoch": 19.818976279650435, + "grad_norm": 0.205078125, + "learning_rate": 0.0012072409488139825, + "loss": 1.2782, "step": 190500 }, { - "epoch": 1.2419533129592302, - "grad_norm": 0.83984375, - "learning_rate": 0.0019503218674816307, - "loss": 1.5542, + "epoch": 19.870994590095712, + "grad_norm": 3.21875, + "learning_rate": 0.0012051602163961714, + "loss": 1.2792, "step": 191000 }, { - "epoch": 1.2452044996423695, - "grad_norm": 2.671875, - "learning_rate": 0.0019501918200143051, - "loss": 1.554, + "epoch": 19.92301290054099, + "grad_norm": 0.2119140625, + "learning_rate": 0.0012030794839783603, + "loss": 1.2803, "step": 191500 }, { - "epoch": 1.2484556863255087, - "grad_norm": 1.890625, - "learning_rate": 0.0019500617725469796, - "loss": 1.5572, + "epoch": 19.975031210986266, + "grad_norm": 0.419921875, + "learning_rate": 0.0012009987515605495, + "loss": 1.2778, "step": 192000 }, { - "epoch": 1.251706873008648, - "grad_norm": 0.78515625, - "learning_rate": 0.0019499317250796543, - "loss": 1.5587, + "epoch": 20.0, + "eval_loss": 1.3075087070465088, + "eval_runtime": 1.6065, + "eval_samples_per_second": 622.457, + "eval_steps_per_second": 0.622, + "step": 192240 + }, + { + "epoch": 20.027049521431543, + "grad_norm": 0.296875, + "learning_rate": 0.0011989180191427382, + "loss": 1.2763, "step": 192500 }, { - "epoch": 1.2549580596917875, - "grad_norm": 8.4375, - "learning_rate": 0.0019498016776123286, - "loss": 1.5515, + "epoch": 20.07906783187682, + "grad_norm": 0.26171875, + "learning_rate": 0.0011968372867249273, + "loss": 1.2755, "step": 193000 }, { - "epoch": 1.2582092463749268, - "grad_norm": 0.60546875, - "learning_rate": 0.001949671630145003, - "loss": 1.5584, + "epoch": 20.131086142322097, + "grad_norm": 0.220703125, + "learning_rate": 0.0011947565543071162, + "loss": 1.2754, "step": 193500 }, { - "epoch": 1.2614604330580663, - "grad_norm": 0.62890625, - "learning_rate": 0.0019495415826776775, - "loss": 1.5514, + "epoch": 20.183104452767374, + "grad_norm": 0.1875, + "learning_rate": 0.001192675821889305, + "loss": 1.2756, "step": 194000 }, { - "epoch": 1.2647116197412056, - "grad_norm": 0.7890625, - "learning_rate": 0.0019494115352103518, - "loss": 1.5492, + "epoch": 20.23512276321265, + "grad_norm": 0.2734375, + "learning_rate": 0.001190595089471494, + "loss": 1.278, "step": 194500 }, { - "epoch": 1.267962806424345, - "grad_norm": 0.8125, - "learning_rate": 0.0019492814877430263, - "loss": 1.5522, + "epoch": 20.287141073657928, + "grad_norm": 0.1767578125, + "learning_rate": 
0.001188514357053683, + "loss": 1.2766, "step": 195000 }, { - "epoch": 1.2712139931074842, - "grad_norm": 0.65625, - "learning_rate": 0.0019491514402757008, - "loss": 1.5548, + "epoch": 20.339159384103205, + "grad_norm": 0.466796875, + "learning_rate": 0.0011864336246358719, + "loss": 1.2769, "step": 195500 }, { - "epoch": 1.2744651797906235, - "grad_norm": 0.703125, - "learning_rate": 0.001949021392808375, - "loss": 1.55, + "epoch": 20.39117769454848, + "grad_norm": 0.189453125, + "learning_rate": 0.0011843528922180608, + "loss": 1.2788, "step": 196000 }, { - "epoch": 1.277716366473763, - "grad_norm": 1.21875, - "learning_rate": 0.0019488913453410495, - "loss": 1.5467, + "epoch": 20.44319600499376, + "grad_norm": 0.20703125, + "learning_rate": 0.0011822721598002497, + "loss": 1.279, "step": 196500 }, { - "epoch": 1.2809675531569022, - "grad_norm": 0.640625, - "learning_rate": 0.001948761297873724, - "loss": 1.5492, + "epoch": 20.495214315439036, + "grad_norm": 0.17578125, + "learning_rate": 0.0011801914273824386, + "loss": 1.2782, "step": 197000 }, { - "epoch": 1.2842187398400415, - "grad_norm": 0.65625, - "learning_rate": 0.0019486312504063982, - "loss": 1.5494, + "epoch": 20.547232625884313, + "grad_norm": 0.25390625, + "learning_rate": 0.0011781106949646277, + "loss": 1.2773, "step": 197500 }, { - "epoch": 1.287469926523181, - "grad_norm": 0.66796875, - "learning_rate": 0.0019485012029390727, - "loss": 1.5533, + "epoch": 20.59925093632959, + "grad_norm": 0.22265625, + "learning_rate": 0.0011760299625468164, + "loss": 1.279, "step": 198000 }, { - "epoch": 1.2907211132063203, - "grad_norm": 1.09375, - "learning_rate": 0.0019483711554717472, - "loss": 1.5472, + "epoch": 20.651269246774866, + "grad_norm": 0.265625, + "learning_rate": 0.0011739492301290053, + "loss": 1.2786, "step": 198500 }, { - "epoch": 1.2939722998894596, - "grad_norm": 0.74609375, - "learning_rate": 0.0019482411080044215, - "loss": 1.5491, + "epoch": 20.70328755722014, + "grad_norm": 0.296875, + "learning_rate": 0.0011718684977111945, + "loss": 1.2788, "step": 199000 }, { - "epoch": 1.297223486572599, - "grad_norm": 1.109375, - "learning_rate": 0.0019481110605370962, - "loss": 1.5507, + "epoch": 20.755305867665417, + "grad_norm": 0.8046875, + "learning_rate": 0.0011697877652933832, + "loss": 1.2781, "step": 199500 }, { - "epoch": 1.3004746732557384, - "grad_norm": 0.92578125, - "learning_rate": 0.0019479810130697706, - "loss": 1.5485, + "epoch": 20.807324178110694, + "grad_norm": 0.2314453125, + "learning_rate": 0.001167707032875572, + "loss": 1.2789, "step": 200000 }, { - "epoch": 1.3037258599388777, - "grad_norm": 0.78125, - "learning_rate": 0.0019478509656024451, - "loss": 1.5496, + "epoch": 20.85934248855597, + "grad_norm": 0.23046875, + "learning_rate": 0.0011656263004577612, + "loss": 1.2809, "step": 200500 }, { - "epoch": 1.306977046622017, - "grad_norm": 0.61328125, - "learning_rate": 0.0019477209181351194, - "loss": 1.5401, + "epoch": 20.911360799001248, + "grad_norm": 0.21875, + "learning_rate": 0.0011635455680399501, + "loss": 1.2778, "step": 201000 }, { - "epoch": 1.3102282333051565, - "grad_norm": 0.609375, - "learning_rate": 0.0019475908706677938, - "loss": 1.5444, + "epoch": 20.963379109446524, + "grad_norm": 0.265625, + "learning_rate": 0.001161464835622139, + "loss": 1.2777, "step": 201500 }, { - "epoch": 1.3134794199882958, - "grad_norm": 0.8203125, - "learning_rate": 0.0019474608232004683, - "loss": 1.5402, + "epoch": 21.0, + "eval_loss": 1.3104900121688843, + "eval_runtime": 1.3896, + 
"eval_samples_per_second": 719.624, + "eval_steps_per_second": 0.72, + "step": 201852 + }, + { + "epoch": 21.0153974198918, + "grad_norm": 0.2265625, + "learning_rate": 0.001159384103204328, + "loss": 1.2777, "step": 202000 }, { - "epoch": 1.316730606671435, - "grad_norm": 1.484375, - "learning_rate": 0.0019473307757331426, - "loss": 1.5417, + "epoch": 21.06741573033708, + "grad_norm": 0.173828125, + "learning_rate": 0.0011573033707865169, + "loss": 1.2763, "step": 202500 }, { - "epoch": 1.3199817933545743, - "grad_norm": 0.93359375, - "learning_rate": 0.001947200728265817, - "loss": 1.5397, + "epoch": 21.119434040782355, + "grad_norm": 0.2353515625, + "learning_rate": 0.0011552226383687058, + "loss": 1.2753, "step": 203000 }, { - "epoch": 1.3232329800377138, - "grad_norm": 0.70703125, - "learning_rate": 0.0019470706807984915, - "loss": 1.5408, + "epoch": 21.171452351227632, + "grad_norm": 0.1875, + "learning_rate": 0.0011531419059508947, + "loss": 1.2752, "step": 203500 }, { - "epoch": 1.3264841667208531, - "grad_norm": 0.53515625, - "learning_rate": 0.0019469406333311658, - "loss": 1.54, + "epoch": 21.22347066167291, + "grad_norm": 0.251953125, + "learning_rate": 0.0011510611735330836, + "loss": 1.2754, "step": 204000 }, { - "epoch": 1.3297353534039924, - "grad_norm": 0.98046875, - "learning_rate": 0.0019468105858638403, - "loss": 1.5395, + "epoch": 21.275488972118186, + "grad_norm": 0.251953125, + "learning_rate": 0.0011489804411152727, + "loss": 1.2756, "step": 204500 }, { - "epoch": 1.332986540087132, - "grad_norm": 0.9453125, - "learning_rate": 0.0019466805383965148, - "loss": 1.5428, + "epoch": 21.327507282563463, + "grad_norm": 0.2412109375, + "learning_rate": 0.0011468997086974614, + "loss": 1.2754, "step": 205000 }, { - "epoch": 1.3362377267702712, - "grad_norm": 0.91796875, - "learning_rate": 0.001946550490929189, - "loss": 1.533, + "epoch": 21.37952559300874, + "grad_norm": 0.208984375, + "learning_rate": 0.0011448189762796504, + "loss": 1.2753, "step": 205500 }, { - "epoch": 1.3394889134534105, - "grad_norm": 0.7578125, - "learning_rate": 0.0019464204434618635, - "loss": 1.5305, + "epoch": 21.431543903454017, + "grad_norm": 0.361328125, + "learning_rate": 0.0011427382438618395, + "loss": 1.2757, "step": 206000 }, { - "epoch": 1.3427401001365498, - "grad_norm": 0.73046875, - "learning_rate": 0.001946290395994538, - "loss": 1.5351, + "epoch": 21.483562213899294, + "grad_norm": 0.19140625, + "learning_rate": 0.0011406575114440284, + "loss": 1.2755, "step": 206500 }, { - "epoch": 1.345991286819689, - "grad_norm": 0.65625, - "learning_rate": 0.0019461603485272127, - "loss": 1.5327, + "epoch": 21.53558052434457, + "grad_norm": 0.248046875, + "learning_rate": 0.001138576779026217, + "loss": 1.2764, "step": 207000 }, { - "epoch": 1.3492424735028286, - "grad_norm": 0.92578125, - "learning_rate": 0.001946030301059887, - "loss": 1.5305, + "epoch": 21.587598834789844, + "grad_norm": 0.1953125, + "learning_rate": 0.0011364960466084062, + "loss": 1.2765, "step": 207500 }, { - "epoch": 1.3524936601859678, - "grad_norm": 0.81640625, - "learning_rate": 0.0019459002535925614, - "loss": 1.5374, + "epoch": 21.63961714523512, + "grad_norm": 0.48828125, + "learning_rate": 0.0011344153141905951, + "loss": 1.2756, "step": 208000 }, { - "epoch": 1.3557448468691073, - "grad_norm": 0.58984375, - "learning_rate": 0.001945770206125236, - "loss": 1.5349, + "epoch": 21.691635455680398, + "grad_norm": 0.1796875, + "learning_rate": 0.001132334581772784, + "loss": 1.2772, "step": 208500 }, { - "epoch": 
1.3589960335522466, - "grad_norm": 0.77734375, - "learning_rate": 0.0019456401586579102, - "loss": 1.5313, + "epoch": 21.743653766125675, + "grad_norm": 0.30859375, + "learning_rate": 0.001130253849354973, + "loss": 1.2757, "step": 209000 }, { - "epoch": 1.362247220235386, - "grad_norm": 0.67578125, - "learning_rate": 0.0019455101111905846, - "loss": 1.5399, + "epoch": 21.795672076570952, + "grad_norm": 2.46875, + "learning_rate": 0.0011281731169371619, + "loss": 1.2767, "step": 209500 }, { - "epoch": 1.3654984069185252, - "grad_norm": 0.671875, - "learning_rate": 0.0019453800637232591, - "loss": 1.5343, + "epoch": 21.84769038701623, + "grad_norm": 0.2431640625, + "learning_rate": 0.001126092384519351, + "loss": 1.2741, "step": 210000 }, { - "epoch": 1.3687495936016645, - "grad_norm": 1.2421875, - "learning_rate": 0.0019452500162559334, - "loss": 1.5308, + "epoch": 21.899708697461506, + "grad_norm": 0.94921875, + "learning_rate": 0.0011240116521015397, + "loss": 1.277, "step": 210500 }, { - "epoch": 1.372000780284804, - "grad_norm": 0.65625, - "learning_rate": 0.0019451199687886079, - "loss": 1.5316, + "epoch": 21.951727007906783, + "grad_norm": 0.1923828125, + "learning_rate": 0.0011219309196837286, + "loss": 1.2761, "step": 211000 }, { - "epoch": 1.3752519669679433, - "grad_norm": 0.7265625, - "learning_rate": 0.0019449899213212823, - "loss": 1.5338, + "epoch": 22.0, + "eval_loss": 1.309814453125, + "eval_runtime": 1.7271, + "eval_samples_per_second": 578.995, + "eval_steps_per_second": 0.579, + "step": 211464 + }, + { + "epoch": 22.00374531835206, + "grad_norm": 0.212890625, + "learning_rate": 0.0011198501872659178, + "loss": 1.2756, "step": 211500 }, { - "epoch": 1.3785031536510828, - "grad_norm": 0.6953125, - "learning_rate": 0.0019448598738539566, - "loss": 1.5312, + "epoch": 22.055763628797337, + "grad_norm": 0.259765625, + "learning_rate": 0.0011177694548481065, + "loss": 1.2751, "step": 212000 }, { - "epoch": 1.381754340334222, - "grad_norm": 0.75, - "learning_rate": 0.001944729826386631, - "loss": 1.5325, + "epoch": 22.107781939242614, + "grad_norm": 0.181640625, + "learning_rate": 0.0011156887224302954, + "loss": 1.2746, "step": 212500 }, { - "epoch": 1.3850055270173613, - "grad_norm": 0.80078125, - "learning_rate": 0.0019445997789193056, - "loss": 1.5329, + "epoch": 22.15980024968789, + "grad_norm": 0.279296875, + "learning_rate": 0.0011136079900124845, + "loss": 1.2754, "step": 213000 }, { - "epoch": 1.3882567137005006, - "grad_norm": 0.64453125, - "learning_rate": 0.0019444697314519798, - "loss": 1.5323, + "epoch": 22.211818560133167, + "grad_norm": 0.259765625, + "learning_rate": 0.0011115272575946734, + "loss": 1.2767, "step": 213500 }, { - "epoch": 1.39150790038364, - "grad_norm": 0.87890625, - "learning_rate": 0.0019443396839846543, - "loss": 1.5399, + "epoch": 22.263836870578444, + "grad_norm": 0.2392578125, + "learning_rate": 0.0011094465251768621, + "loss": 1.2737, "step": 214000 }, { - "epoch": 1.3947590870667794, - "grad_norm": 0.91015625, - "learning_rate": 0.001944209636517329, - "loss": 1.5351, + "epoch": 22.31585518102372, + "grad_norm": 0.2734375, + "learning_rate": 0.0011073657927590512, + "loss": 1.2737, "step": 214500 }, { - "epoch": 1.3980102737499187, - "grad_norm": 0.734375, - "learning_rate": 0.0019440795890500035, - "loss": 1.5353, + "epoch": 22.367873491469, + "grad_norm": 0.33984375, + "learning_rate": 0.0011052850603412402, + "loss": 1.2735, "step": 215000 }, { - "epoch": 1.401261460433058, - "grad_norm": 0.578125, - "learning_rate": 
0.0019439495415826777, - "loss": 1.5367, + "epoch": 22.419891801914275, + "grad_norm": 0.251953125, + "learning_rate": 0.001103204327923429, + "loss": 1.2733, "step": 215500 }, { - "epoch": 1.4045126471161975, - "grad_norm": 0.74609375, - "learning_rate": 0.0019438194941153522, - "loss": 1.5354, + "epoch": 22.471910112359552, + "grad_norm": 0.373046875, + "learning_rate": 0.001101123595505618, + "loss": 1.2723, "step": 216000 }, { - "epoch": 1.4077638337993368, - "grad_norm": 0.7890625, - "learning_rate": 0.0019436894466480267, - "loss": 1.5261, + "epoch": 22.52392842280483, + "grad_norm": 0.2216796875, + "learning_rate": 0.001099042863087807, + "loss": 1.274, "step": 216500 }, { - "epoch": 1.411015020482476, - "grad_norm": 0.9140625, - "learning_rate": 0.001943559399180701, - "loss": 1.5305, + "epoch": 22.575946733250102, + "grad_norm": 0.267578125, + "learning_rate": 0.001096962130669996, + "loss": 1.2744, "step": 217000 }, { - "epoch": 1.4142662071656154, - "grad_norm": 0.59375, - "learning_rate": 0.0019434293517133754, - "loss": 1.5362, + "epoch": 22.62796504369538, + "grad_norm": 0.1953125, + "learning_rate": 0.0010948813982521847, + "loss": 1.2758, "step": 217500 }, { - "epoch": 1.4175173938487549, - "grad_norm": 0.98828125, - "learning_rate": 0.00194329930424605, - "loss": 1.5272, + "epoch": 22.679983354140656, + "grad_norm": 0.28125, + "learning_rate": 0.0010928006658343736, + "loss": 1.2755, "step": 218000 }, { - "epoch": 1.4207685805318941, - "grad_norm": 0.6328125, - "learning_rate": 0.0019431692567787242, - "loss": 1.5304, + "epoch": 22.732001664585933, + "grad_norm": 0.23828125, + "learning_rate": 0.0010907199334165628, + "loss": 1.2742, "step": 218500 }, { - "epoch": 1.4240197672150334, - "grad_norm": 0.640625, - "learning_rate": 0.0019430392093113986, - "loss": 1.5318, + "epoch": 22.78401997503121, + "grad_norm": 0.2109375, + "learning_rate": 0.0010886392009987517, + "loss": 1.2741, "step": 219000 }, { - "epoch": 1.427270953898173, - "grad_norm": 0.80078125, - "learning_rate": 0.0019429091618440731, - "loss": 1.5312, + "epoch": 22.836038285476487, + "grad_norm": 0.251953125, + "learning_rate": 0.0010865584685809404, + "loss": 1.2739, "step": 219500 }, { - "epoch": 1.4305221405813122, - "grad_norm": 0.91015625, - "learning_rate": 0.0019427791143767474, - "loss": 1.5338, + "epoch": 22.888056595921764, + "grad_norm": 0.166015625, + "learning_rate": 0.0010844777361631295, + "loss": 1.2747, "step": 220000 }, { - "epoch": 1.4337733272644515, - "grad_norm": 0.609375, - "learning_rate": 0.0019426490669094219, - "loss": 1.5308, + "epoch": 22.94007490636704, + "grad_norm": 0.216796875, + "learning_rate": 0.0010823970037453184, + "loss": 1.2733, "step": 220500 }, { - "epoch": 1.4370245139475908, - "grad_norm": 0.76953125, - "learning_rate": 0.0019425190194420963, - "loss": 1.5243, + "epoch": 22.992093216812318, + "grad_norm": 0.197265625, + "learning_rate": 0.0010803162713275071, + "loss": 1.2734, "step": 221000 }, { - "epoch": 1.4402757006307303, - "grad_norm": 0.84765625, - "learning_rate": 0.001942388971974771, - "loss": 1.528, + "epoch": 23.0, + "eval_loss": 1.3059455156326294, + "eval_runtime": 1.6804, + "eval_samples_per_second": 595.106, + "eval_steps_per_second": 0.595, + "step": 221076 + }, + { + "epoch": 23.044111527257595, + "grad_norm": 0.2060546875, + "learning_rate": 0.0010782355389096963, + "loss": 1.2717, "step": 221500 }, { - "epoch": 1.4435268873138696, - "grad_norm": 0.625, - "learning_rate": 0.0019422589245074453, - "loss": 1.5385, + "epoch": 23.096129837702872, + 
"grad_norm": 0.2021484375, + "learning_rate": 0.0010761548064918852, + "loss": 1.2719, "step": 222000 }, { - "epoch": 1.4467780739970089, - "grad_norm": 0.73046875, - "learning_rate": 0.0019421288770401198, - "loss": 1.5363, + "epoch": 23.14814814814815, + "grad_norm": 0.1796875, + "learning_rate": 0.0010740740740740743, + "loss": 1.2732, "step": 222500 }, { - "epoch": 1.4500292606801484, - "grad_norm": 6.3125, - "learning_rate": 0.0019419988295727943, - "loss": 1.528, + "epoch": 23.200166458593426, + "grad_norm": 0.318359375, + "learning_rate": 0.001071993341656263, + "loss": 1.2743, "step": 223000 }, { - "epoch": 1.4532804473632877, - "grad_norm": 0.88671875, - "learning_rate": 0.0019418687821054685, - "loss": 1.5238, + "epoch": 23.252184769038703, + "grad_norm": 0.87890625, + "learning_rate": 0.001069912609238452, + "loss": 1.2724, "step": 223500 }, { - "epoch": 1.456531634046427, - "grad_norm": 1.953125, - "learning_rate": 0.001941738734638143, - "loss": 1.5261, + "epoch": 23.30420307948398, + "grad_norm": 0.423828125, + "learning_rate": 0.001067831876820641, + "loss": 1.271, "step": 224000 }, { - "epoch": 1.4597828207295662, - "grad_norm": 0.68359375, - "learning_rate": 0.0019416086871708175, - "loss": 1.5223, + "epoch": 23.356221389929257, + "grad_norm": 0.25390625, + "learning_rate": 0.0010657511444028297, + "loss": 1.2721, "step": 224500 }, { - "epoch": 1.4630340074127055, - "grad_norm": 1.796875, - "learning_rate": 0.0019414786397034917, - "loss": 1.5345, + "epoch": 23.408239700374533, + "grad_norm": 0.1962890625, + "learning_rate": 0.0010636704119850186, + "loss": 1.2716, "step": 225000 }, { - "epoch": 1.466285194095845, - "grad_norm": 0.640625, - "learning_rate": 0.0019413485922361662, - "loss": 1.5254, + "epoch": 23.460258010819807, + "grad_norm": 0.294921875, + "learning_rate": 0.0010615896795672078, + "loss": 1.2714, "step": 225500 }, { - "epoch": 1.4695363807789843, - "grad_norm": 0.59765625, - "learning_rate": 0.0019412185447688407, - "loss": 1.5274, + "epoch": 23.512276321265084, + "grad_norm": 0.29296875, + "learning_rate": 0.0010595089471493967, + "loss": 1.2719, "step": 226000 }, { - "epoch": 1.4727875674621238, - "grad_norm": 0.734375, - "learning_rate": 0.001941088497301515, - "loss": 1.5266, + "epoch": 23.56429463171036, + "grad_norm": 0.189453125, + "learning_rate": 0.0010574282147315854, + "loss": 1.2716, "step": 226500 }, { - "epoch": 1.476038754145263, - "grad_norm": 0.765625, - "learning_rate": 0.0019409584498341894, - "loss": 1.5234, + "epoch": 23.616312942155638, + "grad_norm": 0.2138671875, + "learning_rate": 0.0010553474823137745, + "loss": 1.272, "step": 227000 }, { - "epoch": 1.4792899408284024, - "grad_norm": 0.8125, - "learning_rate": 0.001940828402366864, - "loss": 1.5274, + "epoch": 23.668331252600915, + "grad_norm": 0.890625, + "learning_rate": 0.0010532667498959634, + "loss": 1.2729, "step": 227500 }, { - "epoch": 1.4825411275115417, - "grad_norm": 0.69140625, - "learning_rate": 0.0019406983548995382, - "loss": 1.5308, + "epoch": 23.72034956304619, + "grad_norm": 0.2470703125, + "learning_rate": 0.0010511860174781521, + "loss": 1.2728, "step": 228000 }, { - "epoch": 1.485792314194681, - "grad_norm": 1.0859375, - "learning_rate": 0.0019405683074322127, - "loss": 1.5266, + "epoch": 23.77236787349147, + "grad_norm": 0.251953125, + "learning_rate": 0.0010491052850603413, + "loss": 1.2733, "step": 228500 }, { - "epoch": 1.4890435008778204, - "grad_norm": 0.62890625, - "learning_rate": 0.0019404382599648873, - "loss": 1.5297, + "epoch": 23.824386183936745, 
+ "grad_norm": 0.1806640625, + "learning_rate": 0.0010470245526425302, + "loss": 1.2713, "step": 229000 }, { - "epoch": 1.4922946875609597, - "grad_norm": 1.078125, - "learning_rate": 0.0019403082124975618, - "loss": 1.5334, + "epoch": 23.876404494382022, + "grad_norm": 0.40234375, + "learning_rate": 0.0010449438202247193, + "loss": 1.2714, "step": 229500 }, { - "epoch": 1.495545874244099, - "grad_norm": 0.58984375, - "learning_rate": 0.001940178165030236, - "loss": 1.5235, + "epoch": 23.9284228048273, + "grad_norm": 0.30078125, + "learning_rate": 0.001042863087806908, + "loss": 1.2724, "step": 230000 }, { - "epoch": 1.4987970609272385, - "grad_norm": 0.8515625, - "learning_rate": 0.0019400481175629106, - "loss": 1.52, + "epoch": 23.980441115272576, + "grad_norm": 0.2197265625, + "learning_rate": 0.001040782355389097, + "loss": 1.2716, "step": 230500 }, { - "epoch": 1.5020482476103778, - "grad_norm": 0.78515625, - "learning_rate": 0.001939918070095585, - "loss": 1.5206, - "step": 231000 - }, - { - "epoch": 1.505299434293517, - "grad_norm": 1.3515625, - "learning_rate": 0.0019397880226282593, - "loss": 1.5168, - "step": 231500 - }, - { - "epoch": 1.5085506209766564, - "grad_norm": 2.796875, - "learning_rate": 0.0019396579751609338, - "loss": 1.5241, - "step": 232000 - }, - { - "epoch": 1.5118018076597957, - "grad_norm": 0.859375, - "learning_rate": 0.0019395279276936083, - "loss": 1.5297, - "step": 232500 - }, - { - "epoch": 1.5150529943429352, - "grad_norm": 0.71875, - "learning_rate": 0.0019393978802262825, - "loss": 1.526, - "step": 233000 - }, - { - "epoch": 1.5183041810260747, - "grad_norm": 0.90234375, - "learning_rate": 0.001939267832758957, - "loss": 1.5229, - "step": 233500 - }, - { - "epoch": 1.521555367709214, - "grad_norm": 1.609375, - "learning_rate": 0.0019391377852916315, - "loss": 1.5213, - "step": 234000 - }, - { - "epoch": 1.5248065543923532, - "grad_norm": 0.81640625, - "learning_rate": 0.0019390077378243057, - "loss": 1.5181, - "step": 234500 - }, - { - "epoch": 1.5280577410754925, - "grad_norm": 0.88671875, - "learning_rate": 0.0019388776903569802, - "loss": 1.5245, - "step": 235000 - }, - { - "epoch": 1.5313089277586318, - "grad_norm": 0.7265625, - "learning_rate": 0.0019387476428896547, - "loss": 1.5236, - "step": 235500 - }, - { - "epoch": 1.534560114441771, - "grad_norm": 0.77734375, - "learning_rate": 0.0019386175954223294, - "loss": 1.5121, - "step": 236000 - }, - { - "epoch": 1.5378113011249106, - "grad_norm": 2.265625, - "learning_rate": 0.0019384875479550037, - "loss": 1.5178, - "step": 236500 - }, - { - "epoch": 1.5410624878080499, - "grad_norm": 1.0859375, - "learning_rate": 0.0019383575004876781, - "loss": 1.5149, - "step": 237000 - }, - { - "epoch": 1.5443136744911894, - "grad_norm": 1.640625, - "learning_rate": 0.0019382274530203526, - "loss": 1.5149, - "step": 237500 - }, - { - "epoch": 1.5475648611743287, - "grad_norm": 1.359375, - "learning_rate": 0.0019380974055530269, - "loss": 1.5182, - "step": 238000 - }, - { - "epoch": 1.550816047857468, - "grad_norm": 1.453125, - "learning_rate": 0.0019379673580857014, - "loss": 1.5145, - "step": 238500 - }, - { - "epoch": 1.5540672345406072, - "grad_norm": 0.79296875, - "learning_rate": 0.0019378373106183758, - "loss": 1.5163, - "step": 239000 - }, - { - "epoch": 1.5573184212237465, - "grad_norm": 1.0078125, - "learning_rate": 0.00193770726315105, - "loss": 1.5128, - "step": 239500 - }, - { - "epoch": 1.560569607906886, - "grad_norm": 0.69140625, - "learning_rate": 0.0019375772156837246, - "loss": 1.5102, - 
"step": 240000 - }, - { - "epoch": 1.5638207945900253, - "grad_norm": 0.6875, - "learning_rate": 0.001937447168216399, - "loss": 1.5091, - "step": 240500 - }, - { - "epoch": 1.5670719812731648, - "grad_norm": 0.90625, - "learning_rate": 0.0019373171207490733, - "loss": 1.5095, - "step": 241000 - }, - { - "epoch": 1.5703231679563041, - "grad_norm": 0.734375, - "learning_rate": 0.0019371870732817478, - "loss": 1.5146, - "step": 241500 - }, - { - "epoch": 1.5735743546394434, - "grad_norm": 0.71875, - "learning_rate": 0.0019370570258144223, - "loss": 1.5162, - "step": 242000 - }, - { - "epoch": 1.5768255413225827, - "grad_norm": 0.96875, - "learning_rate": 0.0019369269783470965, - "loss": 1.5214, - "step": 242500 - }, - { - "epoch": 1.580076728005722, - "grad_norm": 1.078125, - "learning_rate": 0.001936796930879771, - "loss": 1.5162, - "step": 243000 - }, - { - "epoch": 1.5833279146888615, - "grad_norm": 0.6328125, - "learning_rate": 0.0019366668834124457, - "loss": 1.5258, - "step": 243500 - }, - { - "epoch": 1.5865791013720008, - "grad_norm": 0.765625, - "learning_rate": 0.0019365368359451202, - "loss": 1.5242, - "step": 244000 - }, - { - "epoch": 1.5898302880551403, - "grad_norm": 0.97265625, - "learning_rate": 0.0019364067884777944, - "loss": 1.5162, - "step": 244500 - }, - { - "epoch": 1.5930814747382795, - "grad_norm": 0.703125, - "learning_rate": 0.001936276741010469, - "loss": 1.5204, - "step": 245000 - }, - { - "epoch": 1.5963326614214188, - "grad_norm": 0.6875, - "learning_rate": 0.0019361466935431434, - "loss": 1.5206, - "step": 245500 - }, - { - "epoch": 1.5995838481045581, - "grad_norm": 1.0078125, - "learning_rate": 0.0019360166460758177, - "loss": 1.5266, - "step": 246000 - }, - { - "epoch": 1.6028350347876974, - "grad_norm": 1.8828125, - "learning_rate": 0.0019358865986084921, - "loss": 1.5218, - "step": 246500 - }, - { - "epoch": 1.606086221470837, - "grad_norm": 1.0234375, - "learning_rate": 0.0019357565511411666, - "loss": 1.5187, - "step": 247000 - }, - { - "epoch": 1.6093374081539762, - "grad_norm": 1.59375, - "learning_rate": 0.0019356265036738409, - "loss": 1.527, - "step": 247500 - }, - { - "epoch": 1.6125885948371157, - "grad_norm": 0.8984375, - "learning_rate": 0.0019354964562065154, - "loss": 1.519, - "step": 248000 - }, - { - "epoch": 1.615839781520255, - "grad_norm": 0.8125, - "learning_rate": 0.0019353664087391898, - "loss": 1.5178, - "step": 248500 - }, - { - "epoch": 1.6190909682033943, - "grad_norm": 0.953125, - "learning_rate": 0.001935236361271864, - "loss": 1.5131, - "step": 249000 - }, - { - "epoch": 1.6223421548865335, - "grad_norm": 1.0390625, - "learning_rate": 0.0019351063138045386, - "loss": 1.5118, - "step": 249500 - }, - { - "epoch": 1.6255933415696728, - "grad_norm": 0.64453125, - "learning_rate": 0.001934976266337213, - "loss": 1.5144, - "step": 250000 - }, - { - "epoch": 1.6288445282528121, - "grad_norm": 0.7109375, - "learning_rate": 0.0019348462188698878, - "loss": 1.5197, - "step": 250500 - }, - { - "epoch": 1.6320957149359516, - "grad_norm": 0.79296875, - "learning_rate": 0.001934716171402562, - "loss": 1.5172, - "step": 251000 - }, - { - "epoch": 1.6353469016190911, - "grad_norm": 1.984375, - "learning_rate": 0.0019345861239352365, - "loss": 1.5207, - "step": 251500 - }, - { - "epoch": 1.6385980883022304, - "grad_norm": 0.703125, - "learning_rate": 0.001934456076467911, - "loss": 1.5156, - "step": 252000 - }, - { - "epoch": 1.6418492749853697, - "grad_norm": 2.71875, - "learning_rate": 0.0019343260290005852, - "loss": 1.5161, - "step": 252500 
- }, - { - "epoch": 1.645100461668509, - "grad_norm": 1.1015625, - "learning_rate": 0.0019341959815332597, - "loss": 1.5193, - "step": 253000 - }, - { - "epoch": 1.6483516483516483, - "grad_norm": 33.5, - "learning_rate": 0.0019340659340659342, - "loss": 1.5211, - "step": 253500 - }, - { - "epoch": 1.6516028350347876, - "grad_norm": 1.125, - "learning_rate": 0.0019339358865986085, - "loss": 1.5132, - "step": 254000 - }, - { - "epoch": 1.654854021717927, - "grad_norm": 0.953125, - "learning_rate": 0.001933805839131283, - "loss": 1.5106, - "step": 254500 - }, - { - "epoch": 1.6581052084010663, - "grad_norm": 0.640625, - "learning_rate": 0.0019336757916639574, - "loss": 1.5101, - "step": 255000 - }, - { - "epoch": 1.6613563950842059, - "grad_norm": 27.125, - "learning_rate": 0.0019335457441966317, - "loss": 1.5107, - "step": 255500 - }, - { - "epoch": 1.6646075817673451, - "grad_norm": 0.94921875, - "learning_rate": 0.0019334156967293062, - "loss": 1.5099, - "step": 256000 - }, - { - "epoch": 1.6678587684504844, - "grad_norm": 0.60546875, - "learning_rate": 0.0019332856492619806, - "loss": 1.5145, - "step": 256500 - }, - { - "epoch": 1.6711099551336237, - "grad_norm": 0.6015625, - "learning_rate": 0.001933155601794655, - "loss": 1.5125, - "step": 257000 - }, - { - "epoch": 1.674361141816763, - "grad_norm": 0.578125, - "learning_rate": 0.0019330255543273294, - "loss": 1.5107, - "step": 257500 - }, - { - "epoch": 1.6776123284999025, - "grad_norm": 1.4296875, - "learning_rate": 0.001932895506860004, - "loss": 1.5076, - "step": 258000 - }, - { - "epoch": 1.6808635151830418, - "grad_norm": 0.6796875, - "learning_rate": 0.0019327654593926785, - "loss": 1.5095, - "step": 258500 - }, - { - "epoch": 1.6841147018661813, - "grad_norm": 0.73046875, - "learning_rate": 0.0019326354119253528, - "loss": 1.5088, - "step": 259000 - }, - { - "epoch": 1.6873658885493206, - "grad_norm": 0.56640625, - "learning_rate": 0.0019325053644580273, - "loss": 1.5099, - "step": 259500 - }, - { - "epoch": 1.6906170752324599, - "grad_norm": 0.6953125, - "learning_rate": 0.0019323753169907018, - "loss": 1.5095, - "step": 260000 - }, - { - "epoch": 1.6938682619155991, - "grad_norm": 1.640625, - "learning_rate": 0.001932245269523376, - "loss": 1.514, - "step": 260500 - }, - { - "epoch": 1.6971194485987384, - "grad_norm": 1.953125, - "learning_rate": 0.0019321152220560505, - "loss": 1.5228, - "step": 261000 - }, - { - "epoch": 1.700370635281878, - "grad_norm": 0.671875, - "learning_rate": 0.001931985174588725, - "loss": 1.5193, - "step": 261500 - }, - { - "epoch": 1.7036218219650172, - "grad_norm": 1.0703125, - "learning_rate": 0.0019318551271213992, - "loss": 1.5193, - "step": 262000 - }, - { - "epoch": 1.7068730086481567, - "grad_norm": 0.67578125, - "learning_rate": 0.0019317250796540737, - "loss": 1.5236, - "step": 262500 - }, - { - "epoch": 1.710124195331296, - "grad_norm": 1.3671875, - "learning_rate": 0.0019315950321867482, - "loss": 1.51, - "step": 263000 - }, - { - "epoch": 1.7133753820144353, - "grad_norm": 0.88671875, - "learning_rate": 0.0019314649847194225, - "loss": 1.5129, - "step": 263500 - }, - { - "epoch": 1.7166265686975746, - "grad_norm": 0.7109375, - "learning_rate": 0.001931334937252097, - "loss": 1.5101, - "step": 264000 - }, - { - "epoch": 1.7198777553807139, - "grad_norm": 0.63671875, - "learning_rate": 0.0019312048897847714, - "loss": 1.5112, - "step": 264500 - }, - { - "epoch": 1.7231289420638531, - "grad_norm": 0.8046875, - "learning_rate": 0.0019310748423174461, - "loss": 1.5131, - "step": 265000 - }, 
- { - "epoch": 1.7263801287469926, - "grad_norm": 0.6328125, - "learning_rate": 0.0019309447948501204, - "loss": 1.5135, - "step": 265500 - }, - { - "epoch": 1.7296313154301322, - "grad_norm": 0.73828125, - "learning_rate": 0.0019308147473827949, - "loss": 1.5105, - "step": 266000 - }, - { - "epoch": 1.7328825021132714, - "grad_norm": 0.67578125, - "learning_rate": 0.0019306846999154693, - "loss": 1.5133, - "step": 266500 - }, - { - "epoch": 1.7361336887964107, - "grad_norm": 0.89453125, - "learning_rate": 0.0019305546524481436, - "loss": 1.5098, - "step": 267000 - }, - { - "epoch": 1.73938487547955, - "grad_norm": 5.84375, - "learning_rate": 0.001930424604980818, - "loss": 1.5059, - "step": 267500 - }, - { - "epoch": 1.7426360621626893, - "grad_norm": 0.65234375, - "learning_rate": 0.0019302945575134926, - "loss": 1.514, - "step": 268000 - }, - { - "epoch": 1.7458872488458286, - "grad_norm": 0.78515625, - "learning_rate": 0.0019301645100461668, - "loss": 1.4967, - "step": 268500 - }, - { - "epoch": 1.749138435528968, - "grad_norm": 0.83984375, - "learning_rate": 0.0019300344625788413, - "loss": 1.505, - "step": 269000 - }, - { - "epoch": 1.7523896222121074, - "grad_norm": 1.640625, - "learning_rate": 0.0019299044151115158, - "loss": 1.5132, - "step": 269500 - }, - { - "epoch": 1.7556408088952469, - "grad_norm": 0.80859375, - "learning_rate": 0.00192977436764419, - "loss": 1.5124, - "step": 270000 - }, - { - "epoch": 1.7588919955783862, - "grad_norm": 0.6484375, - "learning_rate": 0.0019296443201768645, - "loss": 1.509, - "step": 270500 - }, - { - "epoch": 1.7621431822615254, - "grad_norm": 0.74609375, - "learning_rate": 0.001929514272709539, - "loss": 1.5049, - "step": 271000 - }, - { - "epoch": 1.7653943689446647, - "grad_norm": 2.359375, - "learning_rate": 0.0019293842252422133, - "loss": 1.5067, - "step": 271500 - }, - { - "epoch": 1.768645555627804, - "grad_norm": 0.81640625, - "learning_rate": 0.0019292541777748877, - "loss": 1.5076, - "step": 272000 - }, - { - "epoch": 1.7718967423109435, - "grad_norm": 0.90234375, - "learning_rate": 0.0019291241303075624, - "loss": 1.5, - "step": 272500 - }, - { - "epoch": 1.7751479289940828, - "grad_norm": 1.484375, - "learning_rate": 0.001928994082840237, - "loss": 1.5139, - "step": 273000 - }, - { - "epoch": 1.7783991156772223, - "grad_norm": 0.875, - "learning_rate": 0.0019288640353729112, - "loss": 1.5092, - "step": 273500 - }, - { - "epoch": 1.7816503023603616, - "grad_norm": 4.34375, - "learning_rate": 0.0019287339879055856, - "loss": 1.5087, - "step": 274000 - }, - { - "epoch": 1.7849014890435009, - "grad_norm": 1.2265625, - "learning_rate": 0.0019286039404382601, - "loss": 1.5178, - "step": 274500 - }, - { - "epoch": 1.7881526757266402, - "grad_norm": 0.73828125, - "learning_rate": 0.0019284738929709344, - "loss": 1.5253, - "step": 275000 - }, - { - "epoch": 1.7914038624097794, - "grad_norm": 2.140625, - "learning_rate": 0.0019283438455036089, - "loss": 1.5293, - "step": 275500 - }, - { - "epoch": 1.794655049092919, - "grad_norm": 1.59375, - "learning_rate": 0.0019282137980362833, - "loss": 1.528, - "step": 276000 - }, - { - "epoch": 1.7979062357760582, - "grad_norm": 0.67578125, - "learning_rate": 0.0019280837505689576, - "loss": 1.5178, - "step": 276500 - }, - { - "epoch": 1.8011574224591977, - "grad_norm": 0.62109375, - "learning_rate": 0.001927953703101632, - "loss": 1.5362, - "step": 277000 - }, - { - "epoch": 1.804408609142337, - "grad_norm": 0.671875, - "learning_rate": 0.0019278236556343066, - "loss": 1.5321, - "step": 277500 - }, 
- { - "epoch": 1.8076597958254763, - "grad_norm": 1.53125, - "learning_rate": 0.0019276936081669808, - "loss": 1.5199, - "step": 278000 - }, - { - "epoch": 1.8109109825086156, - "grad_norm": 3.921875, - "learning_rate": 0.0019275635606996553, - "loss": 1.52, - "step": 278500 - }, - { - "epoch": 1.8141621691917549, - "grad_norm": 0.9375, - "learning_rate": 0.0019274335132323298, - "loss": 1.5242, - "step": 279000 - }, - { - "epoch": 1.8174133558748944, - "grad_norm": 2.125, - "learning_rate": 0.0019273034657650045, - "loss": 1.5241, - "step": 279500 - }, - { - "epoch": 1.8206645425580337, - "grad_norm": 0.65625, - "learning_rate": 0.0019271734182976787, - "loss": 1.5294, - "step": 280000 - }, - { - "epoch": 1.8239157292411732, - "grad_norm": 0.63671875, - "learning_rate": 0.0019270433708303532, - "loss": 1.5173, - "step": 280500 - }, - { - "epoch": 1.8271669159243125, - "grad_norm": 2.046875, - "learning_rate": 0.0019269133233630277, - "loss": 1.5224, - "step": 281000 - }, - { - "epoch": 1.8304181026074517, - "grad_norm": 0.96875, - "learning_rate": 0.001926783275895702, - "loss": 1.5304, - "step": 281500 - }, - { - "epoch": 1.833669289290591, - "grad_norm": 0.69140625, - "learning_rate": 0.0019266532284283764, - "loss": 1.5289, - "step": 282000 - }, - { - "epoch": 1.8369204759737303, - "grad_norm": 0.8203125, - "learning_rate": 0.001926523180961051, - "loss": 1.5198, - "step": 282500 - }, - { - "epoch": 1.8401716626568696, - "grad_norm": 0.703125, - "learning_rate": 0.0019263931334937252, - "loss": 1.5292, - "step": 283000 - }, - { - "epoch": 1.843422849340009, - "grad_norm": 0.83984375, - "learning_rate": 0.0019262630860263997, - "loss": 1.5336, - "step": 283500 - }, - { - "epoch": 1.8466740360231486, - "grad_norm": 3.90625, - "learning_rate": 0.0019261330385590741, - "loss": 1.5415, - "step": 284000 - }, - { - "epoch": 1.849925222706288, - "grad_norm": 0.77734375, - "learning_rate": 0.0019260029910917484, - "loss": 1.5339, - "step": 284500 - }, - { - "epoch": 1.8531764093894272, - "grad_norm": 1.3984375, - "learning_rate": 0.0019258729436244229, - "loss": 1.5423, - "step": 285000 - }, - { - "epoch": 1.8564275960725665, - "grad_norm": 1.1875, - "learning_rate": 0.0019257428961570973, - "loss": 1.5517, - "step": 285500 - }, - { - "epoch": 1.8596787827557058, - "grad_norm": 0.8359375, - "learning_rate": 0.0019256128486897716, - "loss": 1.5582, - "step": 286000 - }, - { - "epoch": 1.862929969438845, - "grad_norm": 1.0390625, - "learning_rate": 0.001925482801222446, - "loss": 1.5598, - "step": 286500 - }, - { - "epoch": 1.8661811561219845, - "grad_norm": 0.6171875, - "learning_rate": 0.0019253527537551208, - "loss": 1.5541, - "step": 287000 - }, - { - "epoch": 1.8694323428051238, - "grad_norm": 1.1328125, - "learning_rate": 0.0019252227062877953, - "loss": 1.551, - "step": 287500 - }, - { - "epoch": 1.8726835294882633, - "grad_norm": 0.625, - "learning_rate": 0.0019250926588204695, - "loss": 1.5573, - "step": 288000 - }, - { - "epoch": 1.8759347161714026, - "grad_norm": 1.1953125, - "learning_rate": 0.001924962611353144, - "loss": 1.5485, - "step": 288500 - }, - { - "epoch": 1.879185902854542, - "grad_norm": 0.7578125, - "learning_rate": 0.0019248325638858185, - "loss": 1.5529, - "step": 289000 - }, - { - "epoch": 1.8824370895376812, - "grad_norm": 0.8359375, - "learning_rate": 0.0019247025164184927, - "loss": 1.5538, - "step": 289500 - }, - { - "epoch": 1.8856882762208205, - "grad_norm": 0.87109375, - "learning_rate": 0.0019245724689511672, - "loss": 1.5524, - "step": 290000 - }, - { - 
"epoch": 1.88893946290396, - "grad_norm": 0.8515625, - "learning_rate": 0.0019244424214838417, - "loss": 1.5582, - "step": 290500 - }, - { - "epoch": 1.8921906495870993, - "grad_norm": 2.21875, - "learning_rate": 0.001924312374016516, - "loss": 1.5682, - "step": 291000 - }, - { - "epoch": 1.8954418362702388, - "grad_norm": 0.86328125, - "learning_rate": 0.0019241823265491904, - "loss": 1.5639, - "step": 291500 - }, - { - "epoch": 1.898693022953378, - "grad_norm": 0.7890625, - "learning_rate": 0.001924052279081865, - "loss": 1.5732, - "step": 292000 - }, - { - "epoch": 1.9019442096365173, - "grad_norm": 0.73828125, - "learning_rate": 0.0019239222316145392, - "loss": 1.5634, - "step": 292500 - }, - { - "epoch": 1.9051953963196566, - "grad_norm": 0.55859375, - "learning_rate": 0.0019237921841472137, - "loss": 1.5599, - "step": 293000 - }, - { - "epoch": 1.908446583002796, - "grad_norm": 0.6171875, - "learning_rate": 0.0019236621366798881, - "loss": 1.5507, - "step": 293500 - }, - { - "epoch": 1.9116977696859354, - "grad_norm": 0.74609375, - "learning_rate": 0.0019235320892125628, - "loss": 1.5467, - "step": 294000 - }, - { - "epoch": 1.9149489563690747, - "grad_norm": 0.62109375, - "learning_rate": 0.001923402041745237, - "loss": 1.5465, - "step": 294500 - }, - { - "epoch": 1.9182001430522142, - "grad_norm": 0.83203125, - "learning_rate": 0.0019232719942779116, - "loss": 1.5408, - "step": 295000 - }, - { - "epoch": 1.9214513297353535, - "grad_norm": 0.76171875, - "learning_rate": 0.001923141946810586, - "loss": 1.5481, - "step": 295500 - }, - { - "epoch": 1.9247025164184928, - "grad_norm": 0.796875, - "learning_rate": 0.0019230118993432603, - "loss": 1.5488, - "step": 296000 - }, - { - "epoch": 1.927953703101632, - "grad_norm": 0.828125, - "learning_rate": 0.0019228818518759348, - "loss": 1.5443, - "step": 296500 - }, - { - "epoch": 1.9312048897847713, - "grad_norm": 0.94921875, - "learning_rate": 0.0019227518044086093, - "loss": 1.5579, - "step": 297000 - }, - { - "epoch": 1.9344560764679106, - "grad_norm": 1.5078125, - "learning_rate": 0.0019226217569412835, - "loss": 1.5481, - "step": 297500 - }, - { - "epoch": 1.9377072631510501, - "grad_norm": 0.5859375, - "learning_rate": 0.001922491709473958, - "loss": 1.5384, - "step": 298000 - }, - { - "epoch": 1.9409584498341896, - "grad_norm": 1.3828125, - "learning_rate": 0.0019223616620066325, - "loss": 1.5387, - "step": 298500 - }, - { - "epoch": 1.944209636517329, - "grad_norm": 1.8125, - "learning_rate": 0.0019222316145393068, - "loss": 1.5439, - "step": 299000 - }, - { - "epoch": 1.9474608232004682, - "grad_norm": 0.640625, - "learning_rate": 0.0019221015670719812, - "loss": 1.5443, - "step": 299500 - }, - { - "epoch": 1.9507120098836075, - "grad_norm": 1.046875, - "learning_rate": 0.0019219715196046557, - "loss": 1.5366, - "step": 300000 - }, - { - "epoch": 1.9539631965667468, - "grad_norm": 0.734375, - "learning_rate": 0.00192184147213733, - "loss": 1.5391, - "step": 300500 - }, - { - "epoch": 1.957214383249886, - "grad_norm": 0.75390625, - "learning_rate": 0.0019217114246700044, - "loss": 1.5394, - "step": 301000 - }, - { - "epoch": 1.9604655699330256, - "grad_norm": 1.0703125, - "learning_rate": 0.0019215813772026791, - "loss": 1.5341, - "step": 301500 - }, - { - "epoch": 1.9637167566161648, - "grad_norm": 1.375, - "learning_rate": 0.0019214513297353536, - "loss": 1.5414, - "step": 302000 - }, - { - "epoch": 1.9669679432993044, - "grad_norm": 1.171875, - "learning_rate": 0.0019213212822680279, - "loss": 1.5436, - "step": 302500 - }, - { - 
"epoch": 1.9702191299824436, - "grad_norm": 1.2578125, - "learning_rate": 0.0019211912348007024, - "loss": 1.5284, - "step": 303000 - }, - { - "epoch": 1.973470316665583, - "grad_norm": 0.58984375, - "learning_rate": 0.0019210611873333768, - "loss": 1.5208, - "step": 303500 - }, - { - "epoch": 1.9767215033487222, - "grad_norm": 0.8984375, - "learning_rate": 0.001920931139866051, - "loss": 1.5234, - "step": 304000 - }, - { - "epoch": 1.9799726900318615, - "grad_norm": 0.91015625, - "learning_rate": 0.0019208010923987256, - "loss": 1.5299, - "step": 304500 - }, - { - "epoch": 1.983223876715001, - "grad_norm": 0.87109375, - "learning_rate": 0.0019206710449314, - "loss": 1.5367, - "step": 305000 - }, - { - "epoch": 1.9864750633981403, - "grad_norm": 1.0234375, - "learning_rate": 0.0019205409974640743, - "loss": 1.5328, - "step": 305500 - }, - { - "epoch": 1.9897262500812798, - "grad_norm": 0.74609375, - "learning_rate": 0.0019204109499967488, - "loss": 1.5326, - "step": 306000 - }, - { - "epoch": 1.992977436764419, - "grad_norm": 0.796875, - "learning_rate": 0.0019202809025294233, - "loss": 1.5357, - "step": 306500 - }, - { - "epoch": 1.9962286234475584, - "grad_norm": 0.78515625, - "learning_rate": 0.0019201508550620975, - "loss": 1.5266, - "step": 307000 - }, - { - "epoch": 1.9994798101306976, - "grad_norm": 5.8125, - "learning_rate": 0.001920020807594772, - "loss": 1.5319, - "step": 307500 - }, - { - "epoch": 2.0, - "eval_loss": 1.5122345685958862, - "eval_runtime": 0.5402, - "eval_samples_per_second": 1851.06, - "eval_steps_per_second": 29.617, - "step": 307580 - }, - { - "epoch": 2.002730996813837, - "grad_norm": 1.359375, - "learning_rate": 0.0019198907601274465, - "loss": 1.53, - "step": 308000 - }, - { - "epoch": 2.005982183496976, - "grad_norm": 0.74609375, - "learning_rate": 0.0019197607126601212, - "loss": 1.5226, - "step": 308500 - }, - { - "epoch": 2.009233370180116, - "grad_norm": 0.7734375, - "learning_rate": 0.0019196306651927955, - "loss": 1.5161, - "step": 309000 - }, - { - "epoch": 2.0124845568632552, - "grad_norm": 0.96484375, - "learning_rate": 0.00191950061772547, - "loss": 1.5224, - "step": 309500 - }, - { - "epoch": 2.0157357435463945, - "grad_norm": 0.66015625, - "learning_rate": 0.0019193705702581444, - "loss": 1.5189, - "step": 310000 - }, - { - "epoch": 2.018986930229534, - "grad_norm": 0.734375, - "learning_rate": 0.0019192405227908187, - "loss": 1.5231, - "step": 310500 - }, - { - "epoch": 2.022238116912673, - "grad_norm": 0.93359375, - "learning_rate": 0.0019191104753234932, - "loss": 1.5372, - "step": 311000 - }, - { - "epoch": 2.0254893035958124, - "grad_norm": 1.5859375, - "learning_rate": 0.0019189804278561676, - "loss": 1.5421, - "step": 311500 - }, - { - "epoch": 2.0287404902789516, - "grad_norm": 1.109375, - "learning_rate": 0.001918850380388842, - "loss": 1.5291, - "step": 312000 - }, - { - "epoch": 2.0319916769620914, - "grad_norm": 0.58984375, - "learning_rate": 0.0019187203329215164, - "loss": 1.534, - "step": 312500 - }, - { - "epoch": 2.0352428636452307, - "grad_norm": 2.078125, - "learning_rate": 0.0019185902854541908, - "loss": 1.5307, - "step": 313000 - }, - { - "epoch": 2.03849405032837, - "grad_norm": 0.58203125, - "learning_rate": 0.0019184602379868651, - "loss": 1.5325, - "step": 313500 - }, - { - "epoch": 2.0417452370115092, - "grad_norm": 0.94921875, - "learning_rate": 0.0019183301905195396, - "loss": 1.5389, - "step": 314000 - }, - { - "epoch": 2.0449964236946485, - "grad_norm": 0.69140625, - "learning_rate": 0.001918200143052214, - "loss": 
1.5328, - "step": 314500 - }, - { - "epoch": 2.048247610377788, - "grad_norm": 1.640625, - "learning_rate": 0.0019180700955848883, - "loss": 1.5415, - "step": 315000 - }, - { - "epoch": 2.051498797060927, - "grad_norm": 0.76953125, - "learning_rate": 0.0019179400481175628, - "loss": 1.5549, - "step": 315500 - }, - { - "epoch": 2.0547499837440664, - "grad_norm": 0.6328125, - "learning_rate": 0.0019178100006502375, - "loss": 1.5597, - "step": 316000 - }, - { - "epoch": 2.058001170427206, - "grad_norm": 0.6796875, - "learning_rate": 0.001917679953182912, - "loss": 1.5947, - "step": 316500 - }, - { - "epoch": 2.0612523571103454, - "grad_norm": 0.68359375, - "learning_rate": 0.0019175499057155862, - "loss": 1.5896, - "step": 317000 - }, - { - "epoch": 2.0645035437934847, - "grad_norm": 1.203125, - "learning_rate": 0.0019174198582482607, - "loss": 1.5825, - "step": 317500 - }, - { - "epoch": 2.067754730476624, - "grad_norm": 1.203125, - "learning_rate": 0.0019172898107809352, - "loss": 1.5415, - "step": 318000 - }, - { - "epoch": 2.0710059171597632, - "grad_norm": 0.59765625, - "learning_rate": 0.0019171597633136095, - "loss": 1.5405, - "step": 318500 - }, - { - "epoch": 2.0742571038429025, - "grad_norm": 1.5, - "learning_rate": 0.001917029715846284, - "loss": 1.5333, - "step": 319000 - }, - { - "epoch": 2.077508290526042, - "grad_norm": 0.67578125, - "learning_rate": 0.0019168996683789584, - "loss": 1.5374, - "step": 319500 - }, - { - "epoch": 2.0807594772091815, - "grad_norm": 1.2109375, - "learning_rate": 0.0019167696209116327, - "loss": 1.5327, - "step": 320000 - }, - { - "epoch": 2.084010663892321, - "grad_norm": 1.3046875, - "learning_rate": 0.0019166395734443072, - "loss": 1.5281, - "step": 320500 - }, - { - "epoch": 2.08726185057546, - "grad_norm": 0.57421875, - "learning_rate": 0.0019165095259769816, - "loss": 1.529, - "step": 321000 - }, - { - "epoch": 2.0905130372585994, - "grad_norm": 0.8515625, - "learning_rate": 0.001916379478509656, - "loss": 1.5322, - "step": 321500 - }, - { - "epoch": 2.0937642239417387, - "grad_norm": 0.8125, - "learning_rate": 0.0019162494310423304, - "loss": 1.5284, - "step": 322000 - }, - { - "epoch": 2.097015410624878, - "grad_norm": 0.71875, - "learning_rate": 0.0019161193835750049, - "loss": 1.5338, - "step": 322500 - }, - { - "epoch": 2.1002665973080172, - "grad_norm": 0.85546875, - "learning_rate": 0.0019159893361076796, - "loss": 1.524, - "step": 323000 - }, - { - "epoch": 2.103517783991157, - "grad_norm": 0.8125, - "learning_rate": 0.0019158592886403538, - "loss": 1.5225, - "step": 323500 - }, - { - "epoch": 2.1067689706742962, - "grad_norm": 0.73046875, - "learning_rate": 0.0019157292411730283, - "loss": 1.5322, - "step": 324000 - }, - { - "epoch": 2.1100201573574355, - "grad_norm": 0.640625, - "learning_rate": 0.0019155991937057028, - "loss": 1.5216, - "step": 324500 - }, - { - "epoch": 2.113271344040575, - "grad_norm": 0.7734375, - "learning_rate": 0.001915469146238377, - "loss": 1.5237, - "step": 325000 - }, - { - "epoch": 2.116522530723714, - "grad_norm": 5.28125, - "learning_rate": 0.0019153390987710515, - "loss": 1.5298, - "step": 325500 - }, - { - "epoch": 2.1197737174068534, - "grad_norm": 0.66796875, - "learning_rate": 0.001915209051303726, - "loss": 1.5249, - "step": 326000 - }, - { - "epoch": 2.1230249040899927, - "grad_norm": 1.71875, - "learning_rate": 0.0019150790038364003, - "loss": 1.5262, - "step": 326500 - }, - { - "epoch": 2.1262760907731324, - "grad_norm": 1.0390625, - "learning_rate": 0.0019149489563690747, - "loss": 1.5222, - 
"step": 327000 - }, - { - "epoch": 2.1295272774562717, - "grad_norm": 0.76171875, - "learning_rate": 0.0019148189089017492, - "loss": 1.5148, - "step": 327500 - }, - { - "epoch": 2.132778464139411, - "grad_norm": 0.65234375, - "learning_rate": 0.0019146888614344235, - "loss": 1.5151, - "step": 328000 - }, - { - "epoch": 2.1360296508225503, - "grad_norm": 0.859375, - "learning_rate": 0.001914558813967098, - "loss": 1.5202, - "step": 328500 - }, - { - "epoch": 2.1392808375056895, - "grad_norm": 0.96875, - "learning_rate": 0.0019144287664997724, - "loss": 1.5299, - "step": 329000 - }, - { - "epoch": 2.142532024188829, - "grad_norm": 1.7265625, - "learning_rate": 0.0019142987190324467, - "loss": 1.5327, - "step": 329500 - }, - { - "epoch": 2.145783210871968, - "grad_norm": 0.81640625, - "learning_rate": 0.0019141686715651212, - "loss": 1.5325, - "step": 330000 - }, - { - "epoch": 2.1490343975551074, - "grad_norm": 1.15625, - "learning_rate": 0.0019140386240977959, - "loss": 1.5311, - "step": 330500 - }, - { - "epoch": 2.152285584238247, - "grad_norm": 0.75390625, - "learning_rate": 0.0019139085766304703, - "loss": 1.5228, - "step": 331000 - }, - { - "epoch": 2.1555367709213864, - "grad_norm": 0.75, - "learning_rate": 0.0019137785291631446, - "loss": 1.5265, - "step": 331500 - }, - { - "epoch": 2.1587879576045257, - "grad_norm": 0.62890625, - "learning_rate": 0.001913648481695819, - "loss": 1.5246, - "step": 332000 - }, - { - "epoch": 2.162039144287665, - "grad_norm": 1.078125, - "learning_rate": 0.0019135184342284936, - "loss": 1.5268, - "step": 332500 - }, - { - "epoch": 2.1652903309708043, - "grad_norm": 1.8046875, - "learning_rate": 0.0019133883867611678, - "loss": 1.5286, - "step": 333000 - }, - { - "epoch": 2.1685415176539435, - "grad_norm": 0.58203125, - "learning_rate": 0.0019132583392938423, - "loss": 1.5289, - "step": 333500 - }, - { - "epoch": 2.171792704337083, - "grad_norm": 3.296875, - "learning_rate": 0.0019131282918265168, - "loss": 1.5281, - "step": 334000 - }, - { - "epoch": 2.1750438910202226, - "grad_norm": 0.765625, - "learning_rate": 0.001912998244359191, - "loss": 1.5205, - "step": 334500 - }, - { - "epoch": 2.178295077703362, - "grad_norm": 1.171875, - "learning_rate": 0.0019128681968918655, - "loss": 1.518, - "step": 335000 - }, - { - "epoch": 2.181546264386501, - "grad_norm": 0.69921875, - "learning_rate": 0.00191273814942454, - "loss": 1.5153, - "step": 335500 - }, - { - "epoch": 2.1847974510696404, - "grad_norm": 0.58203125, - "learning_rate": 0.0019126081019572143, - "loss": 1.5212, - "step": 336000 - }, - { - "epoch": 2.1880486377527797, - "grad_norm": 1.6875, - "learning_rate": 0.0019124780544898887, - "loss": 1.5129, - "step": 336500 - }, - { - "epoch": 2.191299824435919, - "grad_norm": 0.87890625, - "learning_rate": 0.0019123480070225632, - "loss": 1.523, - "step": 337000 - }, - { - "epoch": 2.1945510111190583, - "grad_norm": 0.9296875, - "learning_rate": 0.001912217959555238, - "loss": 1.5178, - "step": 337500 - }, - { - "epoch": 2.197802197802198, - "grad_norm": 0.75390625, - "learning_rate": 0.0019120879120879122, - "loss": 1.5259, - "step": 338000 - }, - { - "epoch": 2.2010533844853373, - "grad_norm": 0.84765625, - "learning_rate": 0.0019119578646205867, - "loss": 1.5123, - "step": 338500 - }, - { - "epoch": 2.2043045711684766, - "grad_norm": 0.64453125, - "learning_rate": 0.0019118278171532611, - "loss": 1.5192, - "step": 339000 - }, - { - "epoch": 2.207555757851616, - "grad_norm": 0.7109375, - "learning_rate": 0.0019116977696859354, - "loss": 1.5165, - 
"step": 339500 - }, - { - "epoch": 2.210806944534755, - "grad_norm": 0.91796875, - "learning_rate": 0.0019115677222186099, - "loss": 1.517, - "step": 340000 - }, - { - "epoch": 2.2140581312178944, - "grad_norm": 0.53515625, - "learning_rate": 0.0019114376747512843, - "loss": 1.5276, - "step": 340500 - }, - { - "epoch": 2.2173093179010337, - "grad_norm": 0.66015625, - "learning_rate": 0.0019113076272839586, - "loss": 1.5122, - "step": 341000 - }, - { - "epoch": 2.2205605045841734, - "grad_norm": 0.859375, - "learning_rate": 0.001911177579816633, - "loss": 1.5157, - "step": 341500 - }, - { - "epoch": 2.2238116912673127, - "grad_norm": 0.6484375, - "learning_rate": 0.0019110475323493076, - "loss": 1.5224, - "step": 342000 - }, - { - "epoch": 2.227062877950452, - "grad_norm": 0.71484375, - "learning_rate": 0.0019109174848819818, - "loss": 1.5313, - "step": 342500 - }, - { - "epoch": 2.2303140646335913, - "grad_norm": 0.87890625, - "learning_rate": 0.0019107874374146563, - "loss": 1.5349, - "step": 343000 - }, - { - "epoch": 2.2335652513167306, - "grad_norm": 0.84375, - "learning_rate": 0.0019106573899473308, - "loss": 1.5298, - "step": 343500 - }, - { - "epoch": 2.23681643799987, - "grad_norm": 1.6875, - "learning_rate": 0.001910527342480005, - "loss": 1.5312, - "step": 344000 - }, - { - "epoch": 2.240067624683009, - "grad_norm": 1.0078125, - "learning_rate": 0.0019103972950126795, - "loss": 1.5363, - "step": 344500 - }, - { - "epoch": 2.243318811366149, - "grad_norm": 0.70703125, - "learning_rate": 0.0019102672475453542, - "loss": 1.535, - "step": 345000 - }, - { - "epoch": 2.246569998049288, - "grad_norm": 0.71484375, - "learning_rate": 0.0019101372000780287, - "loss": 1.5359, - "step": 345500 - }, - { - "epoch": 2.2498211847324274, - "grad_norm": 1.21875, - "learning_rate": 0.001910007152610703, - "loss": 1.5286, - "step": 346000 - }, - { - "epoch": 2.2530723714155667, - "grad_norm": 0.72265625, - "learning_rate": 0.0019098771051433774, - "loss": 1.5244, - "step": 346500 - }, - { - "epoch": 2.256323558098706, - "grad_norm": 0.61328125, - "learning_rate": 0.001909747057676052, - "loss": 1.5219, - "step": 347000 - }, - { - "epoch": 2.2595747447818453, - "grad_norm": 0.83203125, - "learning_rate": 0.0019096170102087262, - "loss": 1.5239, - "step": 347500 - }, - { - "epoch": 2.2628259314649846, - "grad_norm": 0.90234375, - "learning_rate": 0.0019094869627414007, - "loss": 1.5331, - "step": 348000 - }, - { - "epoch": 2.2660771181481243, - "grad_norm": 3.984375, - "learning_rate": 0.0019093569152740751, - "loss": 1.5267, - "step": 348500 - }, - { - "epoch": 2.2693283048312636, - "grad_norm": 0.59375, - "learning_rate": 0.0019092268678067494, - "loss": 1.5241, - "step": 349000 - }, - { - "epoch": 2.272579491514403, - "grad_norm": 0.734375, - "learning_rate": 0.0019090968203394239, - "loss": 1.5222, - "step": 349500 - }, - { - "epoch": 2.275830678197542, - "grad_norm": 0.78125, - "learning_rate": 0.0019089667728720984, - "loss": 1.5208, - "step": 350000 - }, - { - "epoch": 2.2790818648806814, - "grad_norm": 0.703125, - "learning_rate": 0.0019088367254047726, - "loss": 1.5276, - "step": 350500 - }, - { - "epoch": 2.2823330515638207, - "grad_norm": 0.63671875, - "learning_rate": 0.001908706677937447, - "loss": 1.5304, - "step": 351000 - }, - { - "epoch": 2.28558423824696, - "grad_norm": 0.63671875, - "learning_rate": 0.0019085766304701216, - "loss": 1.5276, - "step": 351500 - }, - { - "epoch": 2.2888354249300997, - "grad_norm": 1.1328125, - "learning_rate": 0.0019084465830027963, - "loss": 1.5292, - 
"step": 352000 - }, - { - "epoch": 2.292086611613239, - "grad_norm": 0.6328125, - "learning_rate": 0.0019083165355354705, - "loss": 1.5298, - "step": 352500 - }, - { - "epoch": 2.2953377982963783, - "grad_norm": 3.234375, - "learning_rate": 0.001908186488068145, - "loss": 1.5257, - "step": 353000 - }, - { - "epoch": 2.2985889849795176, - "grad_norm": 3.953125, - "learning_rate": 0.0019080564406008195, - "loss": 1.5232, - "step": 353500 - }, - { - "epoch": 2.301840171662657, - "grad_norm": 14.6875, - "learning_rate": 0.0019079263931334938, - "loss": 1.519, - "step": 354000 - }, - { - "epoch": 2.305091358345796, - "grad_norm": 0.82421875, - "learning_rate": 0.0019077963456661682, - "loss": 1.5199, - "step": 354500 - }, - { - "epoch": 2.3083425450289354, - "grad_norm": 0.66796875, - "learning_rate": 0.0019076662981988427, - "loss": 1.5216, - "step": 355000 - }, - { - "epoch": 2.3115937317120747, - "grad_norm": 1.0078125, - "learning_rate": 0.001907536250731517, - "loss": 1.522, - "step": 355500 - }, - { - "epoch": 2.3148449183952144, - "grad_norm": 0.94140625, - "learning_rate": 0.0019074062032641914, - "loss": 1.5132, - "step": 356000 - }, - { - "epoch": 2.3180961050783537, - "grad_norm": 0.93359375, - "learning_rate": 0.001907276155796866, - "loss": 1.5111, - "step": 356500 - }, - { - "epoch": 2.321347291761493, - "grad_norm": 2.78125, - "learning_rate": 0.0019071461083295402, - "loss": 1.5108, - "step": 357000 - }, - { - "epoch": 2.3245984784446323, - "grad_norm": 3.3125, - "learning_rate": 0.0019070160608622147, - "loss": 1.507, - "step": 357500 - }, - { - "epoch": 2.3278496651277716, - "grad_norm": 0.7734375, - "learning_rate": 0.0019068860133948891, - "loss": 1.5052, - "step": 358000 - }, - { - "epoch": 2.331100851810911, - "grad_norm": 0.91015625, - "learning_rate": 0.0019067559659275634, - "loss": 1.5098, - "step": 358500 - }, - { - "epoch": 2.33435203849405, - "grad_norm": 0.8203125, - "learning_rate": 0.0019066259184602379, - "loss": 1.5149, - "step": 359000 - }, - { - "epoch": 2.3376032251771894, - "grad_norm": 0.68359375, - "learning_rate": 0.0019064958709929126, - "loss": 1.5081, - "step": 359500 - }, - { - "epoch": 2.340854411860329, - "grad_norm": 0.8203125, - "learning_rate": 0.001906365823525587, - "loss": 1.5098, - "step": 360000 - }, - { - "epoch": 2.3441055985434684, - "grad_norm": 0.5859375, - "learning_rate": 0.0019062357760582613, - "loss": 1.5113, - "step": 360500 - }, - { - "epoch": 2.3473567852266077, - "grad_norm": 0.921875, - "learning_rate": 0.0019061057285909358, - "loss": 1.5087, - "step": 361000 - }, - { - "epoch": 2.350607971909747, - "grad_norm": 0.74609375, - "learning_rate": 0.0019059756811236103, - "loss": 1.5004, - "step": 361500 - }, - { - "epoch": 2.3538591585928863, - "grad_norm": 0.59375, - "learning_rate": 0.0019058456336562845, - "loss": 1.5032, - "step": 362000 - }, - { - "epoch": 2.3571103452760256, - "grad_norm": 0.66796875, - "learning_rate": 0.001905715586188959, - "loss": 1.5072, - "step": 362500 - }, - { - "epoch": 2.360361531959165, - "grad_norm": 0.57421875, - "learning_rate": 0.0019055855387216335, - "loss": 1.5085, - "step": 363000 - }, - { - "epoch": 2.3636127186423046, - "grad_norm": 0.73828125, - "learning_rate": 0.0019054554912543078, - "loss": 1.5041, - "step": 363500 - }, - { - "epoch": 2.366863905325444, - "grad_norm": 0.58203125, - "learning_rate": 0.0019053254437869822, - "loss": 1.5088, - "step": 364000 - }, - { - "epoch": 2.370115092008583, - "grad_norm": 0.84375, - "learning_rate": 0.0019051953963196567, - "loss": 1.5057, - 
"step": 364500 - }, - { - "epoch": 2.3733662786917225, - "grad_norm": 0.7890625, - "learning_rate": 0.001905065348852331, - "loss": 1.5074, - "step": 365000 - }, - { - "epoch": 2.3766174653748617, - "grad_norm": 7.75, - "learning_rate": 0.0019049353013850055, - "loss": 1.5067, - "step": 365500 - }, - { - "epoch": 2.379868652058001, - "grad_norm": 1.5625, - "learning_rate": 0.00190480525391768, - "loss": 1.5077, - "step": 366000 - }, - { - "epoch": 2.3831198387411403, - "grad_norm": 0.9609375, - "learning_rate": 0.0019046752064503546, - "loss": 1.5026, - "step": 366500 - }, - { - "epoch": 2.38637102542428, - "grad_norm": 2.265625, - "learning_rate": 0.0019045451589830289, - "loss": 1.5061, - "step": 367000 - }, - { - "epoch": 2.3896222121074193, - "grad_norm": 1.1015625, - "learning_rate": 0.0019044151115157034, - "loss": 1.5035, - "step": 367500 - }, - { - "epoch": 2.3928733987905586, - "grad_norm": 1.1875, - "learning_rate": 0.0019042850640483778, - "loss": 1.5019, - "step": 368000 - }, - { - "epoch": 2.396124585473698, - "grad_norm": 1.5, - "learning_rate": 0.0019041550165810521, - "loss": 1.4978, - "step": 368500 - }, - { - "epoch": 2.399375772156837, - "grad_norm": 0.74609375, - "learning_rate": 0.0019040249691137266, - "loss": 1.4969, - "step": 369000 - }, - { - "epoch": 2.4026269588399765, - "grad_norm": 2.109375, - "learning_rate": 0.001903894921646401, - "loss": 1.4945, - "step": 369500 - }, - { - "epoch": 2.4058781455231157, - "grad_norm": 0.7265625, - "learning_rate": 0.0019037648741790753, - "loss": 1.4858, - "step": 370000 - }, - { - "epoch": 2.4091293322062555, - "grad_norm": 0.93359375, - "learning_rate": 0.0019036348267117498, - "loss": 1.4978, - "step": 370500 - }, - { - "epoch": 2.4123805188893948, - "grad_norm": 0.94921875, - "learning_rate": 0.0019035047792444243, - "loss": 1.491, - "step": 371000 - }, - { - "epoch": 2.415631705572534, - "grad_norm": 1.53125, - "learning_rate": 0.0019033747317770985, - "loss": 1.498, - "step": 371500 - }, - { - "epoch": 2.4188828922556733, - "grad_norm": 1.8828125, - "learning_rate": 0.001903244684309773, - "loss": 1.4916, - "step": 372000 - }, - { - "epoch": 2.4221340789388126, - "grad_norm": 0.75, - "learning_rate": 0.0019031146368424475, - "loss": 1.4962, - "step": 372500 - }, - { - "epoch": 2.425385265621952, - "grad_norm": 0.8984375, - "learning_rate": 0.0019029845893751218, - "loss": 1.4885, - "step": 373000 - }, - { - "epoch": 2.428636452305091, - "grad_norm": 1.421875, - "learning_rate": 0.0019028545419077962, - "loss": 1.4952, - "step": 373500 - }, - { - "epoch": 2.431887638988231, - "grad_norm": 1.6953125, - "learning_rate": 0.001902724494440471, - "loss": 1.4921, - "step": 374000 - }, - { - "epoch": 2.43513882567137, - "grad_norm": 0.5625, - "learning_rate": 0.0019025944469731454, - "loss": 1.4982, - "step": 374500 - }, - { - "epoch": 2.4383900123545095, - "grad_norm": 0.69140625, - "learning_rate": 0.0019024643995058197, - "loss": 1.5005, - "step": 375000 - }, - { - "epoch": 2.4416411990376488, - "grad_norm": 0.6796875, - "learning_rate": 0.0019023343520384942, - "loss": 1.4998, - "step": 375500 - }, - { - "epoch": 2.444892385720788, - "grad_norm": 0.75390625, - "learning_rate": 0.0019022043045711686, - "loss": 1.4939, - "step": 376000 - }, - { - "epoch": 2.4481435724039273, - "grad_norm": 0.83203125, - "learning_rate": 0.001902074257103843, - "loss": 1.4971, - "step": 376500 - }, - { - "epoch": 2.4513947590870666, - "grad_norm": 0.80078125, - "learning_rate": 0.0019019442096365174, - "loss": 1.4931, - "step": 377000 - }, - { 
- "epoch": 2.4546459457702063, - "grad_norm": 2.890625, - "learning_rate": 0.0019018141621691919, - "loss": 1.4884, - "step": 377500 - }, - { - "epoch": 2.4578971324533456, - "grad_norm": 2.0625, - "learning_rate": 0.0019016841147018661, - "loss": 1.4982, - "step": 378000 - }, - { - "epoch": 2.461148319136485, - "grad_norm": 1.0625, - "learning_rate": 0.0019015540672345406, - "loss": 1.496, - "step": 378500 - }, - { - "epoch": 2.464399505819624, - "grad_norm": 0.6796875, - "learning_rate": 0.001901424019767215, - "loss": 1.4937, - "step": 379000 - }, - { - "epoch": 2.4676506925027635, - "grad_norm": 1.2109375, - "learning_rate": 0.0019012939722998893, - "loss": 1.4894, - "step": 379500 - }, - { - "epoch": 2.4709018791859028, - "grad_norm": 0.6484375, - "learning_rate": 0.0019011639248325638, - "loss": 1.4896, - "step": 380000 - }, - { - "epoch": 2.474153065869042, - "grad_norm": 3.859375, - "learning_rate": 0.0019010338773652383, - "loss": 1.492, - "step": 380500 - }, - { - "epoch": 2.4774042525521818, - "grad_norm": 0.92578125, - "learning_rate": 0.001900903829897913, - "loss": 1.499, - "step": 381000 - }, - { - "epoch": 2.480655439235321, - "grad_norm": 0.69921875, - "learning_rate": 0.0019007737824305873, - "loss": 1.4932, - "step": 381500 - }, - { - "epoch": 2.4839066259184603, - "grad_norm": 0.66796875, - "learning_rate": 0.0019006437349632617, - "loss": 1.4962, - "step": 382000 - }, - { - "epoch": 2.4871578126015996, - "grad_norm": 0.8359375, - "learning_rate": 0.0019005136874959362, - "loss": 1.4942, - "step": 382500 - }, - { - "epoch": 2.490408999284739, - "grad_norm": 0.671875, - "learning_rate": 0.0019003836400286105, - "loss": 1.487, - "step": 383000 - }, - { - "epoch": 2.493660185967878, - "grad_norm": 0.75, - "learning_rate": 0.001900253592561285, - "loss": 1.4892, - "step": 383500 - }, - { - "epoch": 2.4969113726510175, - "grad_norm": 0.85546875, - "learning_rate": 0.0019001235450939594, - "loss": 1.4959, - "step": 384000 - }, - { - "epoch": 2.500162559334157, - "grad_norm": 0.765625, - "learning_rate": 0.0018999934976266337, - "loss": 1.4904, - "step": 384500 - }, - { - "epoch": 2.503413746017296, - "grad_norm": 0.53515625, - "learning_rate": 0.0018998634501593082, - "loss": 1.4926, - "step": 385000 - }, - { - "epoch": 2.5066649327004358, - "grad_norm": 2.390625, - "learning_rate": 0.0018997334026919826, - "loss": 1.4885, - "step": 385500 - }, - { - "epoch": 2.509916119383575, - "grad_norm": 0.6796875, - "learning_rate": 0.001899603355224657, - "loss": 1.489, - "step": 386000 - }, - { - "epoch": 2.5131673060667143, - "grad_norm": 0.7890625, - "learning_rate": 0.0018994733077573314, - "loss": 1.4856, - "step": 386500 - }, - { - "epoch": 2.5164184927498536, - "grad_norm": 2.359375, - "learning_rate": 0.0018993432602900059, - "loss": 1.4854, - "step": 387000 - }, - { - "epoch": 2.519669679432993, - "grad_norm": 0.7421875, - "learning_rate": 0.0018992132128226801, - "loss": 1.487, - "step": 387500 - }, - { - "epoch": 2.5229208661161326, - "grad_norm": 0.86328125, - "learning_rate": 0.0018990831653553546, - "loss": 1.4987, - "step": 388000 - }, - { - "epoch": 2.5261720527992715, - "grad_norm": 0.671875, - "learning_rate": 0.0018989531178880293, - "loss": 1.4923, - "step": 388500 - }, - { - "epoch": 2.529423239482411, - "grad_norm": 1.0390625, - "learning_rate": 0.0018988230704207038, - "loss": 1.4905, - "step": 389000 - }, - { - "epoch": 2.5326744261655505, - "grad_norm": 4.875, - "learning_rate": 0.001898693022953378, - "loss": 1.4929, - "step": 389500 - }, - { - "epoch": 
2.53592561284869, - "grad_norm": 1.2109375, - "learning_rate": 0.0018985629754860525, - "loss": 1.4889, - "step": 390000 - }, - { - "epoch": 2.539176799531829, - "grad_norm": 0.8671875, - "learning_rate": 0.001898432928018727, - "loss": 1.4915, - "step": 390500 - }, - { - "epoch": 2.5424279862149683, - "grad_norm": 0.87109375, - "learning_rate": 0.0018983028805514013, - "loss": 1.484, - "step": 391000 - }, - { - "epoch": 2.545679172898108, - "grad_norm": 0.8125, - "learning_rate": 0.0018981728330840757, - "loss": 1.4877, - "step": 391500 - }, - { - "epoch": 2.548930359581247, - "grad_norm": 0.71875, - "learning_rate": 0.0018980427856167502, - "loss": 1.4979, - "step": 392000 - }, - { - "epoch": 2.5521815462643866, - "grad_norm": 0.71875, - "learning_rate": 0.0018979127381494245, - "loss": 1.4893, - "step": 392500 - }, - { - "epoch": 2.555432732947526, - "grad_norm": 1.0078125, - "learning_rate": 0.001897782690682099, - "loss": 1.4904, - "step": 393000 - }, - { - "epoch": 2.558683919630665, - "grad_norm": 0.64453125, - "learning_rate": 0.0018976526432147734, - "loss": 1.4982, - "step": 393500 - }, - { - "epoch": 2.5619351063138045, - "grad_norm": 1.46875, - "learning_rate": 0.0018975225957474477, - "loss": 1.4952, - "step": 394000 - }, - { - "epoch": 2.565186292996944, - "grad_norm": 0.7578125, - "learning_rate": 0.0018973925482801222, - "loss": 1.4959, - "step": 394500 - }, - { - "epoch": 2.568437479680083, - "grad_norm": 0.7578125, - "learning_rate": 0.0018972625008127967, - "loss": 1.488, - "step": 395000 - }, - { - "epoch": 2.5716886663632224, - "grad_norm": 1.734375, - "learning_rate": 0.0018971324533454713, - "loss": 1.491, - "step": 395500 - }, - { - "epoch": 2.574939853046362, - "grad_norm": 1.609375, - "learning_rate": 0.0018970024058781456, - "loss": 1.4935, - "step": 396000 - }, - { - "epoch": 2.5781910397295014, - "grad_norm": 1.1328125, - "learning_rate": 0.00189687235841082, - "loss": 1.4867, - "step": 396500 - }, - { - "epoch": 2.5814422264126407, - "grad_norm": 0.69921875, - "learning_rate": 0.0018967423109434946, - "loss": 1.4899, - "step": 397000 - }, - { - "epoch": 2.58469341309578, - "grad_norm": 2.890625, - "learning_rate": 0.0018966122634761688, - "loss": 1.4949, - "step": 397500 - }, - { - "epoch": 2.587944599778919, - "grad_norm": 0.87109375, - "learning_rate": 0.0018964822160088433, - "loss": 1.4928, - "step": 398000 - }, - { - "epoch": 2.5911957864620585, - "grad_norm": 0.70703125, - "learning_rate": 0.0018963521685415178, - "loss": 1.4994, - "step": 398500 - }, - { - "epoch": 2.594446973145198, - "grad_norm": 1.0078125, - "learning_rate": 0.001896222121074192, - "loss": 1.4884, - "step": 399000 - }, - { - "epoch": 2.5976981598283375, - "grad_norm": 0.88671875, - "learning_rate": 0.0018960920736068665, - "loss": 1.479, - "step": 399500 - }, - { - "epoch": 2.600949346511477, - "grad_norm": 0.7109375, - "learning_rate": 0.001895962026139541, - "loss": 1.4862, - "step": 400000 - }, - { - "epoch": 2.604200533194616, - "grad_norm": 0.75, - "learning_rate": 0.0018958319786722153, - "loss": 1.4912, - "step": 400500 - }, - { - "epoch": 2.6074517198777554, - "grad_norm": 0.65234375, - "learning_rate": 0.0018957019312048897, - "loss": 1.4863, - "step": 401000 - }, - { - "epoch": 2.6107029065608947, - "grad_norm": 0.7109375, - "learning_rate": 0.0018955718837375642, - "loss": 1.4892, - "step": 401500 - }, - { - "epoch": 2.613954093244034, - "grad_norm": 0.87109375, - "learning_rate": 0.0018954418362702385, - "loss": 1.4843, - "step": 402000 - }, - { - "epoch": 
2.6172052799271732, - "grad_norm": 1.0703125, - "learning_rate": 0.001895311788802913, - "loss": 1.4803, - "step": 402500 - }, - { - "epoch": 2.620456466610313, - "grad_norm": 0.99609375, - "learning_rate": 0.0018951817413355877, - "loss": 1.4781, - "step": 403000 - }, - { - "epoch": 2.6237076532934522, - "grad_norm": 0.75, - "learning_rate": 0.0018950516938682621, - "loss": 1.484, - "step": 403500 - }, - { - "epoch": 2.6269588399765915, - "grad_norm": 0.7265625, - "learning_rate": 0.0018949216464009364, - "loss": 1.4841, - "step": 404000 - }, - { - "epoch": 2.630210026659731, - "grad_norm": 0.8359375, - "learning_rate": 0.0018947915989336109, - "loss": 1.4807, - "step": 404500 - }, - { - "epoch": 2.63346121334287, - "grad_norm": 0.7578125, - "learning_rate": 0.0018946615514662854, - "loss": 1.4828, - "step": 405000 - }, - { - "epoch": 2.6367124000260094, - "grad_norm": 0.78125, - "learning_rate": 0.0018945315039989596, - "loss": 1.4817, - "step": 405500 - }, - { - "epoch": 2.6399635867091487, - "grad_norm": 0.83203125, - "learning_rate": 0.001894401456531634, - "loss": 1.4844, - "step": 406000 - }, - { - "epoch": 2.6432147733922884, - "grad_norm": 0.82421875, - "learning_rate": 0.0018942714090643086, - "loss": 1.4832, - "step": 406500 - }, - { - "epoch": 2.6464659600754277, - "grad_norm": 0.6796875, - "learning_rate": 0.0018941413615969828, - "loss": 1.4784, - "step": 407000 - }, - { - "epoch": 2.649717146758567, - "grad_norm": 1.0078125, - "learning_rate": 0.0018940113141296573, - "loss": 1.4795, - "step": 407500 - }, - { - "epoch": 2.6529683334417062, - "grad_norm": 0.7109375, - "learning_rate": 0.0018938812666623318, - "loss": 1.4795, - "step": 408000 - }, - { - "epoch": 2.6562195201248455, - "grad_norm": 0.76171875, - "learning_rate": 0.001893751219195006, - "loss": 1.4835, - "step": 408500 - }, - { - "epoch": 2.659470706807985, - "grad_norm": 0.7109375, - "learning_rate": 0.0018936211717276805, - "loss": 1.4775, - "step": 409000 - }, - { - "epoch": 2.662721893491124, - "grad_norm": 0.7265625, - "learning_rate": 0.001893491124260355, - "loss": 1.4808, - "step": 409500 - }, - { - "epoch": 2.665973080174264, - "grad_norm": 0.69921875, - "learning_rate": 0.0018933610767930297, - "loss": 1.4789, - "step": 410000 - }, - { - "epoch": 2.669224266857403, - "grad_norm": 0.73046875, - "learning_rate": 0.001893231029325704, - "loss": 1.4741, - "step": 410500 - }, - { - "epoch": 2.6724754535405424, - "grad_norm": 0.83984375, - "learning_rate": 0.0018931009818583784, - "loss": 1.4803, - "step": 411000 - }, - { - "epoch": 2.6757266402236817, - "grad_norm": 0.6484375, - "learning_rate": 0.001892970934391053, - "loss": 1.4856, - "step": 411500 - }, - { - "epoch": 2.678977826906821, - "grad_norm": 1.4296875, - "learning_rate": 0.0018928408869237272, - "loss": 1.4796, - "step": 412000 - }, - { - "epoch": 2.6822290135899602, - "grad_norm": 0.91015625, - "learning_rate": 0.0018927108394564017, - "loss": 1.4844, - "step": 412500 - }, - { - "epoch": 2.6854802002730995, - "grad_norm": 3.4375, - "learning_rate": 0.0018925807919890761, - "loss": 1.4866, - "step": 413000 - }, - { - "epoch": 2.6887313869562393, - "grad_norm": 0.70703125, - "learning_rate": 0.0018924507445217504, - "loss": 1.496, - "step": 413500 - }, - { - "epoch": 2.691982573639378, - "grad_norm": 0.7109375, - "learning_rate": 0.0018923206970544249, - "loss": 1.4882, - "step": 414000 - }, - { - "epoch": 2.695233760322518, - "grad_norm": 2.734375, - "learning_rate": 0.0018921906495870994, - "loss": 1.4902, - "step": 414500 - }, - { - "epoch": 
2.698484947005657, - "grad_norm": 0.7109375, - "learning_rate": 0.0018920606021197736, - "loss": 1.4878, - "step": 415000 - }, - { - "epoch": 2.7017361336887964, - "grad_norm": 0.6328125, - "learning_rate": 0.001891930554652448, - "loss": 1.4858, - "step": 415500 - }, - { - "epoch": 2.7049873203719357, - "grad_norm": 0.7265625, - "learning_rate": 0.0018918005071851226, - "loss": 1.4857, - "step": 416000 - }, - { - "epoch": 2.708238507055075, - "grad_norm": 1.09375, - "learning_rate": 0.0018916704597177968, - "loss": 1.4811, - "step": 416500 - }, - { - "epoch": 2.7114896937382147, - "grad_norm": 0.67578125, - "learning_rate": 0.0018915404122504713, - "loss": 1.481, - "step": 417000 - }, - { - "epoch": 2.7147408804213535, - "grad_norm": 0.609375, - "learning_rate": 0.001891410364783146, - "loss": 1.4776, - "step": 417500 - }, - { - "epoch": 2.7179920671044933, - "grad_norm": 15.0625, - "learning_rate": 0.0018912803173158205, - "loss": 1.4783, - "step": 418000 - }, - { - "epoch": 2.7212432537876325, - "grad_norm": 1.1640625, - "learning_rate": 0.0018911502698484948, - "loss": 1.4751, - "step": 418500 - }, - { - "epoch": 2.724494440470772, - "grad_norm": 0.90234375, - "learning_rate": 0.0018910202223811692, - "loss": 1.4733, - "step": 419000 - }, - { - "epoch": 2.727745627153911, - "grad_norm": 1.8359375, - "learning_rate": 0.0018908901749138437, - "loss": 1.4783, - "step": 419500 - }, - { - "epoch": 2.7309968138370504, - "grad_norm": 1.6171875, - "learning_rate": 0.001890760127446518, - "loss": 1.4742, - "step": 420000 - }, - { - "epoch": 2.73424800052019, - "grad_norm": 0.6796875, - "learning_rate": 0.0018906300799791925, - "loss": 1.4728, - "step": 420500 - }, - { - "epoch": 2.737499187203329, - "grad_norm": 0.66015625, - "learning_rate": 0.001890500032511867, - "loss": 1.4777, - "step": 421000 - }, - { - "epoch": 2.7407503738864687, - "grad_norm": 0.66796875, - "learning_rate": 0.0018903699850445412, - "loss": 1.4702, - "step": 421500 - }, - { - "epoch": 2.744001560569608, - "grad_norm": 0.74609375, - "learning_rate": 0.0018902399375772157, - "loss": 1.474, - "step": 422000 - }, - { - "epoch": 2.7472527472527473, - "grad_norm": 1.09375, - "learning_rate": 0.0018901098901098902, - "loss": 1.4753, - "step": 422500 - }, - { - "epoch": 2.7505039339358865, - "grad_norm": 0.88671875, - "learning_rate": 0.0018899798426425644, - "loss": 1.4768, - "step": 423000 - }, - { - "epoch": 2.753755120619026, - "grad_norm": 0.63671875, - "learning_rate": 0.001889849795175239, - "loss": 1.4777, - "step": 423500 - }, - { - "epoch": 2.7570063073021656, - "grad_norm": 1.921875, - "learning_rate": 0.0018897197477079134, - "loss": 1.4726, - "step": 424000 - }, - { - "epoch": 2.7602574939853044, - "grad_norm": 0.63671875, - "learning_rate": 0.001889589700240588, - "loss": 1.477, - "step": 424500 - }, - { - "epoch": 2.763508680668444, - "grad_norm": 0.62890625, - "learning_rate": 0.0018894596527732623, - "loss": 1.4721, - "step": 425000 - }, - { - "epoch": 2.7667598673515834, - "grad_norm": 0.76953125, - "learning_rate": 0.0018893296053059368, - "loss": 1.4787, - "step": 425500 - }, - { - "epoch": 2.7700110540347227, - "grad_norm": 0.8125, - "learning_rate": 0.0018891995578386113, - "loss": 1.4727, - "step": 426000 - }, - { - "epoch": 2.773262240717862, - "grad_norm": 0.78125, - "learning_rate": 0.0018890695103712855, - "loss": 1.4694, - "step": 426500 - }, - { - "epoch": 2.7765134274010013, - "grad_norm": 0.6796875, - "learning_rate": 0.00188893946290396, - "loss": 1.476, - "step": 427000 - }, - { - "epoch": 
2.7797646140841406, - "grad_norm": 0.7265625, - "learning_rate": 0.0018888094154366345, - "loss": 1.4783, - "step": 427500 - }, - { - "epoch": 2.78301580076728, - "grad_norm": 0.8046875, - "learning_rate": 0.0018886793679693088, - "loss": 1.4707, - "step": 428000 - }, - { - "epoch": 2.7862669874504196, - "grad_norm": 0.6484375, - "learning_rate": 0.0018885493205019832, - "loss": 1.4717, - "step": 428500 - }, - { - "epoch": 2.789518174133559, - "grad_norm": 0.8203125, - "learning_rate": 0.0018884192730346577, - "loss": 1.4716, - "step": 429000 - }, - { - "epoch": 2.792769360816698, - "grad_norm": 0.6328125, - "learning_rate": 0.001888289225567332, - "loss": 1.4667, - "step": 429500 - }, - { - "epoch": 2.7960205474998374, - "grad_norm": 0.69140625, - "learning_rate": 0.0018881591781000065, - "loss": 1.4683, - "step": 430000 - }, - { - "epoch": 2.7992717341829767, - "grad_norm": 12.0625, - "learning_rate": 0.001888029130632681, - "loss": 1.4719, - "step": 430500 - }, - { - "epoch": 2.802522920866116, - "grad_norm": 1.9296875, - "learning_rate": 0.0018878990831653552, - "loss": 1.4711, - "step": 431000 - }, - { - "epoch": 2.8057741075492553, - "grad_norm": 1.390625, - "learning_rate": 0.0018877690356980297, - "loss": 1.4727, - "step": 431500 - }, - { - "epoch": 2.809025294232395, - "grad_norm": 0.65234375, - "learning_rate": 0.0018876389882307044, - "loss": 1.472, - "step": 432000 - }, - { - "epoch": 2.8122764809155343, - "grad_norm": 0.515625, - "learning_rate": 0.0018875089407633789, - "loss": 1.4796, - "step": 432500 - }, - { - "epoch": 2.8155276675986736, - "grad_norm": 0.90234375, - "learning_rate": 0.0018873788932960531, - "loss": 1.4719, - "step": 433000 - }, - { - "epoch": 2.818778854281813, - "grad_norm": 0.83984375, - "learning_rate": 0.0018872488458287276, - "loss": 1.4764, - "step": 433500 - }, - { - "epoch": 2.822030040964952, - "grad_norm": 0.7109375, - "learning_rate": 0.001887118798361402, - "loss": 1.4712, - "step": 434000 - }, - { - "epoch": 2.8252812276480914, - "grad_norm": 12.875, - "learning_rate": 0.0018869887508940763, - "loss": 1.4664, - "step": 434500 - }, - { - "epoch": 2.8285324143312307, - "grad_norm": 0.6640625, - "learning_rate": 0.0018868587034267508, - "loss": 1.4708, - "step": 435000 - }, - { - "epoch": 2.8317836010143704, - "grad_norm": 3.203125, - "learning_rate": 0.0018867286559594253, - "loss": 1.4718, - "step": 435500 - }, - { - "epoch": 2.8350347876975097, - "grad_norm": 0.66796875, - "learning_rate": 0.0018865986084920996, - "loss": 1.4719, - "step": 436000 - }, - { - "epoch": 2.838285974380649, - "grad_norm": 0.69921875, - "learning_rate": 0.001886468561024774, - "loss": 1.4692, - "step": 436500 - }, - { - "epoch": 2.8415371610637883, - "grad_norm": 2.953125, - "learning_rate": 0.0018863385135574485, - "loss": 1.4735, - "step": 437000 - }, - { - "epoch": 2.8447883477469276, - "grad_norm": 0.70703125, - "learning_rate": 0.0018862084660901228, - "loss": 1.4724, - "step": 437500 - }, - { - "epoch": 2.848039534430067, - "grad_norm": 0.74609375, - "learning_rate": 0.0018860784186227973, - "loss": 1.4687, - "step": 438000 - }, - { - "epoch": 2.851290721113206, - "grad_norm": 0.73046875, - "learning_rate": 0.0018859483711554717, - "loss": 1.4734, - "step": 438500 - }, - { - "epoch": 2.854541907796346, - "grad_norm": 0.76171875, - "learning_rate": 0.0018858183236881464, - "loss": 1.4705, - "step": 439000 - }, - { - "epoch": 2.857793094479485, - "grad_norm": 0.6953125, - "learning_rate": 0.0018856882762208207, - "loss": 1.4702, - "step": 439500 - }, - { - 
"epoch": 2.8610442811626244, - "grad_norm": 0.640625, - "learning_rate": 0.0018855582287534952, - "loss": 1.4725, - "step": 440000 - }, - { - "epoch": 2.8642954678457637, - "grad_norm": 0.87109375, - "learning_rate": 0.0018854281812861696, - "loss": 1.4784, - "step": 440500 - }, - { - "epoch": 2.867546654528903, - "grad_norm": 1.15625, - "learning_rate": 0.001885298133818844, - "loss": 1.4737, - "step": 441000 - }, - { - "epoch": 2.8707978412120423, - "grad_norm": 0.61328125, - "learning_rate": 0.0018851680863515184, - "loss": 1.4762, - "step": 441500 - }, - { - "epoch": 2.8740490278951816, - "grad_norm": 1.2421875, - "learning_rate": 0.0018850380388841929, - "loss": 1.4729, - "step": 442000 - }, - { - "epoch": 2.8773002145783213, - "grad_norm": 0.64453125, - "learning_rate": 0.0018849079914168671, - "loss": 1.4756, - "step": 442500 - }, - { - "epoch": 2.8805514012614606, - "grad_norm": 0.578125, - "learning_rate": 0.0018847779439495416, - "loss": 1.4757, - "step": 443000 - }, - { - "epoch": 2.8838025879446, - "grad_norm": 0.56640625, - "learning_rate": 0.001884647896482216, - "loss": 1.4769, - "step": 443500 - }, - { - "epoch": 2.887053774627739, - "grad_norm": 0.703125, - "learning_rate": 0.0018845178490148903, - "loss": 1.4805, - "step": 444000 - }, - { - "epoch": 2.8903049613108784, - "grad_norm": 0.68359375, - "learning_rate": 0.0018843878015475648, - "loss": 1.4756, - "step": 444500 - }, - { - "epoch": 2.8935561479940177, - "grad_norm": 0.80078125, - "learning_rate": 0.0018842577540802393, - "loss": 1.4753, - "step": 445000 - }, - { - "epoch": 2.896807334677157, - "grad_norm": 2.875, - "learning_rate": 0.0018841277066129136, - "loss": 1.474, - "step": 445500 - }, - { - "epoch": 2.9000585213602967, - "grad_norm": 0.73046875, - "learning_rate": 0.001883997659145588, - "loss": 1.4762, - "step": 446000 - }, - { - "epoch": 2.9033097080434356, - "grad_norm": 0.86328125, - "learning_rate": 0.0018838676116782627, - "loss": 1.4703, - "step": 446500 - }, - { - "epoch": 2.9065608947265753, - "grad_norm": 0.7890625, - "learning_rate": 0.0018837375642109372, - "loss": 1.4681, - "step": 447000 - }, - { - "epoch": 2.9098120814097146, - "grad_norm": 2.6875, - "learning_rate": 0.0018836075167436115, - "loss": 1.4768, - "step": 447500 - }, - { - "epoch": 2.913063268092854, - "grad_norm": 0.92578125, - "learning_rate": 0.001883477469276286, - "loss": 1.4705, - "step": 448000 - }, - { - "epoch": 2.916314454775993, - "grad_norm": 1.8828125, - "learning_rate": 0.0018833474218089604, - "loss": 1.4775, - "step": 448500 - }, - { - "epoch": 2.9195656414591324, - "grad_norm": 0.703125, - "learning_rate": 0.0018832173743416347, - "loss": 1.4738, - "step": 449000 - }, - { - "epoch": 2.922816828142272, - "grad_norm": 1.03125, - "learning_rate": 0.0018830873268743092, - "loss": 1.4776, - "step": 449500 - }, - { - "epoch": 2.926068014825411, - "grad_norm": 0.8515625, - "learning_rate": 0.0018829572794069837, - "loss": 1.4748, - "step": 450000 - }, - { - "epoch": 2.9293192015085507, - "grad_norm": 0.93359375, - "learning_rate": 0.001882827231939658, - "loss": 1.4722, - "step": 450500 - }, - { - "epoch": 2.93257038819169, - "grad_norm": 0.70703125, - "learning_rate": 0.0018826971844723324, - "loss": 1.476, - "step": 451000 - }, - { - "epoch": 2.9358215748748293, - "grad_norm": 0.6640625, - "learning_rate": 0.0018825671370050069, - "loss": 1.4676, - "step": 451500 - }, - { - "epoch": 2.9390727615579686, - "grad_norm": 0.671875, - "learning_rate": 0.0018824370895376811, - "loss": 1.4714, - "step": 452000 - }, - { - 
"epoch": 2.942323948241108, - "grad_norm": 0.77734375, - "learning_rate": 0.0018823070420703556, - "loss": 1.468, - "step": 452500 - }, - { - "epoch": 2.9455751349242476, - "grad_norm": 0.8671875, - "learning_rate": 0.00188217699460303, - "loss": 1.472, - "step": 453000 - }, - { - "epoch": 2.9488263216073864, - "grad_norm": 0.9296875, - "learning_rate": 0.0018820469471357048, - "loss": 1.4727, - "step": 453500 - }, - { - "epoch": 2.952077508290526, - "grad_norm": 0.8515625, - "learning_rate": 0.001881916899668379, - "loss": 1.4722, - "step": 454000 - }, - { - "epoch": 2.9553286949736655, - "grad_norm": 0.61328125, - "learning_rate": 0.0018817868522010535, - "loss": 1.4691, - "step": 454500 - }, - { - "epoch": 2.9585798816568047, - "grad_norm": 0.75, - "learning_rate": 0.001881656804733728, - "loss": 1.4649, - "step": 455000 - }, - { - "epoch": 2.961831068339944, - "grad_norm": 0.67578125, - "learning_rate": 0.0018815267572664023, - "loss": 1.4633, - "step": 455500 - }, - { - "epoch": 2.9650822550230833, - "grad_norm": 0.734375, - "learning_rate": 0.0018813967097990767, - "loss": 1.461, - "step": 456000 - }, - { - "epoch": 2.968333441706223, - "grad_norm": 0.84765625, - "learning_rate": 0.0018812666623317512, - "loss": 1.4665, - "step": 456500 - }, - { - "epoch": 2.971584628389362, - "grad_norm": 0.6875, - "learning_rate": 0.0018811366148644255, - "loss": 1.4658, - "step": 457000 - }, - { - "epoch": 2.9748358150725016, - "grad_norm": 0.79296875, - "learning_rate": 0.0018810065673971, - "loss": 1.4662, - "step": 457500 - }, - { - "epoch": 2.978087001755641, - "grad_norm": 0.6484375, - "learning_rate": 0.0018808765199297744, - "loss": 1.4632, - "step": 458000 - }, - { - "epoch": 2.98133818843878, - "grad_norm": 0.80078125, - "learning_rate": 0.0018807464724624487, - "loss": 1.4701, - "step": 458500 - }, - { - "epoch": 2.9845893751219195, - "grad_norm": 0.95703125, - "learning_rate": 0.0018806164249951232, - "loss": 1.4664, - "step": 459000 - }, - { - "epoch": 2.9878405618050587, - "grad_norm": 0.7578125, - "learning_rate": 0.0018804863775277977, - "loss": 1.4679, - "step": 459500 - }, - { - "epoch": 2.991091748488198, - "grad_norm": 0.765625, - "learning_rate": 0.001880356330060472, - "loss": 1.4674, - "step": 460000 - }, - { - "epoch": 2.9943429351713373, - "grad_norm": 1.515625, - "learning_rate": 0.0018802262825931464, - "loss": 1.4695, - "step": 460500 - }, - { - "epoch": 2.997594121854477, - "grad_norm": 0.8359375, - "learning_rate": 0.001880096235125821, - "loss": 1.464, - "step": 461000 - }, - { - "epoch": 3.0, - "eval_loss": 1.4555257558822632, - "eval_runtime": 0.5334, - "eval_samples_per_second": 1874.801, - "eval_steps_per_second": 29.997, - "step": 461370 - }, - { - "epoch": 3.0008453085376163, - "grad_norm": 0.78125, - "learning_rate": 0.0018799661876584956, - "loss": 1.4686, - "step": 461500 - }, - { - "epoch": 3.0040964952207556, - "grad_norm": 0.8359375, - "learning_rate": 0.0018798361401911698, - "loss": 1.4627, - "step": 462000 - }, - { - "epoch": 3.007347681903895, - "grad_norm": 2.734375, - "learning_rate": 0.0018797060927238443, - "loss": 1.4682, - "step": 462500 - }, - { - "epoch": 3.010598868587034, - "grad_norm": 1.0, - "learning_rate": 0.0018795760452565188, - "loss": 1.4645, - "step": 463000 - }, - { - "epoch": 3.0138500552701735, - "grad_norm": 0.6796875, - "learning_rate": 0.001879445997789193, - "loss": 1.4582, - "step": 463500 - }, - { - "epoch": 3.0171012419533128, - "grad_norm": 0.8828125, - "learning_rate": 0.0018793159503218675, - "loss": 1.462, - "step": 
464000 - }, - { - "epoch": 3.0203524286364525, - "grad_norm": 0.921875, - "learning_rate": 0.001879185902854542, - "loss": 1.4655, - "step": 464500 - }, - { - "epoch": 3.0236036153195918, - "grad_norm": 0.6171875, - "learning_rate": 0.0018790558553872163, - "loss": 1.4624, - "step": 465000 - }, - { - "epoch": 3.026854802002731, - "grad_norm": 0.8125, - "learning_rate": 0.0018789258079198908, - "loss": 1.4621, - "step": 465500 - }, - { - "epoch": 3.0301059886858703, - "grad_norm": 1.25, - "learning_rate": 0.0018787957604525652, - "loss": 1.4682, - "step": 466000 - }, - { - "epoch": 3.0333571753690096, - "grad_norm": 0.7890625, - "learning_rate": 0.0018786657129852395, - "loss": 1.4636, - "step": 466500 - }, - { - "epoch": 3.036608362052149, - "grad_norm": 0.609375, - "learning_rate": 0.001878535665517914, - "loss": 1.4661, - "step": 467000 - }, - { - "epoch": 3.039859548735288, - "grad_norm": 0.78515625, - "learning_rate": 0.0018784056180505884, - "loss": 1.4604, - "step": 467500 - }, - { - "epoch": 3.043110735418428, - "grad_norm": 0.6796875, - "learning_rate": 0.0018782755705832631, - "loss": 1.4597, - "step": 468000 - }, - { - "epoch": 3.046361922101567, - "grad_norm": 0.65625, - "learning_rate": 0.0018781455231159374, - "loss": 1.4634, - "step": 468500 - }, - { - "epoch": 3.0496131087847065, - "grad_norm": 1.0390625, - "learning_rate": 0.0018780154756486119, - "loss": 1.4617, - "step": 469000 - }, - { - "epoch": 3.0528642954678458, - "grad_norm": 0.65625, - "learning_rate": 0.0018778854281812864, - "loss": 1.4653, - "step": 469500 - }, - { - "epoch": 3.056115482150985, - "grad_norm": 1.4609375, - "learning_rate": 0.0018777553807139606, - "loss": 1.4659, - "step": 470000 - }, - { - "epoch": 3.0593666688341243, - "grad_norm": 0.79296875, - "learning_rate": 0.001877625333246635, - "loss": 1.4602, - "step": 470500 - }, - { - "epoch": 3.0626178555172636, - "grad_norm": 1.0546875, - "learning_rate": 0.0018774952857793096, - "loss": 1.4627, - "step": 471000 - }, - { - "epoch": 3.0658690422004033, - "grad_norm": 1.15625, - "learning_rate": 0.0018773652383119838, - "loss": 1.4682, - "step": 471500 - }, - { - "epoch": 3.0691202288835426, - "grad_norm": 2.609375, - "learning_rate": 0.0018772351908446583, - "loss": 1.4656, - "step": 472000 - }, - { - "epoch": 3.072371415566682, - "grad_norm": 0.671875, - "learning_rate": 0.0018771051433773328, - "loss": 1.4598, - "step": 472500 - }, - { - "epoch": 3.075622602249821, - "grad_norm": 0.70703125, - "learning_rate": 0.001876975095910007, - "loss": 1.4602, - "step": 473000 - }, - { - "epoch": 3.0788737889329605, - "grad_norm": 0.7578125, - "learning_rate": 0.0018768450484426815, - "loss": 1.4621, - "step": 473500 - }, - { - "epoch": 3.0821249756160998, - "grad_norm": 0.6171875, - "learning_rate": 0.001876715000975356, - "loss": 1.4628, - "step": 474000 - }, - { - "epoch": 3.085376162299239, - "grad_norm": 0.8203125, - "learning_rate": 0.0018765849535080303, - "loss": 1.4626, - "step": 474500 - }, - { - "epoch": 3.088627348982379, - "grad_norm": 0.73828125, - "learning_rate": 0.0018764549060407048, - "loss": 1.466, - "step": 475000 - }, - { - "epoch": 3.091878535665518, - "grad_norm": 0.8671875, - "learning_rate": 0.0018763248585733795, - "loss": 1.462, - "step": 475500 - }, - { - "epoch": 3.0951297223486574, - "grad_norm": 7.71875, - "learning_rate": 0.001876194811106054, - "loss": 1.464, - "step": 476000 - }, - { - "epoch": 3.0983809090317966, - "grad_norm": 0.625, - "learning_rate": 0.0018760647636387282, - "loss": 1.4624, - "step": 476500 - }, - { - 
"epoch": 3.101632095714936, - "grad_norm": 1.4921875, - "learning_rate": 0.0018759347161714027, - "loss": 1.4656, - "step": 477000 - }, - { - "epoch": 3.104883282398075, - "grad_norm": 0.73828125, - "learning_rate": 0.0018758046687040772, - "loss": 1.464, - "step": 477500 - }, - { - "epoch": 3.1081344690812145, - "grad_norm": 0.9140625, - "learning_rate": 0.0018756746212367514, - "loss": 1.461, - "step": 478000 - }, - { - "epoch": 3.1113856557643538, - "grad_norm": 2.328125, - "learning_rate": 0.0018755445737694259, - "loss": 1.4651, - "step": 478500 - }, - { - "epoch": 3.1146368424474935, - "grad_norm": 1.1328125, - "learning_rate": 0.0018754145263021004, - "loss": 1.4645, - "step": 479000 - }, - { - "epoch": 3.117888029130633, - "grad_norm": 0.62109375, - "learning_rate": 0.0018752844788347746, - "loss": 1.4721, - "step": 479500 - }, - { - "epoch": 3.121139215813772, - "grad_norm": 0.671875, - "learning_rate": 0.0018751544313674491, - "loss": 1.4656, - "step": 480000 - }, - { - "epoch": 3.1243904024969114, - "grad_norm": 0.90625, - "learning_rate": 0.0018750243839001236, - "loss": 1.464, - "step": 480500 - }, - { - "epoch": 3.1276415891800506, - "grad_norm": 0.88671875, - "learning_rate": 0.0018748943364327978, - "loss": 1.4671, - "step": 481000 - }, - { - "epoch": 3.13089277586319, - "grad_norm": 1.71875, - "learning_rate": 0.0018747642889654723, - "loss": 1.4645, - "step": 481500 - }, - { - "epoch": 3.134143962546329, - "grad_norm": 1.8359375, - "learning_rate": 0.0018746342414981468, - "loss": 1.4666, - "step": 482000 - }, - { - "epoch": 3.137395149229469, - "grad_norm": 0.74609375, - "learning_rate": 0.0018745041940308215, - "loss": 1.4619, - "step": 482500 - }, - { - "epoch": 3.1406463359126082, - "grad_norm": 0.765625, - "learning_rate": 0.0018743741465634958, - "loss": 1.4618, - "step": 483000 - }, - { - "epoch": 3.1438975225957475, - "grad_norm": 0.81640625, - "learning_rate": 0.0018742440990961702, - "loss": 1.4534, - "step": 483500 - }, - { - "epoch": 3.147148709278887, - "grad_norm": 0.671875, - "learning_rate": 0.0018741140516288447, - "loss": 1.4633, - "step": 484000 - }, - { - "epoch": 3.150399895962026, - "grad_norm": 0.59375, - "learning_rate": 0.001873984004161519, - "loss": 1.4596, - "step": 484500 - }, - { - "epoch": 3.1536510826451654, - "grad_norm": 1.9453125, - "learning_rate": 0.0018738539566941935, - "loss": 1.4652, - "step": 485000 - }, - { - "epoch": 3.1569022693283046, - "grad_norm": 0.98828125, - "learning_rate": 0.001873723909226868, - "loss": 1.4645, - "step": 485500 - }, - { - "epoch": 3.1601534560114444, - "grad_norm": 0.67578125, - "learning_rate": 0.0018735938617595422, - "loss": 1.4604, - "step": 486000 - }, - { - "epoch": 3.1634046426945837, - "grad_norm": 0.66796875, - "learning_rate": 0.0018734638142922167, - "loss": 1.4634, - "step": 486500 - }, - { - "epoch": 3.166655829377723, - "grad_norm": 0.83984375, - "learning_rate": 0.0018733337668248912, - "loss": 1.4572, - "step": 487000 - }, - { - "epoch": 3.1699070160608622, - "grad_norm": 1.1875, - "learning_rate": 0.0018732037193575654, - "loss": 1.4651, - "step": 487500 - }, - { - "epoch": 3.1731582027440015, - "grad_norm": 1.6328125, - "learning_rate": 0.00187307367189024, - "loss": 1.465, - "step": 488000 - }, - { - "epoch": 3.176409389427141, - "grad_norm": 0.84765625, - "learning_rate": 0.0018729436244229144, - "loss": 1.4643, - "step": 488500 - }, - { - "epoch": 3.17966057611028, - "grad_norm": 2.234375, - "learning_rate": 0.0018728135769555886, - "loss": 1.4622, - "step": 489000 - }, - { - 
"epoch": 3.1829117627934194, - "grad_norm": 1.015625, - "learning_rate": 0.0018726835294882631, - "loss": 1.4618, - "step": 489500 - }, - { - "epoch": 3.186162949476559, - "grad_norm": 4.8125, - "learning_rate": 0.0018725534820209378, - "loss": 1.4592, - "step": 490000 - }, - { - "epoch": 3.1894141361596984, - "grad_norm": 0.71875, - "learning_rate": 0.0018724234345536123, - "loss": 1.4562, - "step": 490500 - }, - { - "epoch": 3.1926653228428377, - "grad_norm": 0.671875, - "learning_rate": 0.0018722933870862866, - "loss": 1.4643, - "step": 491000 - }, - { - "epoch": 3.195916509525977, - "grad_norm": 0.6015625, - "learning_rate": 0.001872163339618961, - "loss": 1.4631, - "step": 491500 - }, - { - "epoch": 3.1991676962091162, - "grad_norm": 0.68359375, - "learning_rate": 0.0018720332921516355, - "loss": 1.4638, - "step": 492000 - }, - { - "epoch": 3.2024188828922555, - "grad_norm": 0.71484375, - "learning_rate": 0.0018719032446843098, - "loss": 1.4637, - "step": 492500 - }, - { - "epoch": 3.205670069575395, - "grad_norm": 0.6875, - "learning_rate": 0.0018717731972169842, - "loss": 1.4646, - "step": 493000 - }, - { - "epoch": 3.2089212562585345, - "grad_norm": 0.64453125, - "learning_rate": 0.0018716431497496587, - "loss": 1.4647, - "step": 493500 - }, - { - "epoch": 3.212172442941674, - "grad_norm": 4.71875, - "learning_rate": 0.001871513102282333, - "loss": 1.4617, - "step": 494000 - }, - { - "epoch": 3.215423629624813, - "grad_norm": 0.6953125, - "learning_rate": 0.0018713830548150075, - "loss": 1.4578, - "step": 494500 - }, - { - "epoch": 3.2186748163079524, - "grad_norm": 0.87890625, - "learning_rate": 0.001871253007347682, - "loss": 1.4573, - "step": 495000 - }, - { - "epoch": 3.2219260029910917, - "grad_norm": 0.61328125, - "learning_rate": 0.0018711229598803562, - "loss": 1.4608, - "step": 495500 - }, - { - "epoch": 3.225177189674231, - "grad_norm": 0.84765625, - "learning_rate": 0.0018709929124130307, - "loss": 1.4641, - "step": 496000 - }, - { - "epoch": 3.2284283763573702, - "grad_norm": 1.34375, - "learning_rate": 0.0018708628649457052, - "loss": 1.4638, - "step": 496500 - }, - { - "epoch": 3.23167956304051, - "grad_norm": 0.765625, - "learning_rate": 0.0018707328174783799, - "loss": 1.4615, - "step": 497000 - }, - { - "epoch": 3.2349307497236492, - "grad_norm": 1.21875, - "learning_rate": 0.0018706027700110541, - "loss": 1.4576, - "step": 497500 - }, - { - "epoch": 3.2381819364067885, - "grad_norm": 1.203125, - "learning_rate": 0.0018704727225437286, - "loss": 1.4655, - "step": 498000 - }, - { - "epoch": 3.241433123089928, - "grad_norm": 0.64453125, - "learning_rate": 0.001870342675076403, - "loss": 1.4625, - "step": 498500 - }, - { - "epoch": 3.244684309773067, - "grad_norm": 0.71484375, - "learning_rate": 0.0018702126276090773, - "loss": 1.4666, - "step": 499000 - }, - { - "epoch": 3.2479354964562064, - "grad_norm": 1.046875, - "learning_rate": 0.0018700825801417518, - "loss": 1.4697, - "step": 499500 - }, - { - "epoch": 3.2511866831393457, - "grad_norm": 0.7109375, - "learning_rate": 0.0018699525326744263, - "loss": 1.4659, - "step": 500000 - }, - { - "epoch": 3.2544378698224854, - "grad_norm": 1.0546875, - "learning_rate": 0.0018698224852071006, - "loss": 1.4683, - "step": 500500 - }, - { - "epoch": 3.2576890565056247, - "grad_norm": 0.60546875, - "learning_rate": 0.001869692437739775, - "loss": 1.4658, - "step": 501000 - }, - { - "epoch": 3.260940243188764, - "grad_norm": 0.859375, - "learning_rate": 0.0018695623902724495, - "loss": 1.4642, - "step": 501500 - }, - { - 
"epoch": 3.2641914298719032, - "grad_norm": 0.86328125, - "learning_rate": 0.0018694323428051238, - "loss": 1.4566, - "step": 502000 - }, - { - "epoch": 3.2674426165550425, - "grad_norm": 1.15625, - "learning_rate": 0.0018693022953377983, - "loss": 1.4618, - "step": 502500 - }, - { - "epoch": 3.270693803238182, - "grad_norm": 0.6484375, - "learning_rate": 0.0018691722478704727, - "loss": 1.4692, - "step": 503000 - }, - { - "epoch": 3.273944989921321, - "grad_norm": 0.7734375, - "learning_rate": 0.001869042200403147, - "loss": 1.4591, - "step": 503500 - }, - { - "epoch": 3.277196176604461, - "grad_norm": 0.58984375, - "learning_rate": 0.0018689121529358215, - "loss": 1.4591, - "step": 504000 - }, - { - "epoch": 3.2804473632876, - "grad_norm": 0.796875, - "learning_rate": 0.0018687821054684962, - "loss": 1.4616, - "step": 504500 - }, - { - "epoch": 3.2836985499707394, - "grad_norm": 1.296875, - "learning_rate": 0.0018686520580011706, - "loss": 1.4581, - "step": 505000 - }, - { - "epoch": 3.2869497366538787, - "grad_norm": 0.87890625, - "learning_rate": 0.001868522010533845, - "loss": 1.4645, - "step": 505500 - }, - { - "epoch": 3.290200923337018, - "grad_norm": 0.6875, - "learning_rate": 0.0018683919630665194, - "loss": 1.468, - "step": 506000 - }, - { - "epoch": 3.2934521100201573, - "grad_norm": 1.2734375, - "learning_rate": 0.0018682619155991939, - "loss": 1.4623, - "step": 506500 - }, - { - "epoch": 3.2967032967032965, - "grad_norm": 0.6015625, - "learning_rate": 0.0018681318681318681, - "loss": 1.4576, - "step": 507000 - }, - { - "epoch": 3.2999544833864363, - "grad_norm": 0.796875, - "learning_rate": 0.0018680018206645426, - "loss": 1.4624, - "step": 507500 - }, - { - "epoch": 3.3032056700695756, - "grad_norm": 0.94921875, - "learning_rate": 0.001867871773197217, - "loss": 1.4646, - "step": 508000 - }, - { - "epoch": 3.306456856752715, - "grad_norm": 1.0234375, - "learning_rate": 0.0018677417257298913, - "loss": 1.4571, - "step": 508500 - }, - { - "epoch": 3.309708043435854, - "grad_norm": 0.734375, - "learning_rate": 0.0018676116782625658, - "loss": 1.4602, - "step": 509000 - }, - { - "epoch": 3.3129592301189934, - "grad_norm": 0.67578125, - "learning_rate": 0.0018674816307952403, - "loss": 1.4644, - "step": 509500 - }, - { - "epoch": 3.3162104168021327, - "grad_norm": 0.69140625, - "learning_rate": 0.0018673515833279146, - "loss": 1.4602, - "step": 510000 - }, - { - "epoch": 3.319461603485272, - "grad_norm": 0.78515625, - "learning_rate": 0.001867221535860589, - "loss": 1.4605, - "step": 510500 - }, - { - "epoch": 3.3227127901684117, - "grad_norm": 8.0625, - "learning_rate": 0.0018670914883932635, - "loss": 1.4608, - "step": 511000 - }, - { - "epoch": 3.325963976851551, - "grad_norm": 0.80859375, - "learning_rate": 0.0018669614409259382, - "loss": 1.4653, - "step": 511500 - }, - { - "epoch": 3.3292151635346903, - "grad_norm": 1.2578125, - "learning_rate": 0.0018668313934586125, - "loss": 1.4617, - "step": 512000 - }, - { - "epoch": 3.3324663502178296, - "grad_norm": 0.85546875, - "learning_rate": 0.001866701345991287, - "loss": 1.4624, - "step": 512500 - }, - { - "epoch": 3.335717536900969, - "grad_norm": 0.69921875, - "learning_rate": 0.0018665712985239614, - "loss": 1.4737, - "step": 513000 - }, - { - "epoch": 3.338968723584108, - "grad_norm": 0.609375, - "learning_rate": 0.0018664412510566357, - "loss": 1.4689, - "step": 513500 - }, - { - "epoch": 3.3422199102672474, - "grad_norm": 0.8828125, - "learning_rate": 0.0018663112035893102, - "loss": 1.4679, - "step": 514000 - }, - { - 
"epoch": 3.345471096950387, - "grad_norm": 0.8984375, - "learning_rate": 0.0018661811561219847, - "loss": 1.4666, - "step": 514500 - }, - { - "epoch": 3.3487222836335264, - "grad_norm": 0.70703125, - "learning_rate": 0.001866051108654659, - "loss": 1.4707, - "step": 515000 - }, - { - "epoch": 3.3519734703166657, - "grad_norm": 0.80859375, - "learning_rate": 0.0018659210611873334, - "loss": 1.4658, - "step": 515500 - }, - { - "epoch": 3.355224656999805, - "grad_norm": 1.2265625, - "learning_rate": 0.0018657910137200079, - "loss": 1.4621, - "step": 516000 - }, - { - "epoch": 3.3584758436829443, - "grad_norm": 1.71875, - "learning_rate": 0.0018656609662526821, - "loss": 1.4627, - "step": 516500 - }, - { - "epoch": 3.3617270303660836, - "grad_norm": 2.28125, - "learning_rate": 0.0018655309187853566, - "loss": 1.466, - "step": 517000 - }, - { - "epoch": 3.364978217049223, - "grad_norm": 2.1875, - "learning_rate": 0.001865400871318031, - "loss": 1.4665, - "step": 517500 - }, - { - "epoch": 3.368229403732362, - "grad_norm": 0.8125, - "learning_rate": 0.0018652708238507054, - "loss": 1.4685, - "step": 518000 - }, - { - "epoch": 3.3714805904155014, - "grad_norm": 0.94140625, - "learning_rate": 0.0018651407763833798, - "loss": 1.4629, - "step": 518500 - }, - { - "epoch": 3.374731777098641, - "grad_norm": 0.73828125, - "learning_rate": 0.0018650107289160545, - "loss": 1.4591, - "step": 519000 - }, - { - "epoch": 3.3779829637817804, - "grad_norm": 0.79296875, - "learning_rate": 0.001864880681448729, - "loss": 1.4592, - "step": 519500 - }, - { - "epoch": 3.3812341504649197, - "grad_norm": 0.77734375, - "learning_rate": 0.0018647506339814033, - "loss": 1.4595, - "step": 520000 - }, - { - "epoch": 3.384485337148059, - "grad_norm": 1.25, - "learning_rate": 0.0018646205865140777, - "loss": 1.454, - "step": 520500 - }, - { - "epoch": 3.3877365238311983, - "grad_norm": 0.796875, - "learning_rate": 0.0018644905390467522, - "loss": 1.4567, - "step": 521000 - }, - { - "epoch": 3.3909877105143376, - "grad_norm": 0.8046875, - "learning_rate": 0.0018643604915794265, - "loss": 1.4625, - "step": 521500 - }, - { - "epoch": 3.394238897197477, - "grad_norm": 0.91015625, - "learning_rate": 0.001864230444112101, - "loss": 1.4636, - "step": 522000 - }, - { - "epoch": 3.3974900838806166, - "grad_norm": 1.1875, - "learning_rate": 0.0018641003966447754, - "loss": 1.4584, - "step": 522500 - }, - { - "epoch": 3.400741270563756, - "grad_norm": 12.1875, - "learning_rate": 0.0018639703491774497, - "loss": 1.4628, - "step": 523000 - }, - { - "epoch": 3.403992457246895, - "grad_norm": 0.69140625, - "learning_rate": 0.0018638403017101242, - "loss": 1.4558, - "step": 523500 - }, - { - "epoch": 3.4072436439300344, - "grad_norm": 0.79296875, - "learning_rate": 0.0018637102542427987, - "loss": 1.4548, - "step": 524000 - }, - { - "epoch": 3.4104948306131737, - "grad_norm": 0.92578125, - "learning_rate": 0.001863580206775473, - "loss": 1.4562, - "step": 524500 - }, - { - "epoch": 3.413746017296313, - "grad_norm": 1.640625, - "learning_rate": 0.0018634501593081474, - "loss": 1.4587, - "step": 525000 - }, - { - "epoch": 3.4169972039794523, - "grad_norm": 0.65625, - "learning_rate": 0.0018633201118408219, - "loss": 1.4576, - "step": 525500 - }, - { - "epoch": 3.420248390662592, - "grad_norm": 1.4140625, - "learning_rate": 0.0018631900643734966, - "loss": 1.4567, - "step": 526000 - }, - { - "epoch": 3.4234995773457313, - "grad_norm": 1.1953125, - "learning_rate": 0.0018630600169061708, - "loss": 1.4547, - "step": 526500 - }, - { - "epoch": 
3.4267507640288706, - "grad_norm": 0.84375, - "learning_rate": 0.0018629299694388453, - "loss": 1.4625, - "step": 527000 - }, - { - "epoch": 3.43000195071201, - "grad_norm": 0.76953125, - "learning_rate": 0.0018627999219715198, - "loss": 1.4562, - "step": 527500 - }, - { - "epoch": 3.433253137395149, - "grad_norm": 0.7109375, - "learning_rate": 0.001862669874504194, - "loss": 1.4553, - "step": 528000 - }, - { - "epoch": 3.4365043240782884, - "grad_norm": 0.74609375, - "learning_rate": 0.0018625398270368685, - "loss": 1.4575, - "step": 528500 - }, - { - "epoch": 3.4397555107614277, - "grad_norm": 0.66796875, - "learning_rate": 0.001862409779569543, - "loss": 1.4557, - "step": 529000 - }, - { - "epoch": 3.4430066974445674, - "grad_norm": 7.375, - "learning_rate": 0.0018622797321022173, - "loss": 1.4513, - "step": 529500 - }, - { - "epoch": 3.4462578841277067, - "grad_norm": 0.67578125, - "learning_rate": 0.0018621496846348918, - "loss": 1.4547, - "step": 530000 - }, - { - "epoch": 3.449509070810846, - "grad_norm": 0.734375, - "learning_rate": 0.0018620196371675662, - "loss": 1.4489, - "step": 530500 - }, - { - "epoch": 3.4527602574939853, - "grad_norm": 0.703125, - "learning_rate": 0.0018618895897002405, - "loss": 1.4588, - "step": 531000 - }, - { - "epoch": 3.4560114441771246, - "grad_norm": 1.6484375, - "learning_rate": 0.001861759542232915, - "loss": 1.4543, - "step": 531500 - }, - { - "epoch": 3.459262630860264, - "grad_norm": 0.984375, - "learning_rate": 0.0018616294947655895, - "loss": 1.4538, - "step": 532000 - }, - { - "epoch": 3.462513817543403, - "grad_norm": 1.8515625, - "learning_rate": 0.0018614994472982637, - "loss": 1.4574, - "step": 532500 - }, - { - "epoch": 3.465765004226543, - "grad_norm": 0.84765625, - "learning_rate": 0.0018613693998309382, - "loss": 1.4542, - "step": 533000 - }, - { - "epoch": 3.469016190909682, - "grad_norm": 0.8515625, - "learning_rate": 0.0018612393523636129, - "loss": 1.4478, - "step": 533500 - }, - { - "epoch": 3.4722673775928214, - "grad_norm": 0.80078125, - "learning_rate": 0.0018611093048962874, - "loss": 1.4491, - "step": 534000 - }, - { - "epoch": 3.4755185642759607, - "grad_norm": 0.66796875, - "learning_rate": 0.0018609792574289616, - "loss": 1.4509, - "step": 534500 - }, - { - "epoch": 3.4787697509591, - "grad_norm": 1.328125, - "learning_rate": 0.001860849209961636, - "loss": 1.4515, - "step": 535000 - }, - { - "epoch": 3.4820209376422393, - "grad_norm": 1.4296875, - "learning_rate": 0.0018607191624943106, - "loss": 1.4496, - "step": 535500 - }, - { - "epoch": 3.4852721243253786, - "grad_norm": 0.625, - "learning_rate": 0.0018605891150269848, - "loss": 1.4531, - "step": 536000 - }, - { - "epoch": 3.4885233110085183, - "grad_norm": 0.66015625, - "learning_rate": 0.0018604590675596593, - "loss": 1.4456, - "step": 536500 - }, - { - "epoch": 3.4917744976916576, - "grad_norm": 0.9609375, - "learning_rate": 0.0018603290200923338, - "loss": 1.445, - "step": 537000 - }, - { - "epoch": 3.495025684374797, - "grad_norm": 0.61328125, - "learning_rate": 0.001860198972625008, - "loss": 1.4524, - "step": 537500 - }, - { - "epoch": 3.498276871057936, - "grad_norm": 0.68359375, - "learning_rate": 0.0018600689251576825, - "loss": 1.4515, - "step": 538000 - }, - { - "epoch": 3.5015280577410755, - "grad_norm": 0.703125, - "learning_rate": 0.001859938877690357, - "loss": 1.452, - "step": 538500 - }, - { - "epoch": 3.5047792444242147, - "grad_norm": 0.984375, - "learning_rate": 0.0018598088302230313, - "loss": 1.4509, - "step": 539000 - }, - { - "epoch": 
3.508030431107354, - "grad_norm": 0.7890625, - "learning_rate": 0.0018596787827557058, - "loss": 1.4488, - "step": 539500 - }, - { - "epoch": 3.5112816177904937, - "grad_norm": 0.81640625, - "learning_rate": 0.0018595487352883802, - "loss": 1.4503, - "step": 540000 - }, - { - "epoch": 3.514532804473633, - "grad_norm": 1.0703125, - "learning_rate": 0.001859418687821055, - "loss": 1.4458, - "step": 540500 - }, - { - "epoch": 3.5177839911567723, - "grad_norm": 3.40625, - "learning_rate": 0.0018592886403537292, - "loss": 1.448, - "step": 541000 - }, - { - "epoch": 3.5210351778399116, - "grad_norm": 0.953125, - "learning_rate": 0.0018591585928864037, - "loss": 1.4465, - "step": 541500 - }, - { - "epoch": 3.524286364523051, - "grad_norm": 0.95703125, - "learning_rate": 0.0018590285454190782, - "loss": 1.4497, - "step": 542000 - }, - { - "epoch": 3.52753755120619, - "grad_norm": 0.9609375, - "learning_rate": 0.0018588984979517524, - "loss": 1.4566, - "step": 542500 - }, - { - "epoch": 3.5307887378893295, - "grad_norm": 0.94921875, - "learning_rate": 0.001858768450484427, - "loss": 1.446, - "step": 543000 - }, - { - "epoch": 3.534039924572469, - "grad_norm": 0.8046875, - "learning_rate": 0.0018586384030171014, - "loss": 1.4494, - "step": 543500 - }, - { - "epoch": 3.537291111255608, - "grad_norm": 1.546875, - "learning_rate": 0.0018585083555497756, - "loss": 1.4507, - "step": 544000 - }, - { - "epoch": 3.5405422979387478, - "grad_norm": 0.84765625, - "learning_rate": 0.0018583783080824501, - "loss": 1.4519, - "step": 544500 - }, - { - "epoch": 3.543793484621887, - "grad_norm": 0.9375, - "learning_rate": 0.0018582482606151246, - "loss": 1.4519, - "step": 545000 - }, - { - "epoch": 3.5470446713050263, - "grad_norm": 0.78125, - "learning_rate": 0.0018581182131477989, - "loss": 1.4518, - "step": 545500 - }, - { - "epoch": 3.5502958579881656, - "grad_norm": 1.1171875, - "learning_rate": 0.0018579881656804733, - "loss": 1.4468, - "step": 546000 - }, - { - "epoch": 3.553547044671305, - "grad_norm": 1.4296875, - "learning_rate": 0.0018578581182131478, - "loss": 1.4491, - "step": 546500 - }, - { - "epoch": 3.5567982313544446, - "grad_norm": 0.6953125, - "learning_rate": 0.001857728070745822, - "loss": 1.4475, - "step": 547000 - }, - { - "epoch": 3.5600494180375835, - "grad_norm": 0.85546875, - "learning_rate": 0.0018575980232784966, - "loss": 1.4443, - "step": 547500 - }, - { - "epoch": 3.563300604720723, - "grad_norm": 0.59765625, - "learning_rate": 0.0018574679758111712, - "loss": 1.4528, - "step": 548000 - }, - { - "epoch": 3.5665517914038625, - "grad_norm": 0.671875, - "learning_rate": 0.0018573379283438457, - "loss": 1.4504, - "step": 548500 - }, - { - "epoch": 3.5698029780870018, - "grad_norm": 0.8046875, - "learning_rate": 0.00185720788087652, - "loss": 1.4515, - "step": 549000 - }, - { - "epoch": 3.573054164770141, - "grad_norm": 0.91015625, - "learning_rate": 0.0018570778334091945, - "loss": 1.4514, - "step": 549500 - }, - { - "epoch": 3.5763053514532803, - "grad_norm": 0.78125, - "learning_rate": 0.001856947785941869, - "loss": 1.4486, - "step": 550000 - }, - { - "epoch": 3.57955653813642, - "grad_norm": 0.62109375, - "learning_rate": 0.0018568177384745432, - "loss": 1.4558, - "step": 550500 - }, - { - "epoch": 3.582807724819559, - "grad_norm": 0.828125, - "learning_rate": 0.0018566876910072177, - "loss": 1.4568, - "step": 551000 - }, - { - "epoch": 3.5860589115026986, - "grad_norm": 0.75, - "learning_rate": 0.0018565576435398922, - "loss": 1.4611, - "step": 551500 - }, - { - "epoch": 
3.589310098185838, - "grad_norm": 0.8515625, - "learning_rate": 0.0018564275960725664, - "loss": 1.4546, - "step": 552000 - }, - { - "epoch": 3.592561284868977, - "grad_norm": 0.75390625, - "learning_rate": 0.001856297548605241, - "loss": 1.4468, - "step": 552500 - }, - { - "epoch": 3.5958124715521165, - "grad_norm": 0.6953125, - "learning_rate": 0.0018561675011379154, - "loss": 1.4537, - "step": 553000 - }, - { - "epoch": 3.5990636582352558, - "grad_norm": 2.21875, - "learning_rate": 0.0018560374536705896, - "loss": 1.4525, - "step": 553500 - }, - { - "epoch": 3.6023148449183955, - "grad_norm": 0.72265625, - "learning_rate": 0.0018559074062032641, - "loss": 1.4554, - "step": 554000 - }, - { - "epoch": 3.6055660316015343, - "grad_norm": 0.81640625, - "learning_rate": 0.0018557773587359386, - "loss": 1.4624, - "step": 554500 - }, - { - "epoch": 3.608817218284674, - "grad_norm": 0.83984375, - "learning_rate": 0.0018556473112686133, - "loss": 1.45, - "step": 555000 - }, - { - "epoch": 3.6120684049678133, - "grad_norm": 0.91015625, - "learning_rate": 0.0018555172638012876, - "loss": 1.4379, - "step": 555500 - }, - { - "epoch": 3.6153195916509526, - "grad_norm": 0.734375, - "learning_rate": 0.001855387216333962, - "loss": 1.4456, - "step": 556000 - }, - { - "epoch": 3.618570778334092, - "grad_norm": 1.546875, - "learning_rate": 0.0018552571688666365, - "loss": 1.4477, - "step": 556500 - }, - { - "epoch": 3.621821965017231, - "grad_norm": 1.0390625, - "learning_rate": 0.0018551271213993108, - "loss": 1.4479, - "step": 557000 - }, - { - "epoch": 3.6250731517003705, - "grad_norm": 0.69921875, - "learning_rate": 0.0018549970739319853, - "loss": 1.4461, - "step": 557500 - }, - { - "epoch": 3.6283243383835098, - "grad_norm": 0.578125, - "learning_rate": 0.0018548670264646597, - "loss": 1.4451, - "step": 558000 - }, - { - "epoch": 3.6315755250666495, - "grad_norm": 0.8359375, - "learning_rate": 0.001854736978997334, - "loss": 1.4457, - "step": 558500 - }, - { - "epoch": 3.6348267117497888, - "grad_norm": 0.984375, - "learning_rate": 0.0018546069315300085, - "loss": 1.4474, - "step": 559000 - }, - { - "epoch": 3.638077898432928, - "grad_norm": 0.875, - "learning_rate": 0.001854476884062683, - "loss": 1.4448, - "step": 559500 - }, - { - "epoch": 3.6413290851160673, - "grad_norm": 0.7421875, - "learning_rate": 0.0018543468365953572, - "loss": 1.4462, - "step": 560000 - }, - { - "epoch": 3.6445802717992066, - "grad_norm": 1.0390625, - "learning_rate": 0.0018542167891280317, - "loss": 1.446, - "step": 560500 - }, - { - "epoch": 3.647831458482346, - "grad_norm": 1.3359375, - "learning_rate": 0.0018540867416607062, - "loss": 1.4452, - "step": 561000 - }, - { - "epoch": 3.651082645165485, - "grad_norm": 0.609375, - "learning_rate": 0.0018539566941933804, - "loss": 1.4469, - "step": 561500 - }, - { - "epoch": 3.654333831848625, - "grad_norm": 0.74609375, - "learning_rate": 0.001853826646726055, - "loss": 1.4522, - "step": 562000 - }, - { - "epoch": 3.657585018531764, - "grad_norm": 1.1484375, - "learning_rate": 0.0018536965992587296, - "loss": 1.4482, - "step": 562500 - }, - { - "epoch": 3.6608362052149035, - "grad_norm": 0.765625, - "learning_rate": 0.001853566551791404, - "loss": 1.4472, - "step": 563000 - }, - { - "epoch": 3.664087391898043, - "grad_norm": 0.64453125, - "learning_rate": 0.0018534365043240783, - "loss": 1.443, - "step": 563500 - }, - { - "epoch": 3.667338578581182, - "grad_norm": 0.84375, - "learning_rate": 0.0018533064568567528, - "loss": 1.4432, - "step": 564000 - }, - { - "epoch": 
3.6705897652643213, - "grad_norm": 0.5703125, - "learning_rate": 0.0018531764093894273, - "loss": 1.4502, - "step": 564500 - }, - { - "epoch": 3.6738409519474606, - "grad_norm": 0.9765625, - "learning_rate": 0.0018530463619221016, - "loss": 1.4471, - "step": 565000 - }, - { - "epoch": 3.6770921386306004, - "grad_norm": 0.76953125, - "learning_rate": 0.001852916314454776, - "loss": 1.4464, - "step": 565500 - }, - { - "epoch": 3.6803433253137396, - "grad_norm": 0.96484375, - "learning_rate": 0.0018527862669874505, - "loss": 1.4476, - "step": 566000 - }, - { - "epoch": 3.683594511996879, - "grad_norm": 0.546875, - "learning_rate": 0.0018526562195201248, - "loss": 1.4494, - "step": 566500 - }, - { - "epoch": 3.686845698680018, - "grad_norm": 0.734375, - "learning_rate": 0.0018525261720527993, - "loss": 1.4483, - "step": 567000 - }, - { - "epoch": 3.6900968853631575, - "grad_norm": 0.65625, - "learning_rate": 0.0018523961245854737, - "loss": 1.4481, - "step": 567500 - }, - { - "epoch": 3.693348072046297, - "grad_norm": 0.828125, - "learning_rate": 0.001852266077118148, - "loss": 1.4484, - "step": 568000 - }, - { - "epoch": 3.696599258729436, - "grad_norm": 1.015625, - "learning_rate": 0.0018521360296508225, - "loss": 1.4434, - "step": 568500 - }, - { - "epoch": 3.699850445412576, - "grad_norm": 0.6796875, - "learning_rate": 0.001852005982183497, - "loss": 1.4446, - "step": 569000 - }, - { - "epoch": 3.703101632095715, - "grad_norm": 1.0703125, - "learning_rate": 0.0018518759347161717, - "loss": 1.443, - "step": 569500 - }, - { - "epoch": 3.7063528187788544, - "grad_norm": 0.53515625, - "learning_rate": 0.001851745887248846, - "loss": 1.4453, - "step": 570000 - }, - { - "epoch": 3.7096040054619936, - "grad_norm": 1.21875, - "learning_rate": 0.0018516158397815204, - "loss": 1.4477, - "step": 570500 - }, - { - "epoch": 3.712855192145133, - "grad_norm": 1.4140625, - "learning_rate": 0.0018514857923141949, - "loss": 1.441, - "step": 571000 - }, - { - "epoch": 3.716106378828272, - "grad_norm": 0.7578125, - "learning_rate": 0.0018513557448468691, - "loss": 1.4444, - "step": 571500 - }, - { - "epoch": 3.7193575655114115, - "grad_norm": 1.4140625, - "learning_rate": 0.0018512256973795436, - "loss": 1.4424, - "step": 572000 - }, - { - "epoch": 3.7226087521945512, - "grad_norm": 0.83203125, - "learning_rate": 0.001851095649912218, - "loss": 1.4451, - "step": 572500 - }, - { - "epoch": 3.7258599388776905, - "grad_norm": 0.71484375, - "learning_rate": 0.0018509656024448924, - "loss": 1.4453, - "step": 573000 - }, - { - "epoch": 3.72911112556083, - "grad_norm": 0.6875, - "learning_rate": 0.0018508355549775668, - "loss": 1.446, - "step": 573500 - }, - { - "epoch": 3.732362312243969, - "grad_norm": 0.8828125, - "learning_rate": 0.0018507055075102413, - "loss": 1.4498, - "step": 574000 - }, - { - "epoch": 3.7356134989271084, - "grad_norm": 0.703125, - "learning_rate": 0.0018505754600429156, - "loss": 1.4429, - "step": 574500 - }, - { - "epoch": 3.7388646856102477, - "grad_norm": 0.7578125, - "learning_rate": 0.00185044541257559, - "loss": 1.4459, - "step": 575000 - }, - { - "epoch": 3.742115872293387, - "grad_norm": 0.57421875, - "learning_rate": 0.0018503153651082645, - "loss": 1.4468, - "step": 575500 - }, - { - "epoch": 3.7453670589765267, - "grad_norm": 0.73046875, - "learning_rate": 0.0018501853176409388, - "loss": 1.4479, - "step": 576000 - }, - { - "epoch": 3.7486182456596655, - "grad_norm": 0.84375, - "learning_rate": 0.0018500552701736133, - "loss": 1.4447, - "step": 576500 - }, - { - "epoch": 
3.7518694323428052, - "grad_norm": 7.25, - "learning_rate": 0.001849925222706288, - "loss": 1.447, - "step": 577000 - }, - { - "epoch": 3.7551206190259445, - "grad_norm": 1.265625, - "learning_rate": 0.0018497951752389624, - "loss": 1.443, - "step": 577500 - }, - { - "epoch": 3.758371805709084, - "grad_norm": 0.828125, - "learning_rate": 0.0018496651277716367, - "loss": 1.4386, - "step": 578000 - }, - { - "epoch": 3.761622992392223, - "grad_norm": 3.828125, - "learning_rate": 0.0018495350803043112, - "loss": 1.4434, - "step": 578500 - }, - { - "epoch": 3.7648741790753624, - "grad_norm": 0.86328125, - "learning_rate": 0.0018494050328369857, - "loss": 1.4513, - "step": 579000 - }, - { - "epoch": 3.768125365758502, - "grad_norm": 0.609375, - "learning_rate": 0.00184927498536966, - "loss": 1.444, - "step": 579500 - }, - { - "epoch": 3.771376552441641, - "grad_norm": 0.7265625, - "learning_rate": 0.0018491449379023344, - "loss": 1.444, - "step": 580000 - }, - { - "epoch": 3.7746277391247807, - "grad_norm": 1.0703125, - "learning_rate": 0.0018490148904350089, - "loss": 1.449, - "step": 580500 - }, - { - "epoch": 3.77787892580792, - "grad_norm": 0.69921875, - "learning_rate": 0.0018488848429676831, - "loss": 1.447, - "step": 581000 - }, - { - "epoch": 3.7811301124910592, - "grad_norm": 0.91015625, - "learning_rate": 0.0018487547955003576, - "loss": 1.4415, - "step": 581500 - }, - { - "epoch": 3.7843812991741985, - "grad_norm": 0.75390625, - "learning_rate": 0.001848624748033032, - "loss": 1.443, - "step": 582000 - }, - { - "epoch": 3.787632485857338, - "grad_norm": 0.69140625, - "learning_rate": 0.0018484947005657064, - "loss": 1.4442, - "step": 582500 - }, - { - "epoch": 3.7908836725404775, - "grad_norm": 0.8046875, - "learning_rate": 0.0018483646530983808, - "loss": 1.4434, - "step": 583000 - }, - { - "epoch": 3.7941348592236164, - "grad_norm": 0.9140625, - "learning_rate": 0.0018482346056310553, - "loss": 1.4487, - "step": 583500 - }, - { - "epoch": 3.797386045906756, - "grad_norm": 0.703125, - "learning_rate": 0.00184810455816373, - "loss": 1.4402, - "step": 584000 - }, - { - "epoch": 3.8006372325898954, - "grad_norm": 0.5546875, - "learning_rate": 0.0018479745106964043, - "loss": 1.4463, - "step": 584500 - }, - { - "epoch": 3.8038884192730347, - "grad_norm": 0.53125, - "learning_rate": 0.0018478444632290788, - "loss": 1.4456, - "step": 585000 - }, - { - "epoch": 3.807139605956174, - "grad_norm": 0.66796875, - "learning_rate": 0.0018477144157617532, - "loss": 1.4485, - "step": 585500 - }, - { - "epoch": 3.8103907926393132, - "grad_norm": 1.6796875, - "learning_rate": 0.0018475843682944275, - "loss": 1.4514, - "step": 586000 - }, - { - "epoch": 3.813641979322453, - "grad_norm": 0.84765625, - "learning_rate": 0.001847454320827102, - "loss": 1.4494, - "step": 586500 - }, - { - "epoch": 3.816893166005592, - "grad_norm": 0.703125, - "learning_rate": 0.0018473242733597765, - "loss": 1.4423, - "step": 587000 - }, - { - "epoch": 3.8201443526887315, - "grad_norm": 0.9765625, - "learning_rate": 0.0018471942258924507, - "loss": 1.454, - "step": 587500 - }, - { - "epoch": 3.823395539371871, - "grad_norm": 0.78515625, - "learning_rate": 0.0018470641784251252, - "loss": 1.4479, - "step": 588000 - }, - { - "epoch": 3.82664672605501, - "grad_norm": 0.79296875, - "learning_rate": 0.0018469341309577997, - "loss": 1.449, - "step": 588500 - }, - { - "epoch": 3.8298979127381494, - "grad_norm": 0.75390625, - "learning_rate": 0.001846804083490474, - "loss": 1.4458, - "step": 589000 - }, - { - "epoch": 
3.8331490994212887, - "grad_norm": 0.65234375, - "learning_rate": 0.0018466740360231484, - "loss": 1.4449, - "step": 589500 - }, - { - "epoch": 3.836400286104428, - "grad_norm": 0.71875, - "learning_rate": 0.0018465439885558229, - "loss": 1.4478, - "step": 590000 - }, - { - "epoch": 3.8396514727875672, - "grad_norm": 0.75390625, - "learning_rate": 0.0018464139410884972, - "loss": 1.4433, - "step": 590500 - }, - { - "epoch": 3.842902659470707, - "grad_norm": 0.62890625, - "learning_rate": 0.0018462838936211716, - "loss": 1.442, - "step": 591000 - }, - { - "epoch": 3.8461538461538463, - "grad_norm": 7.4375, - "learning_rate": 0.0018461538461538463, - "loss": 1.4469, - "step": 591500 - }, - { - "epoch": 3.8494050328369855, - "grad_norm": 0.64453125, - "learning_rate": 0.0018460237986865208, - "loss": 1.4432, - "step": 592000 - }, - { - "epoch": 3.852656219520125, - "grad_norm": 0.70703125, - "learning_rate": 0.001845893751219195, - "loss": 1.4445, - "step": 592500 - }, - { - "epoch": 3.855907406203264, - "grad_norm": 0.84375, - "learning_rate": 0.0018457637037518695, - "loss": 1.4427, - "step": 593000 - }, - { - "epoch": 3.8591585928864034, - "grad_norm": 0.796875, - "learning_rate": 0.001845633656284544, - "loss": 1.4461, - "step": 593500 - }, - { - "epoch": 3.8624097795695427, - "grad_norm": 0.859375, - "learning_rate": 0.0018455036088172183, - "loss": 1.4441, - "step": 594000 - }, - { - "epoch": 3.8656609662526824, - "grad_norm": 0.66796875, - "learning_rate": 0.0018453735613498928, - "loss": 1.4397, - "step": 594500 - }, - { - "epoch": 3.8689121529358217, - "grad_norm": 0.5859375, - "learning_rate": 0.0018452435138825672, - "loss": 1.4462, - "step": 595000 - }, - { - "epoch": 3.872163339618961, - "grad_norm": 0.6328125, - "learning_rate": 0.0018451134664152415, - "loss": 1.4442, - "step": 595500 - }, - { - "epoch": 3.8754145263021003, - "grad_norm": 0.75390625, - "learning_rate": 0.001844983418947916, - "loss": 1.4517, - "step": 596000 - }, - { - "epoch": 3.8786657129852395, - "grad_norm": 1.359375, - "learning_rate": 0.0018448533714805905, - "loss": 1.4499, - "step": 596500 - }, - { - "epoch": 3.881916899668379, - "grad_norm": 0.8671875, - "learning_rate": 0.0018447233240132647, - "loss": 1.4482, - "step": 597000 - }, - { - "epoch": 3.885168086351518, - "grad_norm": 1.4375, - "learning_rate": 0.0018445932765459392, - "loss": 1.4522, - "step": 597500 - }, - { - "epoch": 3.888419273034658, - "grad_norm": 0.90625, - "learning_rate": 0.0018444632290786137, - "loss": 1.4533, - "step": 598000 - }, - { - "epoch": 3.891670459717797, - "grad_norm": 1.75, - "learning_rate": 0.0018443331816112884, - "loss": 1.4641, - "step": 598500 - }, - { - "epoch": 3.8949216464009364, - "grad_norm": 0.62890625, - "learning_rate": 0.0018442031341439626, - "loss": 1.4639, - "step": 599000 - }, - { - "epoch": 3.8981728330840757, - "grad_norm": 0.71875, - "learning_rate": 0.0018440730866766371, - "loss": 1.4723, - "step": 599500 - }, - { - "epoch": 3.901424019767215, - "grad_norm": 0.5703125, - "learning_rate": 0.0018439430392093116, - "loss": 1.47, - "step": 600000 - }, - { - "epoch": 3.9046752064503543, - "grad_norm": 0.734375, - "learning_rate": 0.0018438129917419859, - "loss": 1.4575, - "step": 600500 - }, - { - "epoch": 3.9079263931334935, - "grad_norm": 1.3046875, - "learning_rate": 0.0018436829442746603, - "loss": 1.4491, - "step": 601000 - }, - { - "epoch": 3.9111775798166333, - "grad_norm": 0.60546875, - "learning_rate": 0.0018435528968073348, - "loss": 1.4543, - "step": 601500 - }, - { - "epoch": 
3.9144287664997726, - "grad_norm": 1.09375, - "learning_rate": 0.001843422849340009, - "loss": 1.4517, - "step": 602000 - }, - { - "epoch": 3.917679953182912, - "grad_norm": 0.65625, - "learning_rate": 0.0018432928018726836, - "loss": 1.4506, - "step": 602500 - }, - { - "epoch": 3.920931139866051, - "grad_norm": 0.625, - "learning_rate": 0.001843162754405358, - "loss": 1.4549, - "step": 603000 - }, - { - "epoch": 3.9241823265491904, - "grad_norm": 1.7578125, - "learning_rate": 0.0018430327069380323, - "loss": 1.4574, - "step": 603500 - }, - { - "epoch": 3.9274335132323297, - "grad_norm": 1.671875, - "learning_rate": 0.0018429026594707068, - "loss": 1.4556, - "step": 604000 - }, - { - "epoch": 3.930684699915469, - "grad_norm": 1.0234375, - "learning_rate": 0.0018427726120033812, - "loss": 1.4504, - "step": 604500 - }, - { - "epoch": 3.9339358865986087, - "grad_norm": 0.65625, - "learning_rate": 0.0018426425645360555, - "loss": 1.4489, - "step": 605000 - }, - { - "epoch": 3.937187073281748, - "grad_norm": 1.453125, - "learning_rate": 0.00184251251706873, - "loss": 1.4473, - "step": 605500 - }, - { - "epoch": 3.9404382599648873, - "grad_norm": 1.0234375, - "learning_rate": 0.0018423824696014047, - "loss": 1.4407, - "step": 606000 - }, - { - "epoch": 3.9436894466480266, - "grad_norm": 0.765625, - "learning_rate": 0.0018422524221340792, - "loss": 1.4481, - "step": 606500 - }, - { - "epoch": 3.946940633331166, - "grad_norm": 0.66796875, - "learning_rate": 0.0018421223746667534, - "loss": 1.4457, - "step": 607000 - }, - { - "epoch": 3.950191820014305, - "grad_norm": 1.0, - "learning_rate": 0.001841992327199428, - "loss": 1.4491, - "step": 607500 - }, - { - "epoch": 3.9534430066974444, - "grad_norm": 0.82421875, - "learning_rate": 0.0018418622797321024, - "loss": 1.443, - "step": 608000 - }, - { - "epoch": 3.956694193380584, - "grad_norm": 1.0390625, - "learning_rate": 0.0018417322322647766, - "loss": 1.4424, - "step": 608500 - }, - { - "epoch": 3.959945380063723, - "grad_norm": 0.6640625, - "learning_rate": 0.0018416021847974511, - "loss": 1.4371, - "step": 609000 - }, - { - "epoch": 3.9631965667468627, - "grad_norm": 1.015625, - "learning_rate": 0.0018414721373301256, - "loss": 1.439, - "step": 609500 - }, - { - "epoch": 3.966447753430002, - "grad_norm": 0.87109375, - "learning_rate": 0.0018413420898627999, - "loss": 1.4452, - "step": 610000 - }, - { - "epoch": 3.9696989401131413, - "grad_norm": 0.73046875, - "learning_rate": 0.0018412120423954743, - "loss": 1.4444, - "step": 610500 - }, - { - "epoch": 3.9729501267962806, - "grad_norm": 0.75390625, - "learning_rate": 0.0018410819949281488, - "loss": 1.4489, - "step": 611000 - }, - { - "epoch": 3.97620131347942, - "grad_norm": 0.67578125, - "learning_rate": 0.001840951947460823, - "loss": 1.4499, - "step": 611500 - }, - { - "epoch": 3.9794525001625596, - "grad_norm": 0.6953125, - "learning_rate": 0.0018408218999934976, - "loss": 1.4398, - "step": 612000 - }, - { - "epoch": 3.9827036868456984, - "grad_norm": 1.140625, - "learning_rate": 0.001840691852526172, - "loss": 1.4407, - "step": 612500 - }, - { - "epoch": 3.985954873528838, - "grad_norm": 0.64453125, - "learning_rate": 0.0018405618050588467, - "loss": 1.4432, - "step": 613000 - }, - { - "epoch": 3.9892060602119774, - "grad_norm": 0.73828125, - "learning_rate": 0.001840431757591521, - "loss": 1.4482, - "step": 613500 - }, - { - "epoch": 3.9924572468951167, - "grad_norm": 1.390625, - "learning_rate": 0.0018403017101241955, - "loss": 1.4452, - "step": 614000 - }, - { - "epoch": 
3.995708433578256, - "grad_norm": 0.60546875, - "learning_rate": 0.00184017166265687, - "loss": 1.4436, - "step": 614500 - }, - { - "epoch": 3.9989596202613953, - "grad_norm": 0.8828125, - "learning_rate": 0.0018400416151895442, - "loss": 1.4495, - "step": 615000 - }, - { - "epoch": 4.0, - "eval_loss": 1.4305795431137085, - "eval_runtime": 0.5339, - "eval_samples_per_second": 1873.085, - "eval_steps_per_second": 29.969, - "step": 615160 - }, - { - "epoch": 4.002210806944535, - "grad_norm": 0.83984375, - "learning_rate": 0.0018399115677222187, - "loss": 1.4472, - "step": 615500 - }, - { - "epoch": 4.005461993627674, - "grad_norm": 2.375, - "learning_rate": 0.0018397815202548932, - "loss": 1.4458, - "step": 616000 - }, - { - "epoch": 4.008713180310814, - "grad_norm": 0.90625, - "learning_rate": 0.0018396514727875674, - "loss": 1.4438, - "step": 616500 - }, - { - "epoch": 4.011964366993952, - "grad_norm": 0.76171875, - "learning_rate": 0.001839521425320242, - "loss": 1.4449, - "step": 617000 - }, - { - "epoch": 4.015215553677092, - "grad_norm": 0.77734375, - "learning_rate": 0.0018393913778529164, - "loss": 1.4383, - "step": 617500 - }, - { - "epoch": 4.018466740360232, - "grad_norm": 0.7421875, - "learning_rate": 0.0018392613303855907, - "loss": 1.4475, - "step": 618000 - }, - { - "epoch": 4.021717927043371, - "grad_norm": 0.79296875, - "learning_rate": 0.0018391312829182651, - "loss": 1.4417, - "step": 618500 - }, - { - "epoch": 4.0249691137265105, - "grad_norm": 1.0078125, - "learning_rate": 0.0018390012354509396, - "loss": 1.4449, - "step": 619000 - }, - { - "epoch": 4.028220300409649, - "grad_norm": 2.34375, - "learning_rate": 0.0018388711879836139, - "loss": 1.4429, - "step": 619500 - }, - { - "epoch": 4.031471487092789, - "grad_norm": 0.8125, - "learning_rate": 0.0018387411405162883, - "loss": 1.4423, - "step": 620000 - }, - { - "epoch": 4.034722673775928, - "grad_norm": 0.60546875, - "learning_rate": 0.001838611093048963, - "loss": 1.4394, - "step": 620500 - }, - { - "epoch": 4.037973860459068, - "grad_norm": 3.515625, - "learning_rate": 0.0018384810455816375, - "loss": 1.4429, - "step": 621000 - }, - { - "epoch": 4.041225047142207, - "grad_norm": 0.6484375, - "learning_rate": 0.0018383509981143118, - "loss": 1.4435, - "step": 621500 - }, - { - "epoch": 4.044476233825346, - "grad_norm": 0.57421875, - "learning_rate": 0.0018382209506469863, - "loss": 1.444, - "step": 622000 - }, - { - "epoch": 4.047727420508486, - "grad_norm": 0.7265625, - "learning_rate": 0.0018380909031796607, - "loss": 1.4436, - "step": 622500 - }, - { - "epoch": 4.050978607191625, - "grad_norm": 0.6953125, - "learning_rate": 0.001837960855712335, - "loss": 1.4432, - "step": 623000 - }, - { - "epoch": 4.0542297938747645, - "grad_norm": 1.6171875, - "learning_rate": 0.0018378308082450095, - "loss": 1.4503, - "step": 623500 - }, - { - "epoch": 4.057480980557903, - "grad_norm": 4.625, - "learning_rate": 0.001837700760777684, - "loss": 1.4511, - "step": 624000 - }, - { - "epoch": 4.060732167241043, - "grad_norm": 0.76171875, - "learning_rate": 0.0018375707133103582, - "loss": 1.444, - "step": 624500 - }, - { - "epoch": 4.063983353924183, - "grad_norm": 0.89453125, - "learning_rate": 0.0018374406658430327, - "loss": 1.4435, - "step": 625000 - }, - { - "epoch": 4.067234540607322, - "grad_norm": 0.65234375, - "learning_rate": 0.0018373106183757072, - "loss": 1.4421, - "step": 625500 - }, - { - "epoch": 4.070485727290461, - "grad_norm": 0.72265625, - "learning_rate": 0.0018371805709083814, - "loss": 1.4455, - "step": 626000 
- }, - { - "epoch": 4.0737369139736, - "grad_norm": 1.25, - "learning_rate": 0.001837050523441056, - "loss": 1.4422, - "step": 626500 - }, - { - "epoch": 4.07698810065674, - "grad_norm": 1.1953125, - "learning_rate": 0.0018369204759737304, - "loss": 1.4459, - "step": 627000 - }, - { - "epoch": 4.080239287339879, - "grad_norm": 0.81640625, - "learning_rate": 0.001836790428506405, - "loss": 1.4457, - "step": 627500 - }, - { - "epoch": 4.0834904740230185, - "grad_norm": 0.8515625, - "learning_rate": 0.0018366603810390794, - "loss": 1.4458, - "step": 628000 - }, - { - "epoch": 4.086741660706158, - "grad_norm": 1.2890625, - "learning_rate": 0.0018365303335717538, - "loss": 1.4424, - "step": 628500 - }, - { - "epoch": 4.089992847389297, - "grad_norm": 0.67578125, - "learning_rate": 0.0018364002861044283, - "loss": 1.4486, - "step": 629000 - }, - { - "epoch": 4.093244034072437, - "grad_norm": 1.828125, - "learning_rate": 0.0018362702386371026, - "loss": 1.4482, - "step": 629500 - }, - { - "epoch": 4.096495220755576, - "grad_norm": 1.0078125, - "learning_rate": 0.001836140191169777, - "loss": 1.4395, - "step": 630000 - }, - { - "epoch": 4.099746407438715, - "grad_norm": 1.1953125, - "learning_rate": 0.0018360101437024515, - "loss": 1.4456, - "step": 630500 - }, - { - "epoch": 4.102997594121854, - "grad_norm": 0.69140625, - "learning_rate": 0.0018358800962351258, - "loss": 1.4444, - "step": 631000 - }, - { - "epoch": 4.106248780804994, - "grad_norm": 0.671875, - "learning_rate": 0.0018357500487678003, - "loss": 1.443, - "step": 631500 - }, - { - "epoch": 4.109499967488133, - "grad_norm": 0.72265625, - "learning_rate": 0.0018356200013004747, - "loss": 1.4445, - "step": 632000 - }, - { - "epoch": 4.1127511541712725, - "grad_norm": 1.3671875, - "learning_rate": 0.001835489953833149, - "loss": 1.4599, - "step": 632500 - }, - { - "epoch": 4.116002340854412, - "grad_norm": 2.1875, - "learning_rate": 0.0018353599063658235, - "loss": 1.448, - "step": 633000 - }, - { - "epoch": 4.119253527537551, - "grad_norm": 0.62890625, - "learning_rate": 0.001835229858898498, - "loss": 1.4514, - "step": 633500 - }, - { - "epoch": 4.122504714220691, - "grad_norm": 0.69140625, - "learning_rate": 0.0018350998114311722, - "loss": 1.4508, - "step": 634000 - }, - { - "epoch": 4.12575590090383, - "grad_norm": 9.625, - "learning_rate": 0.0018349697639638467, - "loss": 1.4499, - "step": 634500 - }, - { - "epoch": 4.129007087586969, - "grad_norm": 1.0, - "learning_rate": 0.0018348397164965214, - "loss": 1.4394, - "step": 635000 - }, - { - "epoch": 4.132258274270108, - "grad_norm": 1.28125, - "learning_rate": 0.0018347096690291959, - "loss": 1.4405, - "step": 635500 - }, - { - "epoch": 4.135509460953248, - "grad_norm": 0.6796875, - "learning_rate": 0.0018345796215618701, - "loss": 1.4426, - "step": 636000 - }, - { - "epoch": 4.138760647636388, - "grad_norm": 0.85546875, - "learning_rate": 0.0018344495740945446, - "loss": 1.4407, - "step": 636500 - }, - { - "epoch": 4.1420118343195265, - "grad_norm": 0.859375, - "learning_rate": 0.001834319526627219, - "loss": 1.4451, - "step": 637000 - }, - { - "epoch": 4.145263021002666, - "grad_norm": 0.92578125, - "learning_rate": 0.0018341894791598934, - "loss": 1.4492, - "step": 637500 - }, - { - "epoch": 4.148514207685805, - "grad_norm": 0.6875, - "learning_rate": 0.0018340594316925678, - "loss": 1.4399, - "step": 638000 - }, - { - "epoch": 4.151765394368945, - "grad_norm": 0.91796875, - "learning_rate": 0.0018339293842252423, - "loss": 1.4504, - "step": 638500 - }, - { - "epoch": 
4.155016581052084, - "grad_norm": 1.5390625, - "learning_rate": 0.0018337993367579166, - "loss": 1.4557, - "step": 639000 - }, - { - "epoch": 4.158267767735223, - "grad_norm": 0.8671875, - "learning_rate": 0.001833669289290591, - "loss": 1.4523, - "step": 639500 - }, - { - "epoch": 4.161518954418363, - "grad_norm": 0.65625, - "learning_rate": 0.0018335392418232655, - "loss": 1.4516, - "step": 640000 - }, - { - "epoch": 4.164770141101502, - "grad_norm": 2.984375, - "learning_rate": 0.0018334091943559398, - "loss": 1.4507, - "step": 640500 - }, - { - "epoch": 4.168021327784642, - "grad_norm": 0.66015625, - "learning_rate": 0.0018332791468886143, - "loss": 1.4546, - "step": 641000 - }, - { - "epoch": 4.1712725144677805, - "grad_norm": 0.82421875, - "learning_rate": 0.0018331490994212888, - "loss": 1.4501, - "step": 641500 - }, - { - "epoch": 4.17452370115092, - "grad_norm": 0.6796875, - "learning_rate": 0.001833019051953963, - "loss": 1.4429, - "step": 642000 - }, - { - "epoch": 4.177774887834059, - "grad_norm": 0.80078125, - "learning_rate": 0.0018328890044866377, - "loss": 1.4459, - "step": 642500 - }, - { - "epoch": 4.181026074517199, - "grad_norm": 0.7890625, - "learning_rate": 0.0018327589570193122, - "loss": 1.4485, - "step": 643000 - }, - { - "epoch": 4.1842772612003385, - "grad_norm": 1.1484375, - "learning_rate": 0.0018326289095519867, - "loss": 1.4511, - "step": 643500 - }, - { - "epoch": 4.187528447883477, - "grad_norm": 1.8828125, - "learning_rate": 0.001832498862084661, - "loss": 1.4479, - "step": 644000 - }, - { - "epoch": 4.190779634566617, - "grad_norm": 0.6953125, - "learning_rate": 0.0018323688146173354, - "loss": 1.451, - "step": 644500 - }, - { - "epoch": 4.194030821249756, - "grad_norm": 0.59765625, - "learning_rate": 0.0018322387671500099, - "loss": 1.4497, - "step": 645000 - }, - { - "epoch": 4.197282007932896, - "grad_norm": 0.73046875, - "learning_rate": 0.0018321087196826842, - "loss": 1.4446, - "step": 645500 - }, - { - "epoch": 4.2005331946160345, - "grad_norm": 0.6640625, - "learning_rate": 0.0018319786722153586, - "loss": 1.4413, - "step": 646000 - }, - { - "epoch": 4.203784381299174, - "grad_norm": 0.65625, - "learning_rate": 0.001831848624748033, - "loss": 1.445, - "step": 646500 - }, - { - "epoch": 4.207035567982314, - "grad_norm": 0.82421875, - "learning_rate": 0.0018317185772807074, - "loss": 1.4409, - "step": 647000 - }, - { - "epoch": 4.210286754665453, - "grad_norm": 0.609375, - "learning_rate": 0.0018315885298133818, - "loss": 1.4424, - "step": 647500 - }, - { - "epoch": 4.2135379413485925, - "grad_norm": 0.95703125, - "learning_rate": 0.0018314584823460563, - "loss": 1.4467, - "step": 648000 - }, - { - "epoch": 4.216789128031731, - "grad_norm": 0.70703125, - "learning_rate": 0.0018313284348787306, - "loss": 1.4442, - "step": 648500 - }, - { - "epoch": 4.220040314714871, - "grad_norm": 1.2734375, - "learning_rate": 0.001831198387411405, - "loss": 1.4404, - "step": 649000 - }, - { - "epoch": 4.22329150139801, - "grad_norm": 0.87890625, - "learning_rate": 0.0018310683399440798, - "loss": 1.441, - "step": 649500 - }, - { - "epoch": 4.22654268808115, - "grad_norm": 0.71484375, - "learning_rate": 0.0018309382924767542, - "loss": 1.442, - "step": 650000 - }, - { - "epoch": 4.229793874764289, - "grad_norm": 3.046875, - "learning_rate": 0.0018308082450094285, - "loss": 1.4432, - "step": 650500 - }, - { - "epoch": 4.233045061447428, - "grad_norm": 2.25, - "learning_rate": 0.001830678197542103, - "loss": 1.4433, - "step": 651000 - }, - { - "epoch": 
4.236296248130568, - "grad_norm": 0.7109375, - "learning_rate": 0.0018305481500747775, - "loss": 1.4452, - "step": 651500 - }, - { - "epoch": 4.239547434813707, - "grad_norm": 0.64453125, - "learning_rate": 0.0018304181026074517, - "loss": 1.4406, - "step": 652000 - }, - { - "epoch": 4.2427986214968465, - "grad_norm": 0.62890625, - "learning_rate": 0.0018302880551401262, - "loss": 1.4419, - "step": 652500 - }, - { - "epoch": 4.246049808179985, - "grad_norm": 0.828125, - "learning_rate": 0.0018301580076728007, - "loss": 1.4422, - "step": 653000 - }, - { - "epoch": 4.249300994863125, - "grad_norm": 3.53125, - "learning_rate": 0.001830027960205475, - "loss": 1.4436, - "step": 653500 - }, - { - "epoch": 4.252552181546265, - "grad_norm": 0.859375, - "learning_rate": 0.0018298979127381494, - "loss": 1.4412, - "step": 654000 - }, - { - "epoch": 4.255803368229404, - "grad_norm": 0.8515625, - "learning_rate": 0.001829767865270824, - "loss": 1.4406, - "step": 654500 - }, - { - "epoch": 4.259054554912543, - "grad_norm": 0.78125, - "learning_rate": 0.0018296378178034982, - "loss": 1.4354, - "step": 655000 - }, - { - "epoch": 4.262305741595682, - "grad_norm": 0.66015625, - "learning_rate": 0.0018295077703361726, - "loss": 1.4452, - "step": 655500 - }, - { - "epoch": 4.265556928278822, - "grad_norm": 0.7265625, - "learning_rate": 0.0018293777228688471, - "loss": 1.4366, - "step": 656000 - }, - { - "epoch": 4.268808114961961, - "grad_norm": 0.6875, - "learning_rate": 0.0018292476754015214, - "loss": 1.4403, - "step": 656500 - }, - { - "epoch": 4.2720593016451005, - "grad_norm": 2.421875, - "learning_rate": 0.001829117627934196, - "loss": 1.4437, - "step": 657000 - }, - { - "epoch": 4.275310488328239, - "grad_norm": 0.8515625, - "learning_rate": 0.0018289875804668706, - "loss": 1.4443, - "step": 657500 - }, - { - "epoch": 4.278561675011379, - "grad_norm": 0.69140625, - "learning_rate": 0.001828857532999545, - "loss": 1.4399, - "step": 658000 - }, - { - "epoch": 4.281812861694519, - "grad_norm": 0.72265625, - "learning_rate": 0.0018287274855322193, - "loss": 1.4402, - "step": 658500 - }, - { - "epoch": 4.285064048377658, - "grad_norm": 0.91015625, - "learning_rate": 0.0018285974380648938, - "loss": 1.4419, - "step": 659000 - }, - { - "epoch": 4.288315235060797, - "grad_norm": 0.63671875, - "learning_rate": 0.0018284673905975682, - "loss": 1.441, - "step": 659500 - }, - { - "epoch": 4.291566421743936, - "grad_norm": 0.6796875, - "learning_rate": 0.0018283373431302425, - "loss": 1.4357, - "step": 660000 - }, - { - "epoch": 4.294817608427076, - "grad_norm": 2.453125, - "learning_rate": 0.001828207295662917, - "loss": 1.4341, - "step": 660500 - }, - { - "epoch": 4.298068795110215, - "grad_norm": 2.015625, - "learning_rate": 0.0018280772481955915, - "loss": 1.4512, - "step": 661000 - }, - { - "epoch": 4.3013199817933545, - "grad_norm": 0.63671875, - "learning_rate": 0.0018279472007282657, - "loss": 1.4393, - "step": 661500 - }, - { - "epoch": 4.304571168476494, - "grad_norm": 0.59765625, - "learning_rate": 0.0018278171532609402, - "loss": 1.4444, - "step": 662000 - }, - { - "epoch": 4.307822355159633, - "grad_norm": 0.65625, - "learning_rate": 0.0018276871057936147, - "loss": 1.4476, - "step": 662500 - }, - { - "epoch": 4.311073541842773, - "grad_norm": 0.8515625, - "learning_rate": 0.001827557058326289, - "loss": 1.4516, - "step": 663000 - }, - { - "epoch": 4.314324728525912, - "grad_norm": 0.8046875, - "learning_rate": 0.0018274270108589634, - "loss": 1.4479, - "step": 663500 - }, - { - "epoch": 
4.317575915209051, - "grad_norm": 0.7734375, - "learning_rate": 0.0018272969633916381, - "loss": 1.4526, - "step": 664000 - }, - { - "epoch": 4.32082710189219, - "grad_norm": 0.89453125, - "learning_rate": 0.0018271669159243126, - "loss": 1.4503, - "step": 664500 - }, - { - "epoch": 4.32407828857533, - "grad_norm": 1.0234375, - "learning_rate": 0.0018270368684569869, - "loss": 1.4487, - "step": 665000 - }, - { - "epoch": 4.32732947525847, - "grad_norm": 0.734375, - "learning_rate": 0.0018269068209896613, - "loss": 1.4459, - "step": 665500 - }, - { - "epoch": 4.3305806619416085, - "grad_norm": 0.64453125, - "learning_rate": 0.0018267767735223358, - "loss": 1.4458, - "step": 666000 - }, - { - "epoch": 4.333831848624748, - "grad_norm": 0.80859375, - "learning_rate": 0.00182664672605501, - "loss": 1.4371, - "step": 666500 - }, - { - "epoch": 4.337083035307887, - "grad_norm": 1.625, - "learning_rate": 0.0018265166785876846, - "loss": 1.447, - "step": 667000 - }, - { - "epoch": 4.340334221991027, - "grad_norm": 0.9140625, - "learning_rate": 0.001826386631120359, - "loss": 1.4428, - "step": 667500 - }, - { - "epoch": 4.343585408674166, - "grad_norm": 0.68359375, - "learning_rate": 0.0018262565836530333, - "loss": 1.4406, - "step": 668000 - }, - { - "epoch": 4.346836595357305, - "grad_norm": 1.2734375, - "learning_rate": 0.0018261265361857078, - "loss": 1.4477, - "step": 668500 - }, - { - "epoch": 4.350087782040445, - "grad_norm": 1.5390625, - "learning_rate": 0.0018259964887183823, - "loss": 1.445, - "step": 669000 - }, - { - "epoch": 4.353338968723584, - "grad_norm": 0.62890625, - "learning_rate": 0.0018258664412510565, - "loss": 1.442, - "step": 669500 - }, - { - "epoch": 4.356590155406724, - "grad_norm": 0.62890625, - "learning_rate": 0.001825736393783731, - "loss": 1.449, - "step": 670000 - }, - { - "epoch": 4.3598413420898625, - "grad_norm": 0.7109375, - "learning_rate": 0.0018256063463164055, - "loss": 1.4543, - "step": 670500 - }, - { - "epoch": 4.363092528773002, - "grad_norm": 0.8046875, - "learning_rate": 0.0018254762988490797, - "loss": 1.4485, - "step": 671000 - }, - { - "epoch": 4.366343715456141, - "grad_norm": 2.265625, - "learning_rate": 0.0018253462513817544, - "loss": 1.4508, - "step": 671500 - }, - { - "epoch": 4.369594902139281, - "grad_norm": 0.7109375, - "learning_rate": 0.001825216203914429, - "loss": 1.4501, - "step": 672000 - }, - { - "epoch": 4.3728460888224205, - "grad_norm": 0.94921875, - "learning_rate": 0.0018250861564471034, - "loss": 1.4442, - "step": 672500 - }, - { - "epoch": 4.376097275505559, - "grad_norm": 0.98828125, - "learning_rate": 0.0018249561089797777, - "loss": 1.4462, - "step": 673000 - }, - { - "epoch": 4.379348462188699, - "grad_norm": 3.125, - "learning_rate": 0.0018248260615124521, - "loss": 1.4463, - "step": 673500 - }, - { - "epoch": 4.382599648871838, - "grad_norm": 1.1953125, - "learning_rate": 0.0018246960140451266, - "loss": 1.4506, - "step": 674000 - }, - { - "epoch": 4.385850835554978, - "grad_norm": 0.63671875, - "learning_rate": 0.0018245659665778009, - "loss": 1.4447, - "step": 674500 - }, - { - "epoch": 4.3891020222381165, - "grad_norm": 0.60546875, - "learning_rate": 0.0018244359191104753, - "loss": 1.4543, - "step": 675000 - }, - { - "epoch": 4.392353208921256, - "grad_norm": 0.77734375, - "learning_rate": 0.0018243058716431498, - "loss": 1.4488, - "step": 675500 - }, - { - "epoch": 4.395604395604396, - "grad_norm": 3.1875, - "learning_rate": 0.001824175824175824, - "loss": 1.4386, - "step": 676000 - }, - { - "epoch": 
4.398855582287535, - "grad_norm": 0.8671875, - "learning_rate": 0.0018240457767084986, - "loss": 1.4409, - "step": 676500 - }, - { - "epoch": 4.4021067689706745, - "grad_norm": 0.65625, - "learning_rate": 0.001823915729241173, - "loss": 1.4445, - "step": 677000 - }, - { - "epoch": 4.405357955653813, - "grad_norm": 0.6875, - "learning_rate": 0.0018237856817738473, - "loss": 1.4439, - "step": 677500 - }, - { - "epoch": 4.408609142336953, - "grad_norm": 0.6328125, - "learning_rate": 0.0018236556343065218, - "loss": 1.4467, - "step": 678000 - }, - { - "epoch": 4.411860329020092, - "grad_norm": 0.6953125, - "learning_rate": 0.0018235255868391965, - "loss": 1.4435, - "step": 678500 - }, - { - "epoch": 4.415111515703232, - "grad_norm": 0.8359375, - "learning_rate": 0.001823395539371871, - "loss": 1.4387, - "step": 679000 - }, - { - "epoch": 4.418362702386371, - "grad_norm": 0.87109375, - "learning_rate": 0.0018232654919045452, - "loss": 1.4442, - "step": 679500 - }, - { - "epoch": 4.42161388906951, - "grad_norm": 0.78125, - "learning_rate": 0.0018231354444372197, - "loss": 1.4385, - "step": 680000 - }, - { - "epoch": 4.42486507575265, - "grad_norm": 2.40625, - "learning_rate": 0.0018230053969698942, - "loss": 1.4436, - "step": 680500 - }, - { - "epoch": 4.428116262435789, - "grad_norm": 0.75390625, - "learning_rate": 0.0018228753495025684, - "loss": 1.4407, - "step": 681000 - }, - { - "epoch": 4.4313674491189285, - "grad_norm": 0.6171875, - "learning_rate": 0.001822745302035243, - "loss": 1.4402, - "step": 681500 - }, - { - "epoch": 4.434618635802067, - "grad_norm": 0.71875, - "learning_rate": 0.0018226152545679174, - "loss": 1.4414, - "step": 682000 - }, - { - "epoch": 4.437869822485207, - "grad_norm": 0.890625, - "learning_rate": 0.0018224852071005917, - "loss": 1.4379, - "step": 682500 - }, - { - "epoch": 4.441121009168347, - "grad_norm": 0.84375, - "learning_rate": 0.0018223551596332661, - "loss": 1.4403, - "step": 683000 - }, - { - "epoch": 4.444372195851486, - "grad_norm": 0.7109375, - "learning_rate": 0.0018222251121659406, - "loss": 1.4432, - "step": 683500 - }, - { - "epoch": 4.447623382534625, - "grad_norm": 0.8203125, - "learning_rate": 0.0018220950646986149, - "loss": 1.4378, - "step": 684000 - }, - { - "epoch": 4.450874569217764, - "grad_norm": 0.7578125, - "learning_rate": 0.0018219650172312894, - "loss": 1.439, - "step": 684500 - }, - { - "epoch": 4.454125755900904, - "grad_norm": 0.69921875, - "learning_rate": 0.0018218349697639638, - "loss": 1.435, - "step": 685000 - }, - { - "epoch": 4.457376942584043, - "grad_norm": 0.80078125, - "learning_rate": 0.001821704922296638, - "loss": 1.4391, - "step": 685500 - }, - { - "epoch": 4.4606281292671826, - "grad_norm": 0.7265625, - "learning_rate": 0.0018215748748293128, - "loss": 1.4392, - "step": 686000 - }, - { - "epoch": 4.463879315950322, - "grad_norm": 0.734375, - "learning_rate": 0.0018214448273619873, - "loss": 1.4382, - "step": 686500 - }, - { - "epoch": 4.467130502633461, - "grad_norm": 1.203125, - "learning_rate": 0.0018213147798946617, - "loss": 1.4411, - "step": 687000 - }, - { - "epoch": 4.470381689316601, - "grad_norm": 0.6796875, - "learning_rate": 0.001821184732427336, - "loss": 1.4439, - "step": 687500 - }, - { - "epoch": 4.47363287599974, - "grad_norm": 0.796875, - "learning_rate": 0.0018210546849600105, - "loss": 1.4478, - "step": 688000 - }, - { - "epoch": 4.476884062682879, - "grad_norm": 0.6640625, - "learning_rate": 0.001820924637492685, - "loss": 1.4437, - "step": 688500 - }, - { - "epoch": 4.480135249366018, - 
"grad_norm": 0.703125, - "learning_rate": 0.0018207945900253592, - "loss": 1.4461, - "step": 689000 - }, - { - "epoch": 4.483386436049158, - "grad_norm": 0.67578125, - "learning_rate": 0.0018206645425580337, - "loss": 1.4388, - "step": 689500 - }, - { - "epoch": 4.486637622732298, - "grad_norm": 1.5078125, - "learning_rate": 0.0018205344950907082, - "loss": 1.4433, - "step": 690000 - }, - { - "epoch": 4.4898888094154366, - "grad_norm": 0.72265625, - "learning_rate": 0.0018204044476233824, - "loss": 1.4442, - "step": 690500 - }, - { - "epoch": 4.493139996098576, - "grad_norm": 0.734375, - "learning_rate": 0.001820274400156057, - "loss": 1.4312, - "step": 691000 - }, - { - "epoch": 4.496391182781715, - "grad_norm": 0.59375, - "learning_rate": 0.0018201443526887314, - "loss": 1.4469, - "step": 691500 - }, - { - "epoch": 4.499642369464855, - "grad_norm": 0.67578125, - "learning_rate": 0.0018200143052214057, - "loss": 1.4389, - "step": 692000 - }, - { - "epoch": 4.502893556147994, - "grad_norm": 0.75, - "learning_rate": 0.0018198842577540801, - "loss": 1.4402, - "step": 692500 - }, - { - "epoch": 4.506144742831133, - "grad_norm": 1.3984375, - "learning_rate": 0.0018197542102867548, - "loss": 1.4366, - "step": 693000 - }, - { - "epoch": 4.509395929514273, - "grad_norm": 0.796875, - "learning_rate": 0.0018196241628194293, - "loss": 1.4418, - "step": 693500 - }, - { - "epoch": 4.512647116197412, - "grad_norm": 0.71875, - "learning_rate": 0.0018194941153521036, - "loss": 1.439, - "step": 694000 - }, - { - "epoch": 4.515898302880552, - "grad_norm": 0.59375, - "learning_rate": 0.001819364067884778, - "loss": 1.4341, - "step": 694500 - }, - { - "epoch": 4.519149489563691, - "grad_norm": 0.6875, - "learning_rate": 0.0018192340204174525, - "loss": 1.4412, - "step": 695000 - }, - { - "epoch": 4.52240067624683, - "grad_norm": 0.65234375, - "learning_rate": 0.0018191039729501268, - "loss": 1.4458, - "step": 695500 - }, - { - "epoch": 4.525651862929969, - "grad_norm": 0.765625, - "learning_rate": 0.0018189739254828013, - "loss": 1.4464, - "step": 696000 - }, - { - "epoch": 4.528903049613109, - "grad_norm": 0.84375, - "learning_rate": 0.0018188438780154758, - "loss": 1.4463, - "step": 696500 - }, - { - "epoch": 4.532154236296249, - "grad_norm": 0.88671875, - "learning_rate": 0.00181871383054815, - "loss": 1.4492, - "step": 697000 - }, - { - "epoch": 4.535405422979387, - "grad_norm": 0.6796875, - "learning_rate": 0.0018185837830808245, - "loss": 1.4423, - "step": 697500 - }, - { - "epoch": 4.538656609662527, - "grad_norm": 0.96484375, - "learning_rate": 0.001818453735613499, - "loss": 1.4375, - "step": 698000 - }, - { - "epoch": 4.541907796345666, - "grad_norm": 4.5, - "learning_rate": 0.0018183236881461732, - "loss": 1.445, - "step": 698500 - }, - { - "epoch": 4.545158983028806, - "grad_norm": 0.87890625, - "learning_rate": 0.0018181936406788477, - "loss": 1.4405, - "step": 699000 - }, - { - "epoch": 4.548410169711945, - "grad_norm": 0.6796875, - "learning_rate": 0.0018180635932115222, - "loss": 1.4406, - "step": 699500 - }, - { - "epoch": 4.551661356395084, - "grad_norm": 0.734375, - "learning_rate": 0.0018179335457441965, - "loss": 1.442, - "step": 700000 - }, - { - "epoch": 4.554912543078224, - "grad_norm": 0.7421875, - "learning_rate": 0.0018178034982768711, - "loss": 1.4395, - "step": 700500 - }, - { - "epoch": 4.558163729761363, - "grad_norm": 0.71875, - "learning_rate": 0.0018176734508095456, - "loss": 1.4396, - "step": 701000 - }, - { - "epoch": 4.561414916444503, - "grad_norm": 1.3984375, - 
"learning_rate": 0.00181754340334222, - "loss": 1.4441, - "step": 701500 - }, - { - "epoch": 4.564666103127641, - "grad_norm": 2.03125, - "learning_rate": 0.0018174133558748944, - "loss": 1.4435, - "step": 702000 - }, - { - "epoch": 4.567917289810781, - "grad_norm": 0.796875, - "learning_rate": 0.0018172833084075688, - "loss": 1.4422, - "step": 702500 - }, - { - "epoch": 4.57116847649392, - "grad_norm": 0.77734375, - "learning_rate": 0.0018171532609402433, - "loss": 1.4483, - "step": 703000 - }, - { - "epoch": 4.57441966317706, - "grad_norm": 0.77734375, - "learning_rate": 0.0018170232134729176, - "loss": 1.4467, - "step": 703500 - }, - { - "epoch": 4.5776708498601995, - "grad_norm": 0.66796875, - "learning_rate": 0.001816893166005592, - "loss": 1.4474, - "step": 704000 - }, - { - "epoch": 4.580922036543338, - "grad_norm": 0.6171875, - "learning_rate": 0.0018167631185382665, - "loss": 1.4412, - "step": 704500 - }, - { - "epoch": 4.584173223226478, - "grad_norm": 1.15625, - "learning_rate": 0.0018166330710709408, - "loss": 1.4412, - "step": 705000 - }, - { - "epoch": 4.587424409909617, - "grad_norm": 1.015625, - "learning_rate": 0.0018165030236036153, - "loss": 1.4442, - "step": 705500 - }, - { - "epoch": 4.590675596592757, - "grad_norm": 0.8359375, - "learning_rate": 0.0018163729761362898, - "loss": 1.4428, - "step": 706000 - }, - { - "epoch": 4.593926783275895, - "grad_norm": 0.8828125, - "learning_rate": 0.001816242928668964, - "loss": 1.4454, - "step": 706500 - }, - { - "epoch": 4.597177969959035, - "grad_norm": 0.81640625, - "learning_rate": 0.0018161128812016385, - "loss": 1.4456, - "step": 707000 - }, - { - "epoch": 4.600429156642175, - "grad_norm": 0.74609375, - "learning_rate": 0.0018159828337343132, - "loss": 1.4355, - "step": 707500 - }, - { - "epoch": 4.603680343325314, - "grad_norm": 0.828125, - "learning_rate": 0.0018158527862669877, - "loss": 1.4477, - "step": 708000 - }, - { - "epoch": 4.6069315300084535, - "grad_norm": 0.8515625, - "learning_rate": 0.001815722738799662, - "loss": 1.4491, - "step": 708500 - }, - { - "epoch": 4.610182716691592, - "grad_norm": 3.078125, - "learning_rate": 0.0018155926913323364, - "loss": 1.4513, - "step": 709000 - }, - { - "epoch": 4.613433903374732, - "grad_norm": 0.66796875, - "learning_rate": 0.001815462643865011, - "loss": 1.4504, - "step": 709500 - }, - { - "epoch": 4.616685090057871, - "grad_norm": 0.84375, - "learning_rate": 0.0018153325963976852, - "loss": 1.455, - "step": 710000 - }, - { - "epoch": 4.619936276741011, - "grad_norm": 0.6171875, - "learning_rate": 0.0018152025489303596, - "loss": 1.4542, - "step": 710500 - }, - { - "epoch": 4.623187463424149, - "grad_norm": 0.55859375, - "learning_rate": 0.0018150725014630341, - "loss": 1.4562, - "step": 711000 - }, - { - "epoch": 4.626438650107289, - "grad_norm": 0.89453125, - "learning_rate": 0.0018149424539957084, - "loss": 1.4514, - "step": 711500 - }, - { - "epoch": 4.629689836790429, - "grad_norm": 0.59375, - "learning_rate": 0.0018148124065283829, - "loss": 1.444, - "step": 712000 - }, - { - "epoch": 4.632941023473568, - "grad_norm": 0.69921875, - "learning_rate": 0.0018146823590610573, - "loss": 1.4446, - "step": 712500 - }, - { - "epoch": 4.6361922101567075, - "grad_norm": 0.70703125, - "learning_rate": 0.0018145523115937316, - "loss": 1.4515, - "step": 713000 - }, - { - "epoch": 4.639443396839846, - "grad_norm": 0.6015625, - "learning_rate": 0.001814422264126406, - "loss": 1.449, - "step": 713500 - }, - { - "epoch": 4.642694583522986, - "grad_norm": 0.59375, - "learning_rate": 
0.0018142922166590806, - "loss": 1.4478, - "step": 714000 - }, - { - "epoch": 4.645945770206125, - "grad_norm": 0.66796875, - "learning_rate": 0.0018141621691917548, - "loss": 1.4463, - "step": 714500 - }, - { - "epoch": 4.649196956889265, - "grad_norm": 0.62109375, - "learning_rate": 0.0018140321217244295, - "loss": 1.4528, - "step": 715000 - }, - { - "epoch": 4.652448143572403, - "grad_norm": 0.6640625, - "learning_rate": 0.001813902074257104, - "loss": 1.4528, - "step": 715500 - }, - { - "epoch": 4.655699330255543, - "grad_norm": 0.65625, - "learning_rate": 0.0018137720267897785, - "loss": 1.4492, - "step": 716000 - }, - { - "epoch": 4.658950516938683, - "grad_norm": 0.73046875, - "learning_rate": 0.0018136419793224527, - "loss": 1.4516, - "step": 716500 - }, - { - "epoch": 4.662201703621822, - "grad_norm": 0.8984375, - "learning_rate": 0.0018135119318551272, - "loss": 1.4532, - "step": 717000 - }, - { - "epoch": 4.6654528903049615, - "grad_norm": 0.75390625, - "learning_rate": 0.0018133818843878017, - "loss": 1.4468, - "step": 717500 - }, - { - "epoch": 4.6687040769881, - "grad_norm": 0.6171875, - "learning_rate": 0.001813251836920476, - "loss": 1.4498, - "step": 718000 - }, - { - "epoch": 4.67195526367124, - "grad_norm": 0.73828125, - "learning_rate": 0.0018131217894531504, - "loss": 1.4473, - "step": 718500 - }, - { - "epoch": 4.675206450354379, - "grad_norm": 1.0, - "learning_rate": 0.001812991741985825, - "loss": 1.4454, - "step": 719000 - }, - { - "epoch": 4.678457637037519, - "grad_norm": 2.921875, - "learning_rate": 0.0018128616945184992, - "loss": 1.4448, - "step": 719500 - }, - { - "epoch": 4.681708823720658, - "grad_norm": 0.79296875, - "learning_rate": 0.0018127316470511736, - "loss": 1.4431, - "step": 720000 - }, - { - "epoch": 4.684960010403797, - "grad_norm": 0.796875, - "learning_rate": 0.0018126015995838481, - "loss": 1.4483, - "step": 720500 - }, - { - "epoch": 4.688211197086937, - "grad_norm": 0.9921875, - "learning_rate": 0.0018124715521165224, - "loss": 1.4487, - "step": 721000 - }, - { - "epoch": 4.691462383770076, - "grad_norm": 0.734375, - "learning_rate": 0.0018123415046491969, - "loss": 1.4505, - "step": 721500 - }, - { - "epoch": 4.6947135704532155, - "grad_norm": 0.7421875, - "learning_rate": 0.0018122114571818716, - "loss": 1.4404, - "step": 722000 - }, - { - "epoch": 4.697964757136354, - "grad_norm": 1.65625, - "learning_rate": 0.001812081409714546, - "loss": 1.4502, - "step": 722500 - }, - { - "epoch": 4.701215943819494, - "grad_norm": 0.7578125, - "learning_rate": 0.0018119513622472203, - "loss": 1.4484, - "step": 723000 - }, - { - "epoch": 4.704467130502634, - "grad_norm": 0.78515625, - "learning_rate": 0.0018118213147798948, - "loss": 1.4488, - "step": 723500 - }, - { - "epoch": 4.707718317185773, - "grad_norm": 2.296875, - "learning_rate": 0.0018116912673125693, - "loss": 1.4532, - "step": 724000 - }, - { - "epoch": 4.710969503868912, - "grad_norm": 1.1796875, - "learning_rate": 0.0018115612198452435, - "loss": 1.4492, - "step": 724500 - }, - { - "epoch": 4.714220690552051, - "grad_norm": 1.6484375, - "learning_rate": 0.001811431172377918, - "loss": 1.4479, - "step": 725000 - }, - { - "epoch": 4.717471877235191, - "grad_norm": 1.1640625, - "learning_rate": 0.0018113011249105925, - "loss": 1.4552, - "step": 725500 - }, - { - "epoch": 4.72072306391833, - "grad_norm": 0.58203125, - "learning_rate": 0.0018111710774432667, - "loss": 1.4542, - "step": 726000 - }, - { - "epoch": 4.7239742506014695, - "grad_norm": 0.61328125, - "learning_rate": 
0.0018110410299759412, - "loss": 1.4606, - "step": 726500 - }, - { - "epoch": 4.727225437284609, - "grad_norm": 1.125, - "learning_rate": 0.0018109109825086157, - "loss": 1.4569, - "step": 727000 - }, - { - "epoch": 4.730476623967748, - "grad_norm": 0.78515625, - "learning_rate": 0.00181078093504129, - "loss": 1.4634, - "step": 727500 - }, - { - "epoch": 4.733727810650888, - "grad_norm": 0.65625, - "learning_rate": 0.0018106508875739644, - "loss": 1.461, - "step": 728000 - }, - { - "epoch": 4.736978997334027, - "grad_norm": 0.66015625, - "learning_rate": 0.001810520840106639, - "loss": 1.4565, - "step": 728500 - }, - { - "epoch": 4.740230184017166, - "grad_norm": 0.58984375, - "learning_rate": 0.0018103907926393132, - "loss": 1.4479, - "step": 729000 - }, - { - "epoch": 4.743481370700305, - "grad_norm": 0.72265625, - "learning_rate": 0.0018102607451719879, - "loss": 1.452, - "step": 729500 - }, - { - "epoch": 4.746732557383445, - "grad_norm": 0.7265625, - "learning_rate": 0.0018101306977046623, - "loss": 1.4529, - "step": 730000 - }, - { - "epoch": 4.749983744066585, - "grad_norm": 0.6796875, - "learning_rate": 0.0018100006502373368, - "loss": 1.4525, - "step": 730500 - }, - { - "epoch": 4.7532349307497235, - "grad_norm": 1.3125, - "learning_rate": 0.001809870602770011, - "loss": 1.4505, - "step": 731000 - }, - { - "epoch": 4.756486117432863, - "grad_norm": 0.73828125, - "learning_rate": 0.0018097405553026856, - "loss": 1.4475, - "step": 731500 - }, - { - "epoch": 4.759737304116002, - "grad_norm": 0.8046875, - "learning_rate": 0.00180961050783536, - "loss": 1.4491, - "step": 732000 - }, - { - "epoch": 4.762988490799142, - "grad_norm": 0.80078125, - "learning_rate": 0.0018094804603680343, - "loss": 1.4475, - "step": 732500 - }, - { - "epoch": 4.766239677482281, - "grad_norm": 1.078125, - "learning_rate": 0.0018093504129007088, - "loss": 1.4493, - "step": 733000 - }, - { - "epoch": 4.76949086416542, - "grad_norm": 0.86328125, - "learning_rate": 0.0018092203654333833, - "loss": 1.446, - "step": 733500 - }, - { - "epoch": 4.77274205084856, - "grad_norm": 0.71875, - "learning_rate": 0.0018090903179660575, - "loss": 1.4519, - "step": 734000 - }, - { - "epoch": 4.775993237531699, - "grad_norm": 0.7265625, - "learning_rate": 0.001808960270498732, - "loss": 1.4519, - "step": 734500 - }, - { - "epoch": 4.779244424214839, - "grad_norm": 0.9609375, - "learning_rate": 0.0018088302230314065, - "loss": 1.4519, - "step": 735000 - }, - { - "epoch": 4.7824956108979775, - "grad_norm": 0.6875, - "learning_rate": 0.0018087001755640807, - "loss": 1.4519, - "step": 735500 - }, - { - "epoch": 4.785746797581117, - "grad_norm": 0.74609375, - "learning_rate": 0.0018085701280967552, - "loss": 1.4503, - "step": 736000 - }, - { - "epoch": 4.788997984264256, - "grad_norm": 0.65234375, - "learning_rate": 0.00180844008062943, - "loss": 1.4491, - "step": 736500 - }, - { - "epoch": 4.792249170947396, - "grad_norm": 0.98046875, - "learning_rate": 0.0018083100331621044, - "loss": 1.4455, - "step": 737000 - }, - { - "epoch": 4.7955003576305355, - "grad_norm": 1.28125, - "learning_rate": 0.0018081799856947787, - "loss": 1.4475, - "step": 737500 - }, - { - "epoch": 4.798751544313674, - "grad_norm": 0.90234375, - "learning_rate": 0.0018080499382274531, - "loss": 1.4454, - "step": 738000 - }, - { - "epoch": 4.802002730996814, - "grad_norm": 0.7265625, - "learning_rate": 0.0018079198907601276, - "loss": 1.4446, - "step": 738500 - }, - { - "epoch": 4.805253917679953, - "grad_norm": 3.140625, - "learning_rate": 0.0018077898432928019, 
- "loss": 1.4439, - "step": 739000 - }, - { - "epoch": 4.808505104363093, - "grad_norm": 0.78125, - "learning_rate": 0.0018076597958254764, - "loss": 1.4392, - "step": 739500 - }, - { - "epoch": 4.8117562910462315, - "grad_norm": 1.21875, - "learning_rate": 0.0018075297483581508, - "loss": 1.4405, - "step": 740000 - }, - { - "epoch": 4.815007477729371, - "grad_norm": 0.8046875, - "learning_rate": 0.001807399700890825, - "loss": 1.4409, - "step": 740500 - }, - { - "epoch": 4.818258664412511, - "grad_norm": 0.65625, - "learning_rate": 0.0018072696534234996, - "loss": 1.4425, - "step": 741000 - }, - { - "epoch": 4.82150985109565, - "grad_norm": 0.65234375, - "learning_rate": 0.001807139605956174, - "loss": 1.4373, - "step": 741500 - }, - { - "epoch": 4.8247610377787895, - "grad_norm": 0.765625, - "learning_rate": 0.0018070095584888483, - "loss": 1.4415, - "step": 742000 - }, - { - "epoch": 4.828012224461928, - "grad_norm": 1.03125, - "learning_rate": 0.0018068795110215228, - "loss": 1.4401, - "step": 742500 - }, - { - "epoch": 4.831263411145068, - "grad_norm": 0.8046875, - "learning_rate": 0.0018067494635541973, - "loss": 1.4448, - "step": 743000 - }, - { - "epoch": 4.834514597828207, - "grad_norm": 0.9375, - "learning_rate": 0.0018066194160868715, - "loss": 1.4407, - "step": 743500 - }, - { - "epoch": 4.837765784511347, - "grad_norm": 0.8203125, - "learning_rate": 0.0018064893686195462, - "loss": 1.4454, - "step": 744000 - }, - { - "epoch": 4.841016971194486, - "grad_norm": 3.75, - "learning_rate": 0.0018063593211522207, - "loss": 1.4433, - "step": 744500 - }, - { - "epoch": 4.844268157877625, - "grad_norm": 0.6796875, - "learning_rate": 0.0018062292736848952, - "loss": 1.441, - "step": 745000 - }, - { - "epoch": 4.847519344560765, - "grad_norm": 1.78125, - "learning_rate": 0.0018060992262175694, - "loss": 1.4374, - "step": 745500 - }, - { - "epoch": 4.850770531243904, - "grad_norm": 0.7734375, - "learning_rate": 0.001805969178750244, - "loss": 1.4399, - "step": 746000 - }, - { - "epoch": 4.8540217179270435, - "grad_norm": 4.46875, - "learning_rate": 0.0018058391312829184, - "loss": 1.4392, - "step": 746500 - }, - { - "epoch": 4.857272904610182, - "grad_norm": 1.7578125, - "learning_rate": 0.0018057090838155927, - "loss": 1.4403, - "step": 747000 - }, - { - "epoch": 4.860524091293322, - "grad_norm": 0.74609375, - "learning_rate": 0.0018055790363482671, - "loss": 1.4378, - "step": 747500 - }, - { - "epoch": 4.863775277976462, - "grad_norm": 0.734375, - "learning_rate": 0.0018054489888809416, - "loss": 1.4415, - "step": 748000 - }, - { - "epoch": 4.867026464659601, - "grad_norm": 0.66796875, - "learning_rate": 0.0018053189414136159, - "loss": 1.4444, - "step": 748500 - }, - { - "epoch": 4.87027765134274, - "grad_norm": 1.359375, - "learning_rate": 0.0018051888939462904, - "loss": 1.441, - "step": 749000 - }, - { - "epoch": 4.873528838025879, - "grad_norm": 0.8515625, - "learning_rate": 0.0018050588464789648, - "loss": 1.4387, - "step": 749500 - }, - { - "epoch": 4.876780024709019, - "grad_norm": 0.625, - "learning_rate": 0.001804928799011639, - "loss": 1.4346, - "step": 750000 - }, - { - "epoch": 4.880031211392158, - "grad_norm": 0.7578125, - "learning_rate": 0.0018047987515443136, - "loss": 1.4347, - "step": 750500 - }, - { - "epoch": 4.8832823980752975, - "grad_norm": 0.83203125, - "learning_rate": 0.0018046687040769883, - "loss": 1.4413, - "step": 751000 - }, - { - "epoch": 4.886533584758437, - "grad_norm": 0.72265625, - "learning_rate": 0.0018045386566096628, - "loss": 1.4479, - "step": 
751500 - }, - { - "epoch": 4.889784771441576, - "grad_norm": 0.84765625, - "learning_rate": 0.001804408609142337, - "loss": 1.446, - "step": 752000 - }, - { - "epoch": 4.893035958124716, - "grad_norm": 0.8828125, - "learning_rate": 0.0018042785616750115, - "loss": 1.4515, - "step": 752500 - }, - { - "epoch": 4.896287144807855, - "grad_norm": 0.78125, - "learning_rate": 0.001804148514207686, - "loss": 1.448, - "step": 753000 - }, - { - "epoch": 4.899538331490994, - "grad_norm": 1.2109375, - "learning_rate": 0.0018040184667403602, - "loss": 1.4473, - "step": 753500 - }, - { - "epoch": 4.902789518174133, - "grad_norm": 0.71875, - "learning_rate": 0.0018038884192730347, - "loss": 1.448, - "step": 754000 - }, - { - "epoch": 4.906040704857273, - "grad_norm": 2.140625, - "learning_rate": 0.0018037583718057092, - "loss": 1.4512, - "step": 754500 - }, - { - "epoch": 4.909291891540413, - "grad_norm": 0.71875, - "learning_rate": 0.0018036283243383835, - "loss": 1.4474, - "step": 755000 - }, - { - "epoch": 4.9125430782235515, - "grad_norm": 1.015625, - "learning_rate": 0.001803498276871058, - "loss": 1.4573, - "step": 755500 - }, - { - "epoch": 4.915794264906691, - "grad_norm": 0.6640625, - "learning_rate": 0.0018033682294037324, - "loss": 1.4442, - "step": 756000 - }, - { - "epoch": 4.91904545158983, - "grad_norm": 0.921875, - "learning_rate": 0.0018032381819364067, - "loss": 1.4525, - "step": 756500 - }, - { - "epoch": 4.92229663827297, - "grad_norm": 0.70703125, - "learning_rate": 0.0018031081344690812, - "loss": 1.4477, - "step": 757000 - }, - { - "epoch": 4.925547824956109, - "grad_norm": 1.0625, - "learning_rate": 0.0018029780870017556, - "loss": 1.4477, - "step": 757500 - }, - { - "epoch": 4.928799011639248, - "grad_norm": 2.0, - "learning_rate": 0.0018028480395344299, - "loss": 1.4513, - "step": 758000 - }, - { - "epoch": 4.932050198322388, - "grad_norm": 0.68359375, - "learning_rate": 0.0018027179920671046, - "loss": 1.4502, - "step": 758500 - }, - { - "epoch": 4.935301385005527, - "grad_norm": 0.67578125, - "learning_rate": 0.001802587944599779, - "loss": 1.4486, - "step": 759000 - }, - { - "epoch": 4.938552571688667, - "grad_norm": 0.96875, - "learning_rate": 0.0018024578971324535, - "loss": 1.4453, - "step": 759500 - }, - { - "epoch": 4.9418037583718055, - "grad_norm": 4.0, - "learning_rate": 0.0018023278496651278, - "loss": 1.4467, - "step": 760000 - }, - { - "epoch": 4.945054945054945, - "grad_norm": 0.74609375, - "learning_rate": 0.0018021978021978023, - "loss": 1.4401, - "step": 760500 - }, - { - "epoch": 4.948306131738084, - "grad_norm": 1.328125, - "learning_rate": 0.0018020677547304768, - "loss": 1.4461, - "step": 761000 - }, - { - "epoch": 4.951557318421224, - "grad_norm": 0.875, - "learning_rate": 0.001801937707263151, - "loss": 1.4454, - "step": 761500 - }, - { - "epoch": 4.9548085051043635, - "grad_norm": 1.1640625, - "learning_rate": 0.0018018076597958255, - "loss": 1.4468, - "step": 762000 - }, - { - "epoch": 4.958059691787502, - "grad_norm": 0.81640625, - "learning_rate": 0.0018016776123285, - "loss": 1.4547, - "step": 762500 - }, - { - "epoch": 4.961310878470642, - "grad_norm": 0.74609375, - "learning_rate": 0.0018015475648611742, - "loss": 1.4528, - "step": 763000 - }, - { - "epoch": 4.964562065153781, - "grad_norm": 0.64453125, - "learning_rate": 0.0018014175173938487, - "loss": 1.4477, - "step": 763500 - }, - { - "epoch": 4.967813251836921, - "grad_norm": 0.6796875, - "learning_rate": 0.0018012874699265232, - "loss": 1.4547, - "step": 764000 - }, - { - "epoch": 
4.9710644385200595, - "grad_norm": 0.63671875, - "learning_rate": 0.0018011574224591975, - "loss": 1.4555, - "step": 764500 - }, - { - "epoch": 4.974315625203199, - "grad_norm": 0.6953125, - "learning_rate": 0.001801027374991872, - "loss": 1.4538, - "step": 765000 - }, - { - "epoch": 4.977566811886339, - "grad_norm": 0.9296875, - "learning_rate": 0.0018008973275245466, - "loss": 1.4523, - "step": 765500 - }, - { - "epoch": 4.980817998569478, - "grad_norm": 0.67578125, - "learning_rate": 0.0018007672800572211, - "loss": 1.4484, - "step": 766000 - }, - { - "epoch": 4.9840691852526176, - "grad_norm": 0.65625, - "learning_rate": 0.0018006372325898954, - "loss": 1.4489, - "step": 766500 - }, - { - "epoch": 4.987320371935756, - "grad_norm": 0.80859375, - "learning_rate": 0.0018005071851225699, - "loss": 1.4465, - "step": 767000 - }, - { - "epoch": 4.990571558618896, - "grad_norm": 0.69140625, - "learning_rate": 0.0018003771376552443, - "loss": 1.4522, - "step": 767500 - }, - { - "epoch": 4.993822745302035, - "grad_norm": 0.6796875, - "learning_rate": 0.0018002470901879186, - "loss": 1.453, - "step": 768000 - }, - { - "epoch": 4.997073931985175, - "grad_norm": 1.375, - "learning_rate": 0.001800117042720593, - "loss": 1.4571, - "step": 768500 - }, - { - "epoch": 5.0, - "eval_loss": 1.441786766052246, - "eval_runtime": 0.5442, - "eval_samples_per_second": 1837.612, - "eval_steps_per_second": 29.402, - "step": 768950 - }, - { - "epoch": 5.0003251186683135, - "grad_norm": 0.5703125, - "learning_rate": 0.0017999869952532676, - "loss": 1.45, - "step": 769000 - }, - { - "epoch": 5.003576305351453, - "grad_norm": 1.6953125, - "learning_rate": 0.0017998569477859418, - "loss": 1.4452, - "step": 769500 - }, - { - "epoch": 5.006827492034593, - "grad_norm": 0.75390625, - "learning_rate": 0.0017997269003186163, - "loss": 1.4399, - "step": 770000 - }, - { - "epoch": 5.010078678717732, - "grad_norm": 0.6015625, - "learning_rate": 0.0017995968528512908, - "loss": 1.4414, - "step": 770500 - }, - { - "epoch": 5.0133298654008716, - "grad_norm": 0.82421875, - "learning_rate": 0.001799466805383965, - "loss": 1.4448, - "step": 771000 - }, - { - "epoch": 5.01658105208401, - "grad_norm": 0.75390625, - "learning_rate": 0.0017993367579166395, - "loss": 1.4429, - "step": 771500 - }, - { - "epoch": 5.01983223876715, - "grad_norm": 2.015625, - "learning_rate": 0.001799206710449314, - "loss": 1.4388, - "step": 772000 - }, - { - "epoch": 5.023083425450289, - "grad_norm": 0.68359375, - "learning_rate": 0.0017990766629819882, - "loss": 1.442, - "step": 772500 - }, - { - "epoch": 5.026334612133429, - "grad_norm": 0.65234375, - "learning_rate": 0.001798946615514663, - "loss": 1.4433, - "step": 773000 - }, - { - "epoch": 5.029585798816568, - "grad_norm": 1.4609375, - "learning_rate": 0.0017988165680473374, - "loss": 1.443, - "step": 773500 - }, - { - "epoch": 5.032836985499707, - "grad_norm": 0.66015625, - "learning_rate": 0.001798686520580012, - "loss": 1.448, - "step": 774000 - }, - { - "epoch": 5.036088172182847, - "grad_norm": 0.671875, - "learning_rate": 0.0017985564731126862, - "loss": 1.4465, - "step": 774500 - }, - { - "epoch": 5.039339358865986, - "grad_norm": 1.265625, - "learning_rate": 0.0017984264256453606, - "loss": 1.4448, - "step": 775000 - }, - { - "epoch": 5.042590545549126, - "grad_norm": 1.234375, - "learning_rate": 0.0017982963781780351, - "loss": 1.4418, - "step": 775500 - }, - { - "epoch": 5.045841732232264, - "grad_norm": 0.8671875, - "learning_rate": 0.0017981663307107094, - "loss": 1.4425, - "step": 776000 
- }, - { - "epoch": 5.049092918915404, - "grad_norm": 1.6171875, - "learning_rate": 0.0017980362832433839, - "loss": 1.444, - "step": 776500 - }, - { - "epoch": 5.052344105598544, - "grad_norm": 0.7109375, - "learning_rate": 0.0017979062357760583, - "loss": 1.4431, - "step": 777000 - }, - { - "epoch": 5.055595292281683, - "grad_norm": 2.078125, - "learning_rate": 0.0017977761883087326, - "loss": 1.4459, - "step": 777500 - }, - { - "epoch": 5.058846478964822, - "grad_norm": 0.90234375, - "learning_rate": 0.001797646140841407, - "loss": 1.4453, - "step": 778000 - }, - { - "epoch": 5.062097665647961, - "grad_norm": 0.60546875, - "learning_rate": 0.0017975160933740816, - "loss": 1.452, - "step": 778500 - }, - { - "epoch": 5.065348852331101, - "grad_norm": 1.5, - "learning_rate": 0.0017973860459067558, - "loss": 1.4492, - "step": 779000 - }, - { - "epoch": 5.06860003901424, - "grad_norm": 0.7734375, - "learning_rate": 0.0017972559984394303, - "loss": 1.4533, - "step": 779500 - }, - { - "epoch": 5.07185122569738, - "grad_norm": 0.609375, - "learning_rate": 0.001797125950972105, - "loss": 1.4462, - "step": 780000 - }, - { - "epoch": 5.075102412380519, - "grad_norm": 1.2734375, - "learning_rate": 0.0017969959035047795, - "loss": 1.4468, - "step": 780500 - }, - { - "epoch": 5.078353599063658, - "grad_norm": 0.8671875, - "learning_rate": 0.0017968658560374537, - "loss": 1.4464, - "step": 781000 - }, - { - "epoch": 5.081604785746798, - "grad_norm": 0.953125, - "learning_rate": 0.0017967358085701282, - "loss": 1.4535, - "step": 781500 - }, - { - "epoch": 5.084855972429937, - "grad_norm": 0.64453125, - "learning_rate": 0.0017966057611028027, - "loss": 1.4473, - "step": 782000 - }, - { - "epoch": 5.088107159113076, - "grad_norm": 0.67578125, - "learning_rate": 0.001796475713635477, - "loss": 1.4522, - "step": 782500 - }, - { - "epoch": 5.091358345796215, - "grad_norm": 1.4140625, - "learning_rate": 0.0017963456661681514, - "loss": 1.4525, - "step": 783000 - }, - { - "epoch": 5.094609532479355, - "grad_norm": 0.96875, - "learning_rate": 0.001796215618700826, - "loss": 1.447, - "step": 783500 - }, - { - "epoch": 5.097860719162495, - "grad_norm": 0.75390625, - "learning_rate": 0.0017960855712335002, - "loss": 1.4464, - "step": 784000 - }, - { - "epoch": 5.101111905845634, - "grad_norm": 0.9296875, - "learning_rate": 0.0017959555237661746, - "loss": 1.4419, - "step": 784500 - }, - { - "epoch": 5.104363092528773, - "grad_norm": 0.625, - "learning_rate": 0.0017958254762988491, - "loss": 1.4459, - "step": 785000 - }, - { - "epoch": 5.107614279211912, - "grad_norm": 0.63671875, - "learning_rate": 0.0017956954288315234, - "loss": 1.4475, - "step": 785500 - }, - { - "epoch": 5.110865465895052, - "grad_norm": 0.66796875, - "learning_rate": 0.0017955653813641979, - "loss": 1.4389, - "step": 786000 - }, - { - "epoch": 5.114116652578191, - "grad_norm": 1.0859375, - "learning_rate": 0.0017954353338968723, - "loss": 1.4455, - "step": 786500 - }, - { - "epoch": 5.11736783926133, - "grad_norm": 0.5859375, - "learning_rate": 0.0017953052864295466, - "loss": 1.4381, - "step": 787000 - }, - { - "epoch": 5.12061902594447, - "grad_norm": 0.60546875, - "learning_rate": 0.0017951752389622213, - "loss": 1.4444, - "step": 787500 - }, - { - "epoch": 5.123870212627609, - "grad_norm": 9.1875, - "learning_rate": 0.0017950451914948958, - "loss": 1.4436, - "step": 788000 - }, - { - "epoch": 5.127121399310749, - "grad_norm": 0.76953125, - "learning_rate": 0.0017949151440275703, - "loss": 1.4437, - "step": 788500 - }, - { - "epoch": 
5.130372585993888, - "grad_norm": 0.703125, - "learning_rate": 0.0017947850965602445, - "loss": 1.4385, - "step": 789000 - }, - { - "epoch": 5.133623772677027, - "grad_norm": 0.8359375, - "learning_rate": 0.001794655049092919, - "loss": 1.4424, - "step": 789500 - }, - { - "epoch": 5.136874959360166, - "grad_norm": 0.5859375, - "learning_rate": 0.0017945250016255935, - "loss": 1.4419, - "step": 790000 - }, - { - "epoch": 5.140126146043306, - "grad_norm": 2.765625, - "learning_rate": 0.0017943949541582677, - "loss": 1.4416, - "step": 790500 - }, - { - "epoch": 5.143377332726445, - "grad_norm": 1.5546875, - "learning_rate": 0.0017942649066909422, - "loss": 1.4391, - "step": 791000 - }, - { - "epoch": 5.146628519409584, - "grad_norm": 0.70703125, - "learning_rate": 0.0017941348592236167, - "loss": 1.4361, - "step": 791500 - }, - { - "epoch": 5.149879706092724, - "grad_norm": 0.91796875, - "learning_rate": 0.001794004811756291, - "loss": 1.4397, - "step": 792000 - }, - { - "epoch": 5.153130892775863, - "grad_norm": 0.58984375, - "learning_rate": 0.0017938747642889654, - "loss": 1.4351, - "step": 792500 - }, - { - "epoch": 5.156382079459003, - "grad_norm": 0.6484375, - "learning_rate": 0.00179374471682164, - "loss": 1.4365, - "step": 793000 - }, - { - "epoch": 5.159633266142142, - "grad_norm": 6.6875, - "learning_rate": 0.0017936146693543142, - "loss": 1.4359, - "step": 793500 - }, - { - "epoch": 5.162884452825281, - "grad_norm": 0.6953125, - "learning_rate": 0.0017934846218869887, - "loss": 1.4403, - "step": 794000 - }, - { - "epoch": 5.16613563950842, - "grad_norm": 0.76171875, - "learning_rate": 0.0017933545744196634, - "loss": 1.4368, - "step": 794500 - }, - { - "epoch": 5.16938682619156, - "grad_norm": 0.7265625, - "learning_rate": 0.0017932245269523378, - "loss": 1.4316, - "step": 795000 - }, - { - "epoch": 5.1726380128747, - "grad_norm": 0.8046875, - "learning_rate": 0.001793094479485012, - "loss": 1.4363, - "step": 795500 - }, - { - "epoch": 5.175889199557838, - "grad_norm": 1.1015625, - "learning_rate": 0.0017929644320176866, - "loss": 1.4426, - "step": 796000 - }, - { - "epoch": 5.179140386240978, - "grad_norm": 0.68359375, - "learning_rate": 0.001792834384550361, - "loss": 1.4405, - "step": 796500 - }, - { - "epoch": 5.182391572924117, - "grad_norm": 1.15625, - "learning_rate": 0.0017927043370830353, - "loss": 1.4329, - "step": 797000 - }, - { - "epoch": 5.185642759607257, - "grad_norm": 0.66015625, - "learning_rate": 0.0017925742896157098, - "loss": 1.4337, - "step": 797500 - }, - { - "epoch": 5.188893946290396, - "grad_norm": 1.1484375, - "learning_rate": 0.0017924442421483843, - "loss": 1.4318, - "step": 798000 - }, - { - "epoch": 5.192145132973535, - "grad_norm": 0.703125, - "learning_rate": 0.0017923141946810585, - "loss": 1.4362, - "step": 798500 - }, - { - "epoch": 5.195396319656675, - "grad_norm": 0.83984375, - "learning_rate": 0.001792184147213733, - "loss": 1.4352, - "step": 799000 - }, - { - "epoch": 5.198647506339814, - "grad_norm": 0.796875, - "learning_rate": 0.0017920540997464075, - "loss": 1.4379, - "step": 799500 - }, - { - "epoch": 5.201898693022954, - "grad_norm": 1.4296875, - "learning_rate": 0.0017919240522790817, - "loss": 1.4319, - "step": 800000 - }, - { - "epoch": 5.205149879706092, - "grad_norm": 0.80859375, - "learning_rate": 0.0017917940048117562, - "loss": 1.4339, - "step": 800500 - }, - { - "epoch": 5.208401066389232, - "grad_norm": 0.7890625, - "learning_rate": 0.0017916639573444307, - "loss": 1.4374, - "step": 801000 - }, - { - "epoch": 
5.211652253072371, - "grad_norm": 1.5, - "learning_rate": 0.001791533909877105, - "loss": 1.4314, - "step": 801500 - }, - { - "epoch": 5.214903439755511, - "grad_norm": 0.8515625, - "learning_rate": 0.0017914038624097797, - "loss": 1.4339, - "step": 802000 - }, - { - "epoch": 5.2181546264386505, - "grad_norm": 0.74609375, - "learning_rate": 0.0017912738149424541, - "loss": 1.4345, - "step": 802500 - }, - { - "epoch": 5.221405813121789, - "grad_norm": 1.0234375, - "learning_rate": 0.0017911437674751286, - "loss": 1.4352, - "step": 803000 - }, - { - "epoch": 5.224656999804929, - "grad_norm": 1.265625, - "learning_rate": 0.0017910137200078029, - "loss": 1.4291, - "step": 803500 - }, - { - "epoch": 5.227908186488068, - "grad_norm": 0.67578125, - "learning_rate": 0.0017908836725404774, - "loss": 1.4321, - "step": 804000 - }, - { - "epoch": 5.231159373171208, - "grad_norm": 0.984375, - "learning_rate": 0.0017907536250731518, - "loss": 1.4279, - "step": 804500 - }, - { - "epoch": 5.2344105598543464, - "grad_norm": 0.70703125, - "learning_rate": 0.001790623577605826, - "loss": 1.4371, - "step": 805000 - }, - { - "epoch": 5.237661746537486, - "grad_norm": 0.77734375, - "learning_rate": 0.0017904935301385006, - "loss": 1.431, - "step": 805500 - }, - { - "epoch": 5.240912933220626, - "grad_norm": 0.72265625, - "learning_rate": 0.001790363482671175, - "loss": 1.4389, - "step": 806000 - }, - { - "epoch": 5.244164119903765, - "grad_norm": 1.3359375, - "learning_rate": 0.0017902334352038493, - "loss": 1.433, - "step": 806500 - }, - { - "epoch": 5.2474153065869045, - "grad_norm": 0.71484375, - "learning_rate": 0.0017901033877365238, - "loss": 1.4342, - "step": 807000 - }, - { - "epoch": 5.250666493270043, - "grad_norm": 4.71875, - "learning_rate": 0.0017899733402691983, - "loss": 1.4353, - "step": 807500 - }, - { - "epoch": 5.253917679953183, - "grad_norm": 0.87109375, - "learning_rate": 0.0017898432928018725, - "loss": 1.4306, - "step": 808000 - }, - { - "epoch": 5.257168866636322, - "grad_norm": 1.109375, - "learning_rate": 0.001789713245334547, - "loss": 1.4336, - "step": 808500 - }, - { - "epoch": 5.260420053319462, - "grad_norm": 0.625, - "learning_rate": 0.0017895831978672217, - "loss": 1.4366, - "step": 809000 - }, - { - "epoch": 5.263671240002601, - "grad_norm": 0.84765625, - "learning_rate": 0.0017894531503998962, - "loss": 1.4348, - "step": 809500 - }, - { - "epoch": 5.26692242668574, - "grad_norm": 1.015625, - "learning_rate": 0.0017893231029325705, - "loss": 1.4361, - "step": 810000 - }, - { - "epoch": 5.27017361336888, - "grad_norm": 1.0078125, - "learning_rate": 0.001789193055465245, - "loss": 1.4286, - "step": 810500 - }, - { - "epoch": 5.273424800052019, - "grad_norm": 0.640625, - "learning_rate": 0.0017890630079979194, - "loss": 1.4307, - "step": 811000 - }, - { - "epoch": 5.2766759867351585, - "grad_norm": 1.046875, - "learning_rate": 0.0017889329605305937, - "loss": 1.4324, - "step": 811500 - }, - { - "epoch": 5.279927173418297, - "grad_norm": 1.296875, - "learning_rate": 0.0017888029130632681, - "loss": 1.4263, - "step": 812000 - }, - { - "epoch": 5.283178360101437, - "grad_norm": 0.7734375, - "learning_rate": 0.0017886728655959426, - "loss": 1.4316, - "step": 812500 - }, - { - "epoch": 5.286429546784577, - "grad_norm": 1.171875, - "learning_rate": 0.0017885428181286169, - "loss": 1.4346, - "step": 813000 - }, - { - "epoch": 5.289680733467716, - "grad_norm": 0.625, - "learning_rate": 0.0017884127706612914, - "loss": 1.4345, - "step": 813500 - }, - { - "epoch": 5.292931920150855, - 
"grad_norm": 0.625, - "learning_rate": 0.0017882827231939658, - "loss": 1.4316, - "step": 814000 - }, - { - "epoch": 5.296183106833994, - "grad_norm": 1.8671875, - "learning_rate": 0.00178815267572664, - "loss": 1.4313, - "step": 814500 - }, - { - "epoch": 5.299434293517134, - "grad_norm": 0.69921875, - "learning_rate": 0.0017880226282593146, - "loss": 1.4309, - "step": 815000 - }, - { - "epoch": 5.302685480200273, - "grad_norm": 0.73046875, - "learning_rate": 0.001787892580791989, - "loss": 1.4336, - "step": 815500 - }, - { - "epoch": 5.3059366668834125, - "grad_norm": 0.71484375, - "learning_rate": 0.0017877625333246633, - "loss": 1.4283, - "step": 816000 - }, - { - "epoch": 5.309187853566552, - "grad_norm": 0.9765625, - "learning_rate": 0.001787632485857338, - "loss": 1.43, - "step": 816500 - }, - { - "epoch": 5.312439040249691, - "grad_norm": 0.86328125, - "learning_rate": 0.0017875024383900125, - "loss": 1.4354, - "step": 817000 - }, - { - "epoch": 5.315690226932831, - "grad_norm": 0.67578125, - "learning_rate": 0.001787372390922687, - "loss": 1.4303, - "step": 817500 - }, - { - "epoch": 5.31894141361597, - "grad_norm": 0.61328125, - "learning_rate": 0.0017872423434553612, - "loss": 1.4338, - "step": 818000 - }, - { - "epoch": 5.322192600299109, - "grad_norm": 0.75390625, - "learning_rate": 0.0017871122959880357, - "loss": 1.4386, - "step": 818500 - }, - { - "epoch": 5.325443786982248, - "grad_norm": 1.1171875, - "learning_rate": 0.0017869822485207102, - "loss": 1.4364, - "step": 819000 - }, - { - "epoch": 5.328694973665388, - "grad_norm": 0.62109375, - "learning_rate": 0.0017868522010533845, - "loss": 1.4372, - "step": 819500 - }, - { - "epoch": 5.331946160348528, - "grad_norm": 0.7265625, - "learning_rate": 0.001786722153586059, - "loss": 1.4403, - "step": 820000 - }, - { - "epoch": 5.3351973470316665, - "grad_norm": 0.73828125, - "learning_rate": 0.0017865921061187334, - "loss": 1.432, - "step": 820500 - }, - { - "epoch": 5.338448533714806, - "grad_norm": 0.890625, - "learning_rate": 0.0017864620586514077, - "loss": 1.429, - "step": 821000 - }, - { - "epoch": 5.341699720397945, - "grad_norm": 0.76953125, - "learning_rate": 0.0017863320111840822, - "loss": 1.4301, - "step": 821500 - }, - { - "epoch": 5.344950907081085, - "grad_norm": 1.0078125, - "learning_rate": 0.0017862019637167566, - "loss": 1.4332, - "step": 822000 - }, - { - "epoch": 5.348202093764224, - "grad_norm": 0.61328125, - "learning_rate": 0.001786071916249431, - "loss": 1.4334, - "step": 822500 - }, - { - "epoch": 5.351453280447363, - "grad_norm": 0.7421875, - "learning_rate": 0.0017859418687821054, - "loss": 1.4321, - "step": 823000 - }, - { - "epoch": 5.354704467130503, - "grad_norm": 0.84375, - "learning_rate": 0.00178581182131478, - "loss": 1.4315, - "step": 823500 - }, - { - "epoch": 5.357955653813642, - "grad_norm": 1.203125, - "learning_rate": 0.0017856817738474545, - "loss": 1.4369, - "step": 824000 - }, - { - "epoch": 5.361206840496782, - "grad_norm": 2.40625, - "learning_rate": 0.0017855517263801288, - "loss": 1.4303, - "step": 824500 - }, - { - "epoch": 5.3644580271799205, - "grad_norm": 0.640625, - "learning_rate": 0.0017854216789128033, - "loss": 1.4386, - "step": 825000 - }, - { - "epoch": 5.36770921386306, - "grad_norm": 0.84765625, - "learning_rate": 0.0017852916314454778, - "loss": 1.4326, - "step": 825500 - }, - { - "epoch": 5.370960400546199, - "grad_norm": 0.6953125, - "learning_rate": 0.001785161583978152, - "loss": 1.4357, - "step": 826000 - }, - { - "epoch": 5.374211587229339, - "grad_norm": 
0.64453125, - "learning_rate": 0.0017850315365108265, - "loss": 1.4397, - "step": 826500 - }, - { - "epoch": 5.3774627739124785, - "grad_norm": 0.96875, - "learning_rate": 0.001784901489043501, - "loss": 1.4392, - "step": 827000 - }, - { - "epoch": 5.380713960595617, - "grad_norm": 1.0625, - "learning_rate": 0.0017847714415761752, - "loss": 1.4344, - "step": 827500 - }, - { - "epoch": 5.383965147278757, - "grad_norm": 0.62890625, - "learning_rate": 0.0017846413941088497, - "loss": 1.4347, - "step": 828000 - }, - { - "epoch": 5.387216333961896, - "grad_norm": 0.625, - "learning_rate": 0.0017845113466415242, - "loss": 1.4388, - "step": 828500 - }, - { - "epoch": 5.390467520645036, - "grad_norm": 0.75390625, - "learning_rate": 0.0017843812991741985, - "loss": 1.4361, - "step": 829000 - }, - { - "epoch": 5.3937187073281745, - "grad_norm": 1.046875, - "learning_rate": 0.001784251251706873, - "loss": 1.4407, - "step": 829500 - }, - { - "epoch": 5.396969894011314, - "grad_norm": 1.4375, - "learning_rate": 0.0017841212042395474, - "loss": 1.4423, - "step": 830000 - }, - { - "epoch": 5.400221080694454, - "grad_norm": 1.1015625, - "learning_rate": 0.0017839911567722217, - "loss": 1.4417, - "step": 830500 - }, - { - "epoch": 5.403472267377593, - "grad_norm": 1.7265625, - "learning_rate": 0.0017838611093048964, - "loss": 1.4446, - "step": 831000 - }, - { - "epoch": 5.4067234540607325, - "grad_norm": 0.62890625, - "learning_rate": 0.0017837310618375709, - "loss": 1.4417, - "step": 831500 - }, - { - "epoch": 5.409974640743871, - "grad_norm": 0.984375, - "learning_rate": 0.0017836010143702453, - "loss": 1.4407, - "step": 832000 - }, - { - "epoch": 5.413225827427011, - "grad_norm": 0.80859375, - "learning_rate": 0.0017834709669029196, - "loss": 1.4398, - "step": 832500 - }, - { - "epoch": 5.41647701411015, - "grad_norm": 0.9765625, - "learning_rate": 0.001783340919435594, - "loss": 1.444, - "step": 833000 - }, - { - "epoch": 5.41972820079329, - "grad_norm": 0.69921875, - "learning_rate": 0.0017832108719682686, - "loss": 1.4429, - "step": 833500 - }, - { - "epoch": 5.4229793874764285, - "grad_norm": 0.68359375, - "learning_rate": 0.0017830808245009428, - "loss": 1.448, - "step": 834000 - }, - { - "epoch": 5.426230574159568, - "grad_norm": 0.71484375, - "learning_rate": 0.0017829507770336173, - "loss": 1.4424, - "step": 834500 - }, - { - "epoch": 5.429481760842708, - "grad_norm": 1.578125, - "learning_rate": 0.0017828207295662918, - "loss": 1.4442, - "step": 835000 - }, - { - "epoch": 5.432732947525847, - "grad_norm": 0.78515625, - "learning_rate": 0.001782690682098966, - "loss": 1.4364, - "step": 835500 - }, - { - "epoch": 5.4359841342089865, - "grad_norm": 0.62890625, - "learning_rate": 0.0017825606346316405, - "loss": 1.4351, - "step": 836000 - }, - { - "epoch": 5.439235320892125, - "grad_norm": 0.65234375, - "learning_rate": 0.001782430587164315, - "loss": 1.4341, - "step": 836500 - }, - { - "epoch": 5.442486507575265, - "grad_norm": 0.58984375, - "learning_rate": 0.0017823005396969893, - "loss": 1.4395, - "step": 837000 - }, - { - "epoch": 5.445737694258404, - "grad_norm": 2.65625, - "learning_rate": 0.0017821704922296637, - "loss": 1.4416, - "step": 837500 - }, - { - "epoch": 5.448988880941544, - "grad_norm": 1.0078125, - "learning_rate": 0.0017820404447623384, - "loss": 1.4371, - "step": 838000 - }, - { - "epoch": 5.452240067624683, - "grad_norm": 0.8046875, - "learning_rate": 0.001781910397295013, - "loss": 1.4336, - "step": 838500 - }, - { - "epoch": 5.455491254307822, - "grad_norm": 1.046875, - 
"learning_rate": 0.0017817803498276872, - "loss": 1.4373, - "step": 839000 - }, - { - "epoch": 5.458742440990962, - "grad_norm": 0.76953125, - "learning_rate": 0.0017816503023603616, - "loss": 1.4375, - "step": 839500 - }, - { - "epoch": 5.461993627674101, - "grad_norm": 3.96875, - "learning_rate": 0.0017815202548930361, - "loss": 1.4356, - "step": 840000 - }, - { - "epoch": 5.4652448143572405, - "grad_norm": 3.21875, - "learning_rate": 0.0017813902074257104, - "loss": 1.4346, - "step": 840500 - }, - { - "epoch": 5.468496001040379, - "grad_norm": 0.765625, - "learning_rate": 0.0017812601599583849, - "loss": 1.4327, - "step": 841000 - }, - { - "epoch": 5.471747187723519, - "grad_norm": 0.671875, - "learning_rate": 0.0017811301124910593, - "loss": 1.4347, - "step": 841500 - }, - { - "epoch": 5.474998374406659, - "grad_norm": 0.59765625, - "learning_rate": 0.0017810000650237336, - "loss": 1.439, - "step": 842000 - }, - { - "epoch": 5.478249561089798, - "grad_norm": 0.84765625, - "learning_rate": 0.001780870017556408, - "loss": 1.4383, - "step": 842500 - }, - { - "epoch": 5.481500747772937, - "grad_norm": 0.734375, - "learning_rate": 0.0017807399700890826, - "loss": 1.4395, - "step": 843000 - }, - { - "epoch": 5.484751934456076, - "grad_norm": 0.6875, - "learning_rate": 0.0017806099226217568, - "loss": 1.4374, - "step": 843500 - }, - { - "epoch": 5.488003121139216, - "grad_norm": 0.71875, - "learning_rate": 0.0017804798751544313, - "loss": 1.445, - "step": 844000 - }, - { - "epoch": 5.491254307822355, - "grad_norm": 0.79296875, - "learning_rate": 0.0017803498276871058, - "loss": 1.4482, - "step": 844500 - }, - { - "epoch": 5.4945054945054945, - "grad_norm": 1.1171875, - "learning_rate": 0.00178021978021978, - "loss": 1.4499, - "step": 845000 - }, - { - "epoch": 5.497756681188634, - "grad_norm": 0.6484375, - "learning_rate": 0.0017800897327524547, - "loss": 1.4462, - "step": 845500 - }, - { - "epoch": 5.501007867871773, - "grad_norm": 3.71875, - "learning_rate": 0.0017799596852851292, - "loss": 1.4478, - "step": 846000 - }, - { - "epoch": 5.504259054554913, - "grad_norm": 0.62109375, - "learning_rate": 0.0017798296378178037, - "loss": 1.4479, - "step": 846500 - }, - { - "epoch": 5.507510241238052, - "grad_norm": 0.72265625, - "learning_rate": 0.001779699590350478, - "loss": 1.4523, - "step": 847000 - }, - { - "epoch": 5.510761427921191, - "grad_norm": 0.8203125, - "learning_rate": 0.0017795695428831524, - "loss": 1.4498, - "step": 847500 - }, - { - "epoch": 5.51401261460433, - "grad_norm": 0.85546875, - "learning_rate": 0.001779439495415827, - "loss": 1.4493, - "step": 848000 - }, - { - "epoch": 5.51726380128747, - "grad_norm": 0.6640625, - "learning_rate": 0.0017793094479485012, - "loss": 1.454, - "step": 848500 - }, - { - "epoch": 5.520514987970609, - "grad_norm": 0.72265625, - "learning_rate": 0.0017791794004811757, - "loss": 1.4481, - "step": 849000 - }, - { - "epoch": 5.5237661746537485, - "grad_norm": 0.81640625, - "learning_rate": 0.0017790493530138501, - "loss": 1.4474, - "step": 849500 - }, - { - "epoch": 5.527017361336888, - "grad_norm": 0.89453125, - "learning_rate": 0.0017789193055465244, - "loss": 1.4459, - "step": 850000 - }, - { - "epoch": 5.530268548020027, - "grad_norm": 0.6953125, - "learning_rate": 0.0017787892580791989, - "loss": 1.4437, - "step": 850500 - }, - { - "epoch": 5.533519734703167, - "grad_norm": 0.671875, - "learning_rate": 0.0017786592106118734, - "loss": 1.4456, - "step": 851000 - }, - { - "epoch": 5.536770921386306, - "grad_norm": 1.1953125, - "learning_rate": 
0.0017785291631445476, - "loss": 1.44, - "step": 851500 - }, - { - "epoch": 5.540022108069445, - "grad_norm": 1.015625, - "learning_rate": 0.001778399115677222, - "loss": 1.4362, - "step": 852000 - }, - { - "epoch": 5.543273294752584, - "grad_norm": 0.87109375, - "learning_rate": 0.0017782690682098968, - "loss": 1.4389, - "step": 852500 - }, - { - "epoch": 5.546524481435724, - "grad_norm": 0.86328125, - "learning_rate": 0.0017781390207425713, - "loss": 1.4365, - "step": 853000 - }, - { - "epoch": 5.549775668118864, - "grad_norm": 0.6875, - "learning_rate": 0.0017780089732752455, - "loss": 1.4347, - "step": 853500 - }, - { - "epoch": 5.5530268548020025, - "grad_norm": 0.74609375, - "learning_rate": 0.00177787892580792, - "loss": 1.433, - "step": 854000 - }, - { - "epoch": 5.556278041485142, - "grad_norm": 1.0390625, - "learning_rate": 0.0017777488783405945, - "loss": 1.4389, - "step": 854500 - }, - { - "epoch": 5.559529228168281, - "grad_norm": 1.3046875, - "learning_rate": 0.0017776188308732687, - "loss": 1.4277, - "step": 855000 - }, - { - "epoch": 5.562780414851421, - "grad_norm": 0.62109375, - "learning_rate": 0.0017774887834059432, - "loss": 1.4311, - "step": 855500 - }, - { - "epoch": 5.56603160153456, - "grad_norm": 0.69140625, - "learning_rate": 0.0017773587359386177, - "loss": 1.4269, - "step": 856000 - }, - { - "epoch": 5.569282788217699, - "grad_norm": 0.6171875, - "learning_rate": 0.001777228688471292, - "loss": 1.4299, - "step": 856500 - }, - { - "epoch": 5.572533974900839, - "grad_norm": 4.78125, - "learning_rate": 0.0017770986410039664, - "loss": 1.4351, - "step": 857000 - }, - { - "epoch": 5.575785161583978, - "grad_norm": 2.921875, - "learning_rate": 0.001776968593536641, - "loss": 1.4311, - "step": 857500 - }, - { - "epoch": 5.579036348267118, - "grad_norm": 1.125, - "learning_rate": 0.0017768385460693152, - "loss": 1.4392, - "step": 858000 - }, - { - "epoch": 5.5822875349502565, - "grad_norm": 1.65625, - "learning_rate": 0.0017767084986019897, - "loss": 1.4358, - "step": 858500 - }, - { - "epoch": 5.585538721633396, - "grad_norm": 1.09375, - "learning_rate": 0.0017765784511346641, - "loss": 1.4362, - "step": 859000 - }, - { - "epoch": 5.588789908316535, - "grad_norm": 0.7421875, - "learning_rate": 0.0017764484036673384, - "loss": 1.4434, - "step": 859500 - }, - { - "epoch": 5.592041094999675, - "grad_norm": 0.58984375, - "learning_rate": 0.001776318356200013, - "loss": 1.4344, - "step": 860000 - }, - { - "epoch": 5.595292281682815, - "grad_norm": 1.0625, - "learning_rate": 0.0017761883087326876, - "loss": 1.4339, - "step": 860500 - }, - { - "epoch": 5.598543468365953, - "grad_norm": 0.65234375, - "learning_rate": 0.001776058261265362, - "loss": 1.4316, - "step": 861000 - }, - { - "epoch": 5.601794655049093, - "grad_norm": 0.75390625, - "learning_rate": 0.0017759282137980363, - "loss": 1.4352, - "step": 861500 - }, - { - "epoch": 5.605045841732232, - "grad_norm": 0.97265625, - "learning_rate": 0.0017757981663307108, - "loss": 1.4341, - "step": 862000 - }, - { - "epoch": 5.608297028415372, - "grad_norm": 3.8125, - "learning_rate": 0.0017756681188633853, - "loss": 1.4367, - "step": 862500 - }, - { - "epoch": 5.6115482150985105, - "grad_norm": 0.7421875, - "learning_rate": 0.0017755380713960595, - "loss": 1.4371, - "step": 863000 - }, - { - "epoch": 5.61479940178165, - "grad_norm": 0.7734375, - "learning_rate": 0.001775408023928734, - "loss": 1.434, - "step": 863500 - }, - { - "epoch": 5.61805058846479, - "grad_norm": 0.7578125, - "learning_rate": 0.0017752779764614085, - 
"loss": 1.4328, - "step": 864000 - }, - { - "epoch": 5.621301775147929, - "grad_norm": 1.6171875, - "learning_rate": 0.0017751479289940828, - "loss": 1.4349, - "step": 864500 - }, - { - "epoch": 5.624552961831069, - "grad_norm": 1.0, - "learning_rate": 0.0017750178815267572, - "loss": 1.4316, - "step": 865000 - }, - { - "epoch": 5.627804148514207, - "grad_norm": 0.7421875, - "learning_rate": 0.0017748878340594317, - "loss": 1.4332, - "step": 865500 - }, - { - "epoch": 5.631055335197347, - "grad_norm": 1.1484375, - "learning_rate": 0.001774757786592106, - "loss": 1.4338, - "step": 866000 - }, - { - "epoch": 5.634306521880486, - "grad_norm": 0.9375, - "learning_rate": 0.0017746277391247805, - "loss": 1.4292, - "step": 866500 - }, - { - "epoch": 5.637557708563626, - "grad_norm": 0.7734375, - "learning_rate": 0.0017744976916574551, - "loss": 1.4338, - "step": 867000 - }, - { - "epoch": 5.640808895246765, - "grad_norm": 0.859375, - "learning_rate": 0.0017743676441901296, - "loss": 1.4284, - "step": 867500 - }, - { - "epoch": 5.644060081929904, - "grad_norm": 0.7109375, - "learning_rate": 0.0017742375967228039, - "loss": 1.4261, - "step": 868000 - }, - { - "epoch": 5.647311268613044, - "grad_norm": 1.390625, - "learning_rate": 0.0017741075492554784, - "loss": 1.4281, - "step": 868500 - }, - { - "epoch": 5.650562455296183, - "grad_norm": 0.71484375, - "learning_rate": 0.0017739775017881528, - "loss": 1.4258, - "step": 869000 - }, - { - "epoch": 5.653813641979323, - "grad_norm": 0.640625, - "learning_rate": 0.001773847454320827, - "loss": 1.4271, - "step": 869500 - }, - { - "epoch": 5.657064828662461, - "grad_norm": 0.74609375, - "learning_rate": 0.0017737174068535016, - "loss": 1.4319, - "step": 870000 - }, - { - "epoch": 5.660316015345601, - "grad_norm": 0.86328125, - "learning_rate": 0.001773587359386176, - "loss": 1.4392, - "step": 870500 - }, - { - "epoch": 5.663567202028741, - "grad_norm": 0.65625, - "learning_rate": 0.0017734573119188503, - "loss": 1.4391, - "step": 871000 - }, - { - "epoch": 5.66681838871188, - "grad_norm": 0.57421875, - "learning_rate": 0.0017733272644515248, - "loss": 1.4384, - "step": 871500 - }, - { - "epoch": 5.670069575395019, - "grad_norm": 0.69140625, - "learning_rate": 0.0017731972169841993, - "loss": 1.433, - "step": 872000 - }, - { - "epoch": 5.673320762078158, - "grad_norm": 0.671875, - "learning_rate": 0.0017730671695168735, - "loss": 1.4417, - "step": 872500 - }, - { - "epoch": 5.676571948761298, - "grad_norm": 0.62890625, - "learning_rate": 0.001772937122049548, - "loss": 1.4341, - "step": 873000 - }, - { - "epoch": 5.679823135444437, - "grad_norm": 1.03125, - "learning_rate": 0.0017728070745822225, - "loss": 1.4381, - "step": 873500 - }, - { - "epoch": 5.683074322127577, - "grad_norm": 0.63671875, - "learning_rate": 0.0017726770271148968, - "loss": 1.4375, - "step": 874000 - }, - { - "epoch": 5.686325508810716, - "grad_norm": 0.88671875, - "learning_rate": 0.0017725469796475715, - "loss": 1.4401, - "step": 874500 - }, - { - "epoch": 5.689576695493855, - "grad_norm": 0.85546875, - "learning_rate": 0.001772416932180246, - "loss": 1.4388, - "step": 875000 - }, - { - "epoch": 5.692827882176995, - "grad_norm": 1.6328125, - "learning_rate": 0.0017722868847129204, - "loss": 1.4357, - "step": 875500 - }, - { - "epoch": 5.696079068860134, - "grad_norm": 0.8671875, - "learning_rate": 0.0017721568372455947, - "loss": 1.4377, - "step": 876000 - }, - { - "epoch": 5.699330255543273, - "grad_norm": 0.94140625, - "learning_rate": 0.0017720267897782692, - "loss": 1.433, - 
"step": 876500 - }, - { - "epoch": 5.702581442226412, - "grad_norm": 4.15625, - "learning_rate": 0.0017718967423109436, - "loss": 1.4328, - "step": 877000 - }, - { - "epoch": 5.705832628909552, - "grad_norm": 0.89453125, - "learning_rate": 0.001771766694843618, - "loss": 1.4349, - "step": 877500 - }, - { - "epoch": 5.709083815592692, - "grad_norm": 0.71484375, - "learning_rate": 0.0017716366473762924, - "loss": 1.4363, - "step": 878000 - }, - { - "epoch": 5.712335002275831, - "grad_norm": 1.5546875, - "learning_rate": 0.0017715065999089669, - "loss": 1.4388, - "step": 878500 - }, - { - "epoch": 5.71558618895897, - "grad_norm": 0.78125, - "learning_rate": 0.0017713765524416411, - "loss": 1.4395, - "step": 879000 - }, - { - "epoch": 5.718837375642109, - "grad_norm": 0.62890625, - "learning_rate": 0.0017712465049743156, - "loss": 1.4353, - "step": 879500 - }, - { - "epoch": 5.722088562325249, - "grad_norm": 0.8515625, - "learning_rate": 0.00177111645750699, - "loss": 1.4373, - "step": 880000 - }, - { - "epoch": 5.725339749008388, - "grad_norm": 0.6953125, - "learning_rate": 0.0017709864100396643, - "loss": 1.4349, - "step": 880500 - }, - { - "epoch": 5.728590935691527, - "grad_norm": 0.65625, - "learning_rate": 0.0017708563625723388, - "loss": 1.4382, - "step": 881000 - }, - { - "epoch": 5.731842122374667, - "grad_norm": 0.71875, - "learning_rate": 0.0017707263151050135, - "loss": 1.4353, - "step": 881500 - }, - { - "epoch": 5.735093309057806, - "grad_norm": 0.7421875, - "learning_rate": 0.001770596267637688, - "loss": 1.4369, - "step": 882000 - }, - { - "epoch": 5.738344495740946, - "grad_norm": 0.69921875, - "learning_rate": 0.0017704662201703622, - "loss": 1.4347, - "step": 882500 - }, - { - "epoch": 5.741595682424085, - "grad_norm": 0.6484375, - "learning_rate": 0.0017703361727030367, - "loss": 1.4345, - "step": 883000 - }, - { - "epoch": 5.744846869107224, - "grad_norm": 0.82421875, - "learning_rate": 0.0017702061252357112, - "loss": 1.4395, - "step": 883500 - }, - { - "epoch": 5.748098055790363, - "grad_norm": 0.58984375, - "learning_rate": 0.0017700760777683855, - "loss": 1.4383, - "step": 884000 - }, - { - "epoch": 5.751349242473503, - "grad_norm": 0.68359375, - "learning_rate": 0.00176994603030106, - "loss": 1.4386, - "step": 884500 - }, - { - "epoch": 5.754600429156643, - "grad_norm": 1.25, - "learning_rate": 0.0017698159828337344, - "loss": 1.4323, - "step": 885000 - }, - { - "epoch": 5.7578516158397814, - "grad_norm": 0.6796875, - "learning_rate": 0.0017696859353664087, - "loss": 1.4471, - "step": 885500 - }, - { - "epoch": 5.761102802522921, - "grad_norm": 0.6953125, - "learning_rate": 0.0017695558878990832, - "loss": 1.437, - "step": 886000 - }, - { - "epoch": 5.76435398920606, - "grad_norm": 0.640625, - "learning_rate": 0.0017694258404317576, - "loss": 1.4366, - "step": 886500 - }, - { - "epoch": 5.7676051758892, - "grad_norm": 0.74609375, - "learning_rate": 0.001769295792964432, - "loss": 1.4318, - "step": 887000 - }, - { - "epoch": 5.770856362572339, - "grad_norm": 0.8671875, - "learning_rate": 0.0017691657454971064, - "loss": 1.4392, - "step": 887500 - }, - { - "epoch": 5.774107549255478, - "grad_norm": 0.80078125, - "learning_rate": 0.0017690356980297809, - "loss": 1.437, - "step": 888000 - }, - { - "epoch": 5.777358735938618, - "grad_norm": 0.8203125, - "learning_rate": 0.0017689056505624551, - "loss": 1.4328, - "step": 888500 - }, - { - "epoch": 5.780609922621757, - "grad_norm": 0.74609375, - "learning_rate": 0.0017687756030951298, - "loss": 1.4395, - "step": 889000 - }, 
- { - "epoch": 5.783861109304897, - "grad_norm": 0.96875, - "learning_rate": 0.0017686455556278043, - "loss": 1.4352, - "step": 889500 - }, - { - "epoch": 5.7871122959880354, - "grad_norm": 1.15625, - "learning_rate": 0.0017685155081604788, - "loss": 1.4312, - "step": 890000 - }, - { - "epoch": 5.790363482671175, - "grad_norm": 0.875, - "learning_rate": 0.001768385460693153, - "loss": 1.4387, - "step": 890500 - }, - { - "epoch": 5.793614669354314, - "grad_norm": 1.109375, - "learning_rate": 0.0017682554132258275, - "loss": 1.4412, - "step": 891000 - }, - { - "epoch": 5.796865856037454, - "grad_norm": 0.6796875, - "learning_rate": 0.001768125365758502, - "loss": 1.4367, - "step": 891500 - }, - { - "epoch": 5.8001170427205935, - "grad_norm": 0.8359375, - "learning_rate": 0.0017679953182911763, - "loss": 1.4418, - "step": 892000 - }, - { - "epoch": 5.803368229403732, - "grad_norm": 0.7578125, - "learning_rate": 0.0017678652708238507, - "loss": 1.4393, - "step": 892500 - }, - { - "epoch": 5.806619416086872, - "grad_norm": 0.71875, - "learning_rate": 0.0017677352233565252, - "loss": 1.4412, - "step": 893000 - }, - { - "epoch": 5.809870602770011, - "grad_norm": 0.81640625, - "learning_rate": 0.0017676051758891995, - "loss": 1.4506, - "step": 893500 - }, - { - "epoch": 5.813121789453151, - "grad_norm": 0.73046875, - "learning_rate": 0.001767475128421874, - "loss": 1.4387, - "step": 894000 - }, - { - "epoch": 5.8163729761362895, - "grad_norm": 1.046875, - "learning_rate": 0.0017673450809545484, - "loss": 1.4368, - "step": 894500 - }, - { - "epoch": 5.819624162819429, - "grad_norm": 2.6875, - "learning_rate": 0.0017672150334872227, - "loss": 1.4371, - "step": 895000 - }, - { - "epoch": 5.822875349502569, - "grad_norm": 1.2109375, - "learning_rate": 0.0017670849860198972, - "loss": 1.4433, - "step": 895500 - }, - { - "epoch": 5.826126536185708, - "grad_norm": 0.73828125, - "learning_rate": 0.0017669549385525719, - "loss": 1.4398, - "step": 896000 - }, - { - "epoch": 5.8293777228688475, - "grad_norm": 0.9609375, - "learning_rate": 0.0017668248910852463, - "loss": 1.4315, - "step": 896500 - }, - { - "epoch": 5.832628909551986, - "grad_norm": 0.71875, - "learning_rate": 0.0017666948436179206, - "loss": 1.4369, - "step": 897000 - }, - { - "epoch": 5.835880096235126, - "grad_norm": 0.74609375, - "learning_rate": 0.001766564796150595, - "loss": 1.4403, - "step": 897500 - }, - { - "epoch": 5.839131282918265, - "grad_norm": 0.859375, - "learning_rate": 0.0017664347486832696, - "loss": 1.4389, - "step": 898000 - }, - { - "epoch": 5.842382469601405, - "grad_norm": 0.88671875, - "learning_rate": 0.0017663047012159438, - "loss": 1.44, - "step": 898500 - }, - { - "epoch": 5.845633656284544, - "grad_norm": 0.8359375, - "learning_rate": 0.0017661746537486183, - "loss": 1.4428, - "step": 899000 - }, - { - "epoch": 5.848884842967683, - "grad_norm": 0.7734375, - "learning_rate": 0.0017660446062812928, - "loss": 1.4401, - "step": 899500 - }, - { - "epoch": 5.852136029650823, - "grad_norm": 1.09375, - "learning_rate": 0.001765914558813967, - "loss": 1.4453, - "step": 900000 - }, - { - "epoch": 5.855387216333962, - "grad_norm": 0.8125, - "learning_rate": 0.0017657845113466415, - "loss": 1.4403, - "step": 900500 - }, - { - "epoch": 5.8586384030171015, - "grad_norm": 0.61328125, - "learning_rate": 0.001765654463879316, - "loss": 1.4371, - "step": 901000 - }, - { - "epoch": 5.86188958970024, - "grad_norm": 1.2890625, - "learning_rate": 0.0017655244164119903, - "loss": 1.4415, - "step": 901500 - }, - { - "epoch": 
5.86514077638338, - "grad_norm": 0.76171875, - "learning_rate": 0.0017653943689446647, - "loss": 1.4538, - "step": 902000 - }, - { - "epoch": 5.86839196306652, - "grad_norm": 1.0625, - "learning_rate": 0.0017652643214773392, - "loss": 1.4442, - "step": 902500 - }, - { - "epoch": 5.871643149749659, - "grad_norm": 0.6484375, - "learning_rate": 0.0017651342740100135, - "loss": 1.4423, - "step": 903000 - }, - { - "epoch": 5.874894336432798, - "grad_norm": 0.66796875, - "learning_rate": 0.0017650042265426882, - "loss": 1.4468, - "step": 903500 - }, - { - "epoch": 5.878145523115937, - "grad_norm": 0.703125, - "learning_rate": 0.0017648741790753627, - "loss": 1.4504, - "step": 904000 - }, - { - "epoch": 5.881396709799077, - "grad_norm": 0.765625, - "learning_rate": 0.0017647441316080371, - "loss": 1.4504, - "step": 904500 - }, - { - "epoch": 5.884647896482216, - "grad_norm": 1.0390625, - "learning_rate": 0.0017646140841407114, - "loss": 1.4506, - "step": 905000 - }, - { - "epoch": 5.8878990831653555, - "grad_norm": 1.1015625, - "learning_rate": 0.0017644840366733859, - "loss": 1.4464, - "step": 905500 - }, - { - "epoch": 5.891150269848494, - "grad_norm": 0.796875, - "learning_rate": 0.0017643539892060604, - "loss": 1.4466, - "step": 906000 - }, - { - "epoch": 5.894401456531634, - "grad_norm": 0.859375, - "learning_rate": 0.0017642239417387346, - "loss": 1.4492, - "step": 906500 - }, - { - "epoch": 5.897652643214773, - "grad_norm": 1.2265625, - "learning_rate": 0.001764093894271409, - "loss": 1.4484, - "step": 907000 - }, - { - "epoch": 5.900903829897913, - "grad_norm": 0.75, - "learning_rate": 0.0017639638468040836, - "loss": 1.453, - "step": 907500 - }, - { - "epoch": 5.904155016581052, - "grad_norm": 1.234375, - "learning_rate": 0.0017638337993367578, - "loss": 1.4505, - "step": 908000 - }, - { - "epoch": 5.907406203264191, - "grad_norm": 0.96484375, - "learning_rate": 0.0017637037518694323, - "loss": 1.4462, - "step": 908500 - }, - { - "epoch": 5.910657389947331, - "grad_norm": 0.6796875, - "learning_rate": 0.0017635737044021068, - "loss": 1.4504, - "step": 909000 - }, - { - "epoch": 5.91390857663047, - "grad_norm": 0.66796875, - "learning_rate": 0.001763443656934781, - "loss": 1.4482, - "step": 909500 - }, - { - "epoch": 5.9171597633136095, - "grad_norm": 0.57421875, - "learning_rate": 0.0017633136094674555, - "loss": 1.448, - "step": 910000 - }, - { - "epoch": 5.920410949996748, - "grad_norm": 1.0078125, - "learning_rate": 0.0017631835620001302, - "loss": 1.4487, - "step": 910500 - }, - { - "epoch": 5.923662136679888, - "grad_norm": 0.8125, - "learning_rate": 0.0017630535145328047, - "loss": 1.4487, - "step": 911000 - }, - { - "epoch": 5.926913323363028, - "grad_norm": 2.140625, - "learning_rate": 0.001762923467065479, - "loss": 1.451, - "step": 911500 - }, - { - "epoch": 5.930164510046167, - "grad_norm": 0.6640625, - "learning_rate": 0.0017627934195981534, - "loss": 1.4477, - "step": 912000 - }, - { - "epoch": 5.933415696729306, - "grad_norm": 0.94140625, - "learning_rate": 0.001762663372130828, - "loss": 1.4422, - "step": 912500 - }, - { - "epoch": 5.936666883412445, - "grad_norm": 0.67578125, - "learning_rate": 0.0017625333246635022, - "loss": 1.4406, - "step": 913000 - }, - { - "epoch": 5.939918070095585, - "grad_norm": 0.65625, - "learning_rate": 0.0017624032771961767, - "loss": 1.4463, - "step": 913500 - }, - { - "epoch": 5.943169256778724, - "grad_norm": 0.66015625, - "learning_rate": 0.0017622732297288511, - "loss": 1.4441, - "step": 914000 - }, - { - "epoch": 5.9464204434618635, - 
"grad_norm": 0.96484375, - "learning_rate": 0.0017621431822615254, - "loss": 1.4461, - "step": 914500 - }, - { - "epoch": 5.949671630145003, - "grad_norm": 0.69921875, - "learning_rate": 0.0017620131347941999, - "loss": 1.4471, - "step": 915000 - }, - { - "epoch": 5.952922816828142, - "grad_norm": 3.171875, - "learning_rate": 0.0017618830873268744, - "loss": 1.4448, - "step": 915500 - }, - { - "epoch": 5.956174003511282, - "grad_norm": 0.6640625, - "learning_rate": 0.0017617530398595486, - "loss": 1.4441, - "step": 916000 - }, - { - "epoch": 5.959425190194421, - "grad_norm": 0.875, - "learning_rate": 0.001761622992392223, - "loss": 1.4413, - "step": 916500 - }, - { - "epoch": 5.96267637687756, - "grad_norm": 0.61328125, - "learning_rate": 0.0017614929449248976, - "loss": 1.4439, - "step": 917000 - }, - { - "epoch": 5.965927563560699, - "grad_norm": 0.75, - "learning_rate": 0.0017613628974575718, - "loss": 1.4338, - "step": 917500 - }, - { - "epoch": 5.969178750243839, - "grad_norm": 0.79296875, - "learning_rate": 0.0017612328499902465, - "loss": 1.4402, - "step": 918000 - }, - { - "epoch": 5.972429936926979, - "grad_norm": 0.77734375, - "learning_rate": 0.001761102802522921, - "loss": 1.4474, - "step": 918500 - }, - { - "epoch": 5.9756811236101175, - "grad_norm": 0.7890625, - "learning_rate": 0.0017609727550555955, - "loss": 1.4411, - "step": 919000 - }, - { - "epoch": 5.978932310293257, - "grad_norm": 1.5390625, - "learning_rate": 0.0017608427075882698, - "loss": 1.4379, - "step": 919500 - }, - { - "epoch": 5.982183496976396, - "grad_norm": 0.828125, - "learning_rate": 0.0017607126601209442, - "loss": 1.4487, - "step": 920000 - }, - { - "epoch": 5.985434683659536, - "grad_norm": 0.66015625, - "learning_rate": 0.0017605826126536187, - "loss": 1.4472, - "step": 920500 - }, - { - "epoch": 5.988685870342675, - "grad_norm": 1.015625, - "learning_rate": 0.001760452565186293, - "loss": 1.4468, - "step": 921000 - }, - { - "epoch": 5.991937057025814, - "grad_norm": 0.9296875, - "learning_rate": 0.0017603225177189675, - "loss": 1.443, - "step": 921500 - }, - { - "epoch": 5.995188243708954, - "grad_norm": 0.5859375, - "learning_rate": 0.001760192470251642, - "loss": 1.4388, - "step": 922000 - }, - { - "epoch": 5.998439430392093, - "grad_norm": 1.296875, - "learning_rate": 0.0017600624227843162, - "loss": 1.443, - "step": 922500 - }, - { - "epoch": 6.0, - "eval_loss": 1.4233994483947754, - "eval_runtime": 0.5293, - "eval_samples_per_second": 1889.415, - "eval_steps_per_second": 30.231, - "step": 922740 - }, - { - "epoch": 6.001690617075233, - "grad_norm": 0.6171875, - "learning_rate": 0.0017599323753169907, - "loss": 1.4447, - "step": 923000 - }, - { - "epoch": 6.0049418037583715, - "grad_norm": 0.89453125, - "learning_rate": 0.0017598023278496651, - "loss": 1.4429, - "step": 923500 - }, - { - "epoch": 6.008192990441511, - "grad_norm": 3.890625, - "learning_rate": 0.0017596722803823394, - "loss": 1.4379, - "step": 924000 - }, - { - "epoch": 6.01144417712465, - "grad_norm": 3.109375, - "learning_rate": 0.0017595422329150139, - "loss": 1.4337, - "step": 924500 - }, - { - "epoch": 6.01469536380779, - "grad_norm": 2.984375, - "learning_rate": 0.0017594121854476886, - "loss": 1.4372, - "step": 925000 - }, - { - "epoch": 6.0179465504909295, - "grad_norm": 0.94921875, - "learning_rate": 0.001759282137980363, - "loss": 1.4368, - "step": 925500 - }, - { - "epoch": 6.021197737174068, - "grad_norm": 0.921875, - "learning_rate": 0.0017591520905130373, - "loss": 1.4414, - "step": 926000 - }, - { - "epoch": 
6.024448923857208, - "grad_norm": 0.7265625, - "learning_rate": 0.0017590220430457118, - "loss": 1.4377, - "step": 926500 - }, - { - "epoch": 6.027700110540347, - "grad_norm": 0.73046875, - "learning_rate": 0.0017588919955783863, - "loss": 1.4385, - "step": 927000 - }, - { - "epoch": 6.030951297223487, - "grad_norm": 0.8828125, - "learning_rate": 0.0017587619481110605, - "loss": 1.437, - "step": 927500 - }, - { - "epoch": 6.0342024839066255, - "grad_norm": 0.72265625, - "learning_rate": 0.001758631900643735, - "loss": 1.4363, - "step": 928000 - }, - { - "epoch": 6.037453670589765, - "grad_norm": 0.734375, - "learning_rate": 0.0017585018531764095, - "loss": 1.4425, - "step": 928500 - }, - { - "epoch": 6.040704857272905, - "grad_norm": 0.7890625, - "learning_rate": 0.0017583718057090838, - "loss": 1.4346, - "step": 929000 - }, - { - "epoch": 6.043956043956044, - "grad_norm": 1.0546875, - "learning_rate": 0.0017582417582417582, - "loss": 1.4351, - "step": 929500 - }, - { - "epoch": 6.0472072306391835, - "grad_norm": 0.734375, - "learning_rate": 0.0017581117107744327, - "loss": 1.4381, - "step": 930000 - }, - { - "epoch": 6.050458417322322, - "grad_norm": 1.2890625, - "learning_rate": 0.001757981663307107, - "loss": 1.4369, - "step": 930500 - }, - { - "epoch": 6.053709604005462, - "grad_norm": 0.68359375, - "learning_rate": 0.0017578516158397815, - "loss": 1.4339, - "step": 931000 - }, - { - "epoch": 6.056960790688601, - "grad_norm": 1.5234375, - "learning_rate": 0.001757721568372456, - "loss": 1.432, - "step": 931500 - }, - { - "epoch": 6.060211977371741, - "grad_norm": 0.64453125, - "learning_rate": 0.0017575915209051302, - "loss": 1.429, - "step": 932000 - }, - { - "epoch": 6.06346316405488, - "grad_norm": 0.72265625, - "learning_rate": 0.001757461473437805, - "loss": 1.4366, - "step": 932500 - }, - { - "epoch": 6.066714350738019, - "grad_norm": 1.3828125, - "learning_rate": 0.0017573314259704794, - "loss": 1.4358, - "step": 933000 - }, - { - "epoch": 6.069965537421159, - "grad_norm": 0.8984375, - "learning_rate": 0.0017572013785031539, - "loss": 1.4321, - "step": 933500 - }, - { - "epoch": 6.073216724104298, - "grad_norm": 0.6796875, - "learning_rate": 0.0017570713310358281, - "loss": 1.4307, - "step": 934000 - }, - { - "epoch": 6.0764679107874375, - "grad_norm": 0.734375, - "learning_rate": 0.0017569412835685026, - "loss": 1.4316, - "step": 934500 - }, - { - "epoch": 6.079719097470576, - "grad_norm": 0.5859375, - "learning_rate": 0.001756811236101177, - "loss": 1.4334, - "step": 935000 - }, - { - "epoch": 6.082970284153716, - "grad_norm": 15.6875, - "learning_rate": 0.0017566811886338513, - "loss": 1.4387, - "step": 935500 - }, - { - "epoch": 6.086221470836856, - "grad_norm": 1.28125, - "learning_rate": 0.0017565511411665258, - "loss": 1.4378, - "step": 936000 - }, - { - "epoch": 6.089472657519995, - "grad_norm": 1.3203125, - "learning_rate": 0.0017564210936992003, - "loss": 1.438, - "step": 936500 - }, - { - "epoch": 6.092723844203134, - "grad_norm": 0.94921875, - "learning_rate": 0.0017562910462318746, - "loss": 1.4442, - "step": 937000 - }, - { - "epoch": 6.095975030886273, - "grad_norm": 0.6953125, - "learning_rate": 0.001756160998764549, - "loss": 1.4395, - "step": 937500 - }, - { - "epoch": 6.099226217569413, - "grad_norm": 0.69140625, - "learning_rate": 0.0017560309512972235, - "loss": 1.4356, - "step": 938000 - }, - { - "epoch": 6.102477404252552, - "grad_norm": 0.8125, - "learning_rate": 0.0017559009038298978, - "loss": 1.4354, - "step": 938500 - }, - { - "epoch": 
6.1057285909356915, - "grad_norm": 1.59375, - "learning_rate": 0.0017557708563625722, - "loss": 1.4383, - "step": 939000 - }, - { - "epoch": 6.108979777618831, - "grad_norm": 0.78515625, - "learning_rate": 0.001755640808895247, - "loss": 1.4348, - "step": 939500 - }, - { - "epoch": 6.11223096430197, - "grad_norm": 0.82421875, - "learning_rate": 0.0017555107614279214, - "loss": 1.4367, - "step": 940000 - }, - { - "epoch": 6.11548215098511, - "grad_norm": 1.3828125, - "learning_rate": 0.0017553807139605957, - "loss": 1.4381, - "step": 940500 - }, - { - "epoch": 6.118733337668249, - "grad_norm": 0.89453125, - "learning_rate": 0.0017552506664932702, - "loss": 1.4336, - "step": 941000 - }, - { - "epoch": 6.121984524351388, - "grad_norm": 0.9140625, - "learning_rate": 0.0017551206190259446, - "loss": 1.4387, - "step": 941500 - }, - { - "epoch": 6.125235711034527, - "grad_norm": 0.640625, - "learning_rate": 0.001754990571558619, - "loss": 1.4386, - "step": 942000 - }, - { - "epoch": 6.128486897717667, - "grad_norm": 0.96875, - "learning_rate": 0.0017548605240912934, - "loss": 1.4418, - "step": 942500 - }, - { - "epoch": 6.131738084400807, - "grad_norm": 0.87109375, - "learning_rate": 0.0017547304766239679, - "loss": 1.4401, - "step": 943000 - }, - { - "epoch": 6.1349892710839455, - "grad_norm": 4.8125, - "learning_rate": 0.0017546004291566421, - "loss": 1.4466, - "step": 943500 - }, - { - "epoch": 6.138240457767085, - "grad_norm": 0.8203125, - "learning_rate": 0.0017544703816893166, - "loss": 1.4419, - "step": 944000 - }, - { - "epoch": 6.141491644450224, - "grad_norm": 0.69140625, - "learning_rate": 0.001754340334221991, - "loss": 1.4366, - "step": 944500 - }, - { - "epoch": 6.144742831133364, - "grad_norm": 0.99609375, - "learning_rate": 0.0017542102867546653, - "loss": 1.4378, - "step": 945000 - }, - { - "epoch": 6.147994017816503, - "grad_norm": 2.484375, - "learning_rate": 0.0017540802392873398, - "loss": 1.4372, - "step": 945500 - }, - { - "epoch": 6.151245204499642, - "grad_norm": 0.92578125, - "learning_rate": 0.0017539501918200143, - "loss": 1.4418, - "step": 946000 - }, - { - "epoch": 6.154496391182782, - "grad_norm": 0.70703125, - "learning_rate": 0.0017538201443526886, - "loss": 1.4448, - "step": 946500 - }, - { - "epoch": 6.157747577865921, - "grad_norm": 0.82421875, - "learning_rate": 0.0017536900968853633, - "loss": 1.4402, - "step": 947000 - }, - { - "epoch": 6.160998764549061, - "grad_norm": 0.73046875, - "learning_rate": 0.0017535600494180377, - "loss": 1.44, - "step": 947500 - }, - { - "epoch": 6.1642499512321995, - "grad_norm": 0.7421875, - "learning_rate": 0.0017534300019507122, - "loss": 1.4443, - "step": 948000 - }, - { - "epoch": 6.167501137915339, - "grad_norm": 0.59765625, - "learning_rate": 0.0017532999544833865, - "loss": 1.4419, - "step": 948500 - }, - { - "epoch": 6.170752324598478, - "grad_norm": 1.5234375, - "learning_rate": 0.001753169907016061, - "loss": 1.4433, - "step": 949000 - }, - { - "epoch": 6.174003511281618, - "grad_norm": 0.6796875, - "learning_rate": 0.0017530398595487354, - "loss": 1.4432, - "step": 949500 - }, - { - "epoch": 6.177254697964758, - "grad_norm": 0.83203125, - "learning_rate": 0.0017529098120814097, - "loss": 1.4434, - "step": 950000 - }, - { - "epoch": 6.180505884647896, - "grad_norm": 0.8828125, - "learning_rate": 0.0017527797646140842, - "loss": 1.4415, - "step": 950500 - }, - { - "epoch": 6.183757071331036, - "grad_norm": 1.4296875, - "learning_rate": 0.0017526497171467586, - "loss": 1.4471, - "step": 951000 - }, - { - "epoch": 
6.187008258014175, - "grad_norm": 0.875, - "learning_rate": 0.001752519669679433, - "loss": 1.4471, - "step": 951500 - }, - { - "epoch": 6.190259444697315, - "grad_norm": 0.796875, - "learning_rate": 0.0017523896222121074, - "loss": 1.4398, - "step": 952000 - }, - { - "epoch": 6.1935106313804535, - "grad_norm": 0.6328125, - "learning_rate": 0.0017522595747447819, - "loss": 1.4425, - "step": 952500 - }, - { - "epoch": 6.196761818063593, - "grad_norm": 0.73046875, - "learning_rate": 0.0017521295272774561, - "loss": 1.4445, - "step": 953000 - }, - { - "epoch": 6.200013004746733, - "grad_norm": 0.609375, - "learning_rate": 0.0017519994798101306, - "loss": 1.447, - "step": 953500 - }, - { - "epoch": 6.203264191429872, - "grad_norm": 0.65234375, - "learning_rate": 0.0017518694323428053, - "loss": 1.4354, - "step": 954000 - }, - { - "epoch": 6.206515378113012, - "grad_norm": 1.0234375, - "learning_rate": 0.0017517393848754798, - "loss": 1.436, - "step": 954500 - }, - { - "epoch": 6.20976656479615, - "grad_norm": 1.140625, - "learning_rate": 0.001751609337408154, - "loss": 1.4396, - "step": 955000 - }, - { - "epoch": 6.21301775147929, - "grad_norm": 0.6171875, - "learning_rate": 0.0017514792899408285, - "loss": 1.4427, - "step": 955500 - }, - { - "epoch": 6.216268938162429, - "grad_norm": 0.8203125, - "learning_rate": 0.001751349242473503, - "loss": 1.4362, - "step": 956000 - }, - { - "epoch": 6.219520124845569, - "grad_norm": 0.7890625, - "learning_rate": 0.0017512191950061773, - "loss": 1.4419, - "step": 956500 - }, - { - "epoch": 6.2227713115287075, - "grad_norm": 0.7578125, - "learning_rate": 0.0017510891475388517, - "loss": 1.4488, - "step": 957000 - }, - { - "epoch": 6.226022498211847, - "grad_norm": 0.76171875, - "learning_rate": 0.0017509591000715262, - "loss": 1.4408, - "step": 957500 - }, - { - "epoch": 6.229273684894987, - "grad_norm": 0.796875, - "learning_rate": 0.0017508290526042005, - "loss": 1.4444, - "step": 958000 - }, - { - "epoch": 6.232524871578126, - "grad_norm": 0.90234375, - "learning_rate": 0.001750699005136875, - "loss": 1.448, - "step": 958500 - }, - { - "epoch": 6.235776058261266, - "grad_norm": 2.6875, - "learning_rate": 0.0017505689576695494, - "loss": 1.4426, - "step": 959000 - }, - { - "epoch": 6.239027244944404, - "grad_norm": 0.7734375, - "learning_rate": 0.0017504389102022237, - "loss": 1.4409, - "step": 959500 - }, - { - "epoch": 6.242278431627544, - "grad_norm": 0.625, - "learning_rate": 0.0017503088627348982, - "loss": 1.4349, - "step": 960000 - }, - { - "epoch": 6.245529618310683, - "grad_norm": 16.875, - "learning_rate": 0.0017501788152675727, - "loss": 1.4433, - "step": 960500 - }, - { - "epoch": 6.248780804993823, - "grad_norm": 1.5390625, - "learning_rate": 0.001750048767800247, - "loss": 1.4417, - "step": 961000 - }, - { - "epoch": 6.252031991676962, - "grad_norm": 0.79296875, - "learning_rate": 0.0017499187203329216, - "loss": 1.4414, - "step": 961500 - }, - { - "epoch": 6.255283178360101, - "grad_norm": 0.69921875, - "learning_rate": 0.001749788672865596, - "loss": 1.4424, - "step": 962000 - }, - { - "epoch": 6.258534365043241, - "grad_norm": 0.83984375, - "learning_rate": 0.0017496586253982706, - "loss": 1.4417, - "step": 962500 - }, - { - "epoch": 6.26178555172638, - "grad_norm": 0.65234375, - "learning_rate": 0.0017495285779309448, - "loss": 1.4416, - "step": 963000 - }, - { - "epoch": 6.26503673840952, - "grad_norm": 1.7421875, - "learning_rate": 0.0017493985304636193, - "loss": 1.4448, - "step": 963500 - }, - { - "epoch": 6.268287925092658, - 
"grad_norm": 1.1484375, - "learning_rate": 0.0017492684829962938, - "loss": 1.4438, - "step": 964000 - }, - { - "epoch": 6.271539111775798, - "grad_norm": 1.0625, - "learning_rate": 0.001749138435528968, - "loss": 1.443, - "step": 964500 - }, - { - "epoch": 6.274790298458938, - "grad_norm": 0.92578125, - "learning_rate": 0.0017490083880616425, - "loss": 1.4432, - "step": 965000 - }, - { - "epoch": 6.278041485142077, - "grad_norm": 1.8046875, - "learning_rate": 0.001748878340594317, - "loss": 1.4418, - "step": 965500 - }, - { - "epoch": 6.2812926718252164, - "grad_norm": 0.78515625, - "learning_rate": 0.0017487482931269913, - "loss": 1.4437, - "step": 966000 - }, - { - "epoch": 6.284543858508355, - "grad_norm": 0.7109375, - "learning_rate": 0.0017486182456596657, - "loss": 1.4367, - "step": 966500 - }, - { - "epoch": 6.287795045191495, - "grad_norm": 0.9375, - "learning_rate": 0.0017484881981923402, - "loss": 1.4362, - "step": 967000 - }, - { - "epoch": 6.291046231874634, - "grad_norm": 0.765625, - "learning_rate": 0.0017483581507250145, - "loss": 1.443, - "step": 967500 - }, - { - "epoch": 6.294297418557774, - "grad_norm": 0.8046875, - "learning_rate": 0.001748228103257689, - "loss": 1.4402, - "step": 968000 - }, - { - "epoch": 6.297548605240913, - "grad_norm": 0.8203125, - "learning_rate": 0.0017480980557903637, - "loss": 1.4419, - "step": 968500 - }, - { - "epoch": 6.300799791924052, - "grad_norm": 1.0390625, - "learning_rate": 0.0017479680083230381, - "loss": 1.4365, - "step": 969000 - }, - { - "epoch": 6.304050978607192, - "grad_norm": 1.203125, - "learning_rate": 0.0017478379608557124, - "loss": 1.436, - "step": 969500 - }, - { - "epoch": 6.307302165290331, - "grad_norm": 0.71484375, - "learning_rate": 0.0017477079133883869, - "loss": 1.4406, - "step": 970000 - }, - { - "epoch": 6.3105533519734704, - "grad_norm": 0.9375, - "learning_rate": 0.0017475778659210614, - "loss": 1.4384, - "step": 970500 - }, - { - "epoch": 6.313804538656609, - "grad_norm": 0.671875, - "learning_rate": 0.0017474478184537356, - "loss": 1.4433, - "step": 971000 - }, - { - "epoch": 6.317055725339749, - "grad_norm": 3.265625, - "learning_rate": 0.00174731777098641, - "loss": 1.4363, - "step": 971500 - }, - { - "epoch": 6.320306912022889, - "grad_norm": 0.71484375, - "learning_rate": 0.0017471877235190846, - "loss": 1.4395, - "step": 972000 - }, - { - "epoch": 6.323558098706028, - "grad_norm": 1.4453125, - "learning_rate": 0.0017470576760517588, - "loss": 1.4361, - "step": 972500 - }, - { - "epoch": 6.326809285389167, - "grad_norm": 0.68359375, - "learning_rate": 0.0017469276285844333, - "loss": 1.4345, - "step": 973000 - }, - { - "epoch": 6.330060472072306, - "grad_norm": 0.6484375, - "learning_rate": 0.0017467975811171078, - "loss": 1.4347, - "step": 973500 - }, - { - "epoch": 6.333311658755446, - "grad_norm": 0.9921875, - "learning_rate": 0.001746667533649782, - "loss": 1.4364, - "step": 974000 - }, - { - "epoch": 6.336562845438585, - "grad_norm": 1.1953125, - "learning_rate": 0.0017465374861824565, - "loss": 1.4367, - "step": 974500 - }, - { - "epoch": 6.3398140321217245, - "grad_norm": 5.71875, - "learning_rate": 0.001746407438715131, - "loss": 1.4332, - "step": 975000 - }, - { - "epoch": 6.343065218804864, - "grad_norm": 0.9921875, - "learning_rate": 0.0017462773912478053, - "loss": 1.4422, - "step": 975500 - }, - { - "epoch": 6.346316405488003, - "grad_norm": 0.62109375, - "learning_rate": 0.00174614734378048, - "loss": 1.4358, - "step": 976000 - }, - { - "epoch": 6.349567592171143, - "grad_norm": 
1.5234375, - "learning_rate": 0.0017460172963131545, - "loss": 1.4357, - "step": 976500 - }, - { - "epoch": 6.352818778854282, - "grad_norm": 1.15625, - "learning_rate": 0.001745887248845829, - "loss": 1.4352, - "step": 977000 - }, - { - "epoch": 6.356069965537421, - "grad_norm": 0.98046875, - "learning_rate": 0.0017457572013785032, - "loss": 1.4334, - "step": 977500 - }, - { - "epoch": 6.35932115222056, - "grad_norm": 0.796875, - "learning_rate": 0.0017456271539111777, - "loss": 1.4317, - "step": 978000 - }, - { - "epoch": 6.3625723389037, - "grad_norm": 0.86328125, - "learning_rate": 0.0017454971064438521, - "loss": 1.4379, - "step": 978500 - }, - { - "epoch": 6.365823525586839, - "grad_norm": 0.7265625, - "learning_rate": 0.0017453670589765264, - "loss": 1.4336, - "step": 979000 - }, - { - "epoch": 6.3690747122699785, - "grad_norm": 0.72265625, - "learning_rate": 0.0017452370115092009, - "loss": 1.4379, - "step": 979500 - }, - { - "epoch": 6.372325898953118, - "grad_norm": 0.71484375, - "learning_rate": 0.0017451069640418754, - "loss": 1.4391, - "step": 980000 - }, - { - "epoch": 6.375577085636257, - "grad_norm": 0.72265625, - "learning_rate": 0.0017449769165745496, - "loss": 1.4391, - "step": 980500 - }, - { - "epoch": 6.378828272319397, - "grad_norm": 4.0625, - "learning_rate": 0.001744846869107224, - "loss": 1.4431, - "step": 981000 - }, - { - "epoch": 6.382079459002536, - "grad_norm": 0.95703125, - "learning_rate": 0.0017447168216398986, - "loss": 1.4351, - "step": 981500 - }, - { - "epoch": 6.385330645685675, - "grad_norm": 0.67578125, - "learning_rate": 0.0017445867741725728, - "loss": 1.4387, - "step": 982000 - }, - { - "epoch": 6.388581832368814, - "grad_norm": 0.67578125, - "learning_rate": 0.0017444567267052473, - "loss": 1.4328, - "step": 982500 - }, - { - "epoch": 6.391833019051954, - "grad_norm": 0.6328125, - "learning_rate": 0.001744326679237922, - "loss": 1.4343, - "step": 983000 - }, - { - "epoch": 6.395084205735094, - "grad_norm": 0.73828125, - "learning_rate": 0.0017441966317705965, - "loss": 1.4335, - "step": 983500 - }, - { - "epoch": 6.3983353924182325, - "grad_norm": 0.8359375, - "learning_rate": 0.0017440665843032708, - "loss": 1.435, - "step": 984000 - }, - { - "epoch": 6.401586579101372, - "grad_norm": 0.6484375, - "learning_rate": 0.0017439365368359452, - "loss": 1.4415, - "step": 984500 - }, - { - "epoch": 6.404837765784511, - "grad_norm": 0.69921875, - "learning_rate": 0.0017438064893686197, - "loss": 1.443, - "step": 985000 - }, - { - "epoch": 6.408088952467651, - "grad_norm": 1.71875, - "learning_rate": 0.001743676441901294, - "loss": 1.4426, - "step": 985500 - }, - { - "epoch": 6.41134013915079, - "grad_norm": 15.4375, - "learning_rate": 0.0017435463944339685, - "loss": 1.4405, - "step": 986000 - }, - { - "epoch": 6.414591325833929, - "grad_norm": 0.7734375, - "learning_rate": 0.001743416346966643, - "loss": 1.4408, - "step": 986500 - }, - { - "epoch": 6.417842512517069, - "grad_norm": 0.6875, - "learning_rate": 0.0017432862994993172, - "loss": 1.4509, - "step": 987000 - }, - { - "epoch": 6.421093699200208, - "grad_norm": 0.95703125, - "learning_rate": 0.0017431562520319917, - "loss": 1.4435, - "step": 987500 - }, - { - "epoch": 6.424344885883348, - "grad_norm": 1.0234375, - "learning_rate": 0.0017430262045646662, - "loss": 1.4417, - "step": 988000 - }, - { - "epoch": 6.4275960725664865, - "grad_norm": 0.71875, - "learning_rate": 0.0017428961570973404, - "loss": 1.4566, - "step": 988500 - }, - { - "epoch": 6.430847259249626, - "grad_norm": 0.66015625, - 
"learning_rate": 0.001742766109630015, - "loss": 1.4679, - "step": 989000 - }, - { - "epoch": 6.434098445932765, - "grad_norm": 0.65234375, - "learning_rate": 0.0017426360621626894, - "loss": 1.4754, - "step": 989500 - }, - { - "epoch": 6.437349632615905, - "grad_norm": 0.68359375, - "learning_rate": 0.0017425060146953636, - "loss": 1.4804, - "step": 990000 - }, - { - "epoch": 6.4406008192990445, - "grad_norm": 0.87109375, - "learning_rate": 0.0017423759672280383, - "loss": 1.4847, - "step": 990500 - }, - { - "epoch": 6.443852005982183, - "grad_norm": 1.765625, - "learning_rate": 0.0017422459197607128, - "loss": 1.4757, - "step": 991000 - }, - { - "epoch": 6.447103192665323, - "grad_norm": 0.91796875, - "learning_rate": 0.0017421158722933873, - "loss": 1.4749, - "step": 991500 - }, - { - "epoch": 6.450354379348462, - "grad_norm": 0.8515625, - "learning_rate": 0.0017419858248260615, - "loss": 1.4674, - "step": 992000 - }, - { - "epoch": 6.453605566031602, - "grad_norm": 1.21875, - "learning_rate": 0.001741855777358736, - "loss": 1.4648, - "step": 992500 - }, - { - "epoch": 6.4568567527147405, - "grad_norm": 0.69921875, - "learning_rate": 0.0017417257298914105, - "loss": 1.453, - "step": 993000 - }, - { - "epoch": 6.46010793939788, - "grad_norm": 3.25, - "learning_rate": 0.0017415956824240848, - "loss": 1.4638, - "step": 993500 - }, - { - "epoch": 6.46335912608102, - "grad_norm": 0.8828125, - "learning_rate": 0.0017414656349567592, - "loss": 1.4796, - "step": 994000 - }, - { - "epoch": 6.466610312764159, - "grad_norm": 0.75, - "learning_rate": 0.0017413355874894337, - "loss": 1.4722, - "step": 994500 - }, - { - "epoch": 6.4698614994472985, - "grad_norm": 0.671875, - "learning_rate": 0.001741205540022108, - "loss": 1.474, - "step": 995000 - }, - { - "epoch": 6.473112686130437, - "grad_norm": 0.65625, - "learning_rate": 0.0017410754925547825, - "loss": 1.4831, - "step": 995500 - }, - { - "epoch": 6.476363872813577, - "grad_norm": 0.71875, - "learning_rate": 0.001740945445087457, - "loss": 1.4832, - "step": 996000 - }, - { - "epoch": 6.479615059496716, - "grad_norm": 0.93359375, - "learning_rate": 0.0017408153976201312, - "loss": 1.4926, - "step": 996500 - }, - { - "epoch": 6.482866246179856, - "grad_norm": 0.77734375, - "learning_rate": 0.0017406853501528057, - "loss": 1.489, - "step": 997000 - }, - { - "epoch": 6.486117432862995, - "grad_norm": 0.71484375, - "learning_rate": 0.0017405553026854804, - "loss": 1.494, - "step": 997500 - }, - { - "epoch": 6.489368619546134, - "grad_norm": 0.65625, - "learning_rate": 0.0017404252552181549, - "loss": 1.4693, - "step": 998000 - }, - { - "epoch": 6.492619806229274, - "grad_norm": 1.3828125, - "learning_rate": 0.0017402952077508291, - "loss": 1.457, - "step": 998500 - }, - { - "epoch": 6.495870992912413, - "grad_norm": 0.76953125, - "learning_rate": 0.0017401651602835036, - "loss": 1.4526, - "step": 999000 - }, - { - "epoch": 6.4991221795955525, - "grad_norm": 2.296875, - "learning_rate": 0.001740035112816178, - "loss": 1.4506, - "step": 999500 - }, - { - "epoch": 6.502373366278691, - "grad_norm": 19.5, - "learning_rate": 0.0017399050653488523, - "loss": 1.4474, - "step": 1000000 - }, - { - "epoch": 6.505624552961831, - "grad_norm": 0.6875, - "learning_rate": 0.0017397750178815268, - "loss": 1.4499, - "step": 1000500 - }, - { - "epoch": 6.508875739644971, - "grad_norm": 0.8984375, - "learning_rate": 0.0017396449704142013, - "loss": 1.4509, - "step": 1001000 - }, - { - "epoch": 6.51212692632811, - "grad_norm": 0.6484375, - "learning_rate": 
0.0017395149229468756, - "loss": 1.4505, - "step": 1001500 - }, - { - "epoch": 6.515378113011249, - "grad_norm": 0.89453125, - "learning_rate": 0.00173938487547955, - "loss": 1.4481, - "step": 1002000 - }, - { - "epoch": 6.518629299694388, - "grad_norm": 0.76953125, - "learning_rate": 0.0017392548280122245, - "loss": 1.4461, - "step": 1002500 - }, - { - "epoch": 6.521880486377528, - "grad_norm": 0.89453125, - "learning_rate": 0.0017391247805448988, - "loss": 1.4411, - "step": 1003000 - }, - { - "epoch": 6.525131673060667, - "grad_norm": 0.76171875, - "learning_rate": 0.0017389947330775733, - "loss": 1.4491, - "step": 1003500 - }, - { - "epoch": 6.5283828597438065, - "grad_norm": 1.0703125, - "learning_rate": 0.0017388646856102477, - "loss": 1.4502, - "step": 1004000 - }, - { - "epoch": 6.531634046426946, - "grad_norm": 0.81640625, - "learning_rate": 0.001738734638142922, - "loss": 1.4517, - "step": 1004500 - }, - { - "epoch": 6.534885233110085, - "grad_norm": 1.3125, - "learning_rate": 0.0017386045906755967, - "loss": 1.4522, - "step": 1005000 - }, - { - "epoch": 6.538136419793225, - "grad_norm": 0.70703125, - "learning_rate": 0.0017384745432082712, - "loss": 1.4466, - "step": 1005500 - }, - { - "epoch": 6.541387606476364, - "grad_norm": 0.92578125, - "learning_rate": 0.0017383444957409456, - "loss": 1.4474, - "step": 1006000 - }, - { - "epoch": 6.544638793159503, - "grad_norm": 0.8046875, - "learning_rate": 0.00173821444827362, - "loss": 1.4479, - "step": 1006500 - }, - { - "epoch": 6.547889979842642, - "grad_norm": 0.703125, - "learning_rate": 0.0017380844008062944, - "loss": 1.4444, - "step": 1007000 - }, - { - "epoch": 6.551141166525782, - "grad_norm": 0.87890625, - "learning_rate": 0.0017379543533389689, - "loss": 1.4431, - "step": 1007500 - }, - { - "epoch": 6.554392353208922, - "grad_norm": 0.66015625, - "learning_rate": 0.0017378243058716431, - "loss": 1.4487, - "step": 1008000 - }, - { - "epoch": 6.5576435398920605, - "grad_norm": 0.75, - "learning_rate": 0.0017376942584043176, - "loss": 1.4422, - "step": 1008500 - }, - { - "epoch": 6.5608947265752, - "grad_norm": 0.68359375, - "learning_rate": 0.001737564210936992, - "loss": 1.44, - "step": 1009000 - }, - { - "epoch": 6.564145913258339, - "grad_norm": 0.9921875, - "learning_rate": 0.0017374341634696663, - "loss": 1.4404, - "step": 1009500 - }, - { - "epoch": 6.567397099941479, - "grad_norm": 0.73828125, - "learning_rate": 0.0017373041160023408, - "loss": 1.4407, - "step": 1010000 - }, - { - "epoch": 6.570648286624618, - "grad_norm": 1.6484375, - "learning_rate": 0.0017371740685350153, - "loss": 1.446, - "step": 1010500 - }, - { - "epoch": 6.573899473307757, - "grad_norm": 0.81640625, - "learning_rate": 0.0017370440210676896, - "loss": 1.449, - "step": 1011000 - }, - { - "epoch": 6.577150659990897, - "grad_norm": 0.7890625, - "learning_rate": 0.001736913973600364, - "loss": 1.449, - "step": 1011500 - }, - { - "epoch": 6.580401846674036, - "grad_norm": 0.90625, - "learning_rate": 0.0017367839261330387, - "loss": 1.4324, - "step": 1012000 - }, - { - "epoch": 6.583653033357176, - "grad_norm": 0.6875, - "learning_rate": 0.0017366538786657132, - "loss": 1.4414, - "step": 1012500 - }, - { - "epoch": 6.5869042200403145, - "grad_norm": 1.8984375, - "learning_rate": 0.0017365238311983875, - "loss": 1.4451, - "step": 1013000 - }, - { - "epoch": 6.590155406723454, - "grad_norm": 0.83984375, - "learning_rate": 0.001736393783731062, - "loss": 1.4398, - "step": 1013500 - }, - { - "epoch": 6.593406593406593, - "grad_norm": 0.7578125, - 
"learning_rate": 0.0017362637362637364, - "loss": 1.446, - "step": 1014000 - }, - { - "epoch": 6.596657780089733, - "grad_norm": 0.7578125, - "learning_rate": 0.0017361336887964107, - "loss": 1.4452, - "step": 1014500 - }, - { - "epoch": 6.5999089667728725, - "grad_norm": 0.66015625, - "learning_rate": 0.0017360036413290852, - "loss": 1.4497, - "step": 1015000 - }, - { - "epoch": 6.603160153456011, - "grad_norm": 0.6875, - "learning_rate": 0.0017358735938617597, - "loss": 1.4545, - "step": 1015500 - }, - { - "epoch": 6.606411340139151, - "grad_norm": 0.6484375, - "learning_rate": 0.001735743546394434, - "loss": 1.4642, - "step": 1016000 - }, - { - "epoch": 6.60966252682229, - "grad_norm": 0.79296875, - "learning_rate": 0.0017356134989271084, - "loss": 1.4545, - "step": 1016500 - }, - { - "epoch": 6.61291371350543, - "grad_norm": 0.6640625, - "learning_rate": 0.0017354834514597829, - "loss": 1.4551, - "step": 1017000 - }, - { - "epoch": 6.6161649001885685, - "grad_norm": 1.484375, - "learning_rate": 0.0017353534039924571, - "loss": 1.4558, - "step": 1017500 - }, - { - "epoch": 6.619416086871708, - "grad_norm": 1.1484375, - "learning_rate": 0.0017352233565251316, - "loss": 1.4509, - "step": 1018000 - }, - { - "epoch": 6.622667273554848, - "grad_norm": 0.64453125, - "learning_rate": 0.001735093309057806, - "loss": 1.4542, - "step": 1018500 - }, - { - "epoch": 6.625918460237987, - "grad_norm": 1.125, - "learning_rate": 0.0017349632615904804, - "loss": 1.448, - "step": 1019000 - }, - { - "epoch": 6.6291696469211265, - "grad_norm": 2.78125, - "learning_rate": 0.001734833214123155, - "loss": 1.4441, - "step": 1019500 - }, - { - "epoch": 6.632420833604265, - "grad_norm": 0.7265625, - "learning_rate": 0.0017347031666558295, - "loss": 1.4479, - "step": 1020000 - }, - { - "epoch": 6.635672020287405, - "grad_norm": 0.67578125, - "learning_rate": 0.001734573119188504, - "loss": 1.4445, - "step": 1020500 - }, - { - "epoch": 6.638923206970544, - "grad_norm": 0.734375, - "learning_rate": 0.0017344430717211783, - "loss": 1.4484, - "step": 1021000 - }, - { - "epoch": 6.642174393653684, - "grad_norm": 0.9609375, - "learning_rate": 0.0017343130242538527, - "loss": 1.4464, - "step": 1021500 - }, - { - "epoch": 6.645425580336823, - "grad_norm": 1.71875, - "learning_rate": 0.0017341829767865272, - "loss": 1.4434, - "step": 1022000 - }, - { - "epoch": 6.648676767019962, - "grad_norm": 0.87109375, - "learning_rate": 0.0017340529293192015, - "loss": 1.4531, - "step": 1022500 - }, - { - "epoch": 6.651927953703102, - "grad_norm": 1.46875, - "learning_rate": 0.001733922881851876, - "loss": 1.454, - "step": 1023000 - }, - { - "epoch": 6.655179140386241, - "grad_norm": 0.734375, - "learning_rate": 0.0017337928343845504, - "loss": 1.4707, - "step": 1023500 - }, - { - "epoch": 6.6584303270693805, - "grad_norm": 0.78125, - "learning_rate": 0.0017336627869172247, - "loss": 1.4892, - "step": 1024000 - }, - { - "epoch": 6.661681513752519, - "grad_norm": 0.6640625, - "learning_rate": 0.0017335327394498992, - "loss": 1.5023, - "step": 1024500 - }, - { - "epoch": 6.664932700435659, - "grad_norm": 0.57421875, - "learning_rate": 0.0017334026919825737, - "loss": 1.5322, - "step": 1025000 - }, - { - "epoch": 6.668183887118799, - "grad_norm": 0.6328125, - "learning_rate": 0.001733272644515248, - "loss": 1.5527, - "step": 1025500 - }, - { - "epoch": 6.671435073801938, - "grad_norm": 0.59375, - "learning_rate": 0.0017331425970479224, - "loss": 1.523, - "step": 1026000 - }, - { - "epoch": 6.674686260485077, - "grad_norm": 0.98046875, - 
"learning_rate": 0.001733012549580597, - "loss": 1.5219, - "step": 1026500 - }, - { - "epoch": 6.677937447168216, - "grad_norm": 1.375, - "learning_rate": 0.0017328825021132716, - "loss": 1.5343, - "step": 1027000 - }, - { - "epoch": 6.681188633851356, - "grad_norm": 0.73046875, - "learning_rate": 0.0017327524546459458, - "loss": 1.5245, - "step": 1027500 - }, - { - "epoch": 6.684439820534495, - "grad_norm": 0.66015625, - "learning_rate": 0.0017326224071786203, - "loss": 1.5177, - "step": 1028000 - }, - { - "epoch": 6.6876910072176345, - "grad_norm": 0.765625, - "learning_rate": 0.0017324923597112948, - "loss": 1.513, - "step": 1028500 - }, - { - "epoch": 6.690942193900774, - "grad_norm": 0.75, - "learning_rate": 0.001732362312243969, - "loss": 1.5196, - "step": 1029000 - }, - { - "epoch": 6.694193380583913, - "grad_norm": 0.66796875, - "learning_rate": 0.0017322322647766435, - "loss": 1.5089, - "step": 1029500 - }, - { - "epoch": 6.697444567267053, - "grad_norm": 0.72265625, - "learning_rate": 0.001732102217309318, - "loss": 1.5089, - "step": 1030000 - }, - { - "epoch": 6.700695753950192, - "grad_norm": 0.671875, - "learning_rate": 0.0017319721698419923, - "loss": 1.5037, - "step": 1030500 - }, - { - "epoch": 6.703946940633331, - "grad_norm": 1.6796875, - "learning_rate": 0.0017318421223746668, - "loss": 1.4995, - "step": 1031000 - }, - { - "epoch": 6.70719812731647, - "grad_norm": 0.7734375, - "learning_rate": 0.0017317120749073412, - "loss": 1.4928, - "step": 1031500 - }, - { - "epoch": 6.71044931399961, - "grad_norm": 0.78125, - "learning_rate": 0.0017315820274400155, - "loss": 1.4962, - "step": 1032000 - }, - { - "epoch": 6.713700500682749, - "grad_norm": 0.6171875, - "learning_rate": 0.00173145197997269, - "loss": 1.4943, - "step": 1032500 - }, - { - "epoch": 6.7169516873658885, - "grad_norm": 0.7421875, - "learning_rate": 0.0017313219325053645, - "loss": 1.4925, - "step": 1033000 - }, - { - "epoch": 6.720202874049028, - "grad_norm": 0.75, - "learning_rate": 0.0017311918850380387, - "loss": 1.4849, - "step": 1033500 - }, - { - "epoch": 6.723454060732167, - "grad_norm": 0.71875, - "learning_rate": 0.0017310618375707134, - "loss": 1.479, - "step": 1034000 - }, - { - "epoch": 6.726705247415307, - "grad_norm": 0.875, - "learning_rate": 0.0017309317901033879, - "loss": 1.4838, - "step": 1034500 - }, - { - "epoch": 6.729956434098446, - "grad_norm": 1.09375, - "learning_rate": 0.0017308017426360624, - "loss": 1.4778, - "step": 1035000 - }, - { - "epoch": 6.733207620781585, - "grad_norm": 1.046875, - "learning_rate": 0.0017306716951687366, - "loss": 1.4825, - "step": 1035500 - }, - { - "epoch": 6.736458807464724, - "grad_norm": 0.890625, - "learning_rate": 0.001730541647701411, - "loss": 1.4769, - "step": 1036000 - }, - { - "epoch": 6.739709994147864, - "grad_norm": 1.046875, - "learning_rate": 0.0017304116002340856, - "loss": 1.4705, - "step": 1036500 - }, - { - "epoch": 6.742961180831003, - "grad_norm": 2.203125, - "learning_rate": 0.0017302815527667598, - "loss": 1.4729, - "step": 1037000 - }, - { - "epoch": 6.7462123675141425, - "grad_norm": 1.0, - "learning_rate": 0.0017301515052994343, - "loss": 1.4738, - "step": 1037500 - }, - { - "epoch": 6.749463554197282, - "grad_norm": 0.64453125, - "learning_rate": 0.0017300214578321088, - "loss": 1.4775, - "step": 1038000 - }, - { - "epoch": 6.752714740880421, - "grad_norm": 0.81640625, - "learning_rate": 0.001729891410364783, - "loss": 1.4787, - "step": 1038500 - }, - { - "epoch": 6.755965927563561, - "grad_norm": 1.796875, - "learning_rate": 
0.0017297613628974575, - "loss": 1.4751, - "step": 1039000 - }, - { - "epoch": 6.7592171142467, - "grad_norm": 0.92578125, - "learning_rate": 0.001729631315430132, - "loss": 1.4772, - "step": 1039500 - }, - { - "epoch": 6.762468300929839, - "grad_norm": 0.89453125, - "learning_rate": 0.0017295012679628063, - "loss": 1.4735, - "step": 1040000 - }, - { - "epoch": 6.765719487612978, - "grad_norm": 0.87109375, - "learning_rate": 0.0017293712204954808, - "loss": 1.4709, - "step": 1040500 - }, - { - "epoch": 6.768970674296118, - "grad_norm": 0.7421875, - "learning_rate": 0.0017292411730281555, - "loss": 1.4708, - "step": 1041000 - }, - { - "epoch": 6.772221860979258, - "grad_norm": 0.828125, - "learning_rate": 0.00172911112556083, - "loss": 1.4705, - "step": 1041500 - }, - { - "epoch": 6.7754730476623966, - "grad_norm": 0.62109375, - "learning_rate": 0.0017289810780935042, - "loss": 1.4681, - "step": 1042000 - }, - { - "epoch": 6.778724234345536, - "grad_norm": 0.69140625, - "learning_rate": 0.0017288510306261787, - "loss": 1.4618, - "step": 1042500 - }, - { - "epoch": 6.781975421028675, - "grad_norm": 0.90625, - "learning_rate": 0.0017287209831588532, - "loss": 1.4675, - "step": 1043000 - }, - { - "epoch": 6.785226607711815, - "grad_norm": 0.6875, - "learning_rate": 0.0017285909356915274, - "loss": 1.472, - "step": 1043500 - }, - { - "epoch": 6.788477794394954, - "grad_norm": 0.66796875, - "learning_rate": 0.001728460888224202, - "loss": 1.4722, - "step": 1044000 - }, - { - "epoch": 6.791728981078093, - "grad_norm": 0.60546875, - "learning_rate": 0.0017283308407568764, - "loss": 1.4638, - "step": 1044500 - }, - { - "epoch": 6.794980167761233, - "grad_norm": 0.97265625, - "learning_rate": 0.0017282007932895506, - "loss": 1.4743, - "step": 1045000 - }, - { - "epoch": 6.798231354444372, - "grad_norm": 0.65625, - "learning_rate": 0.0017280707458222251, - "loss": 1.472, - "step": 1045500 - }, - { - "epoch": 6.801482541127512, - "grad_norm": 1.5390625, - "learning_rate": 0.0017279406983548996, - "loss": 1.4667, - "step": 1046000 - }, - { - "epoch": 6.804733727810651, - "grad_norm": 0.95703125, - "learning_rate": 0.0017278106508875739, - "loss": 1.4613, - "step": 1046500 - }, - { - "epoch": 6.80798491449379, - "grad_norm": 0.76171875, - "learning_rate": 0.0017276806034202483, - "loss": 1.4652, - "step": 1047000 - }, - { - "epoch": 6.811236101176929, - "grad_norm": 1.9609375, - "learning_rate": 0.0017275505559529228, - "loss": 1.4641, - "step": 1047500 - }, - { - "epoch": 6.814487287860069, - "grad_norm": 1.7109375, - "learning_rate": 0.001727420508485597, - "loss": 1.4573, - "step": 1048000 - }, - { - "epoch": 6.817738474543209, - "grad_norm": 0.765625, - "learning_rate": 0.0017272904610182718, - "loss": 1.4655, - "step": 1048500 - }, - { - "epoch": 6.820989661226347, - "grad_norm": 0.83984375, - "learning_rate": 0.0017271604135509462, - "loss": 1.4632, - "step": 1049000 - }, - { - "epoch": 6.824240847909487, - "grad_norm": 0.65625, - "learning_rate": 0.0017270303660836207, - "loss": 1.461, - "step": 1049500 - }, - { - "epoch": 6.827492034592626, - "grad_norm": 1.296875, - "learning_rate": 0.001726900318616295, - "loss": 1.4637, - "step": 1050000 - }, - { - "epoch": 6.830743221275766, - "grad_norm": 0.6953125, - "learning_rate": 0.0017267702711489695, - "loss": 1.4603, - "step": 1050500 - }, - { - "epoch": 6.833994407958905, - "grad_norm": 0.72265625, - "learning_rate": 0.001726640223681644, - "loss": 1.4622, - "step": 1051000 - }, - { - "epoch": 6.837245594642044, - "grad_norm": 0.75, - 
"learning_rate": 0.0017265101762143182, - "loss": 1.4617, - "step": 1051500 - }, - { - "epoch": 6.840496781325184, - "grad_norm": 0.9296875, - "learning_rate": 0.0017263801287469927, - "loss": 1.4625, - "step": 1052000 - }, - { - "epoch": 6.843747968008323, - "grad_norm": 0.80859375, - "learning_rate": 0.0017262500812796672, - "loss": 1.459, - "step": 1052500 - }, - { - "epoch": 6.846999154691463, - "grad_norm": 0.69140625, - "learning_rate": 0.0017261200338123414, - "loss": 1.4576, - "step": 1053000 - }, - { - "epoch": 6.850250341374601, - "grad_norm": 1.265625, - "learning_rate": 0.001725989986345016, - "loss": 1.4597, - "step": 1053500 - }, - { - "epoch": 6.853501528057741, - "grad_norm": 0.91796875, - "learning_rate": 0.0017258599388776904, - "loss": 1.4643, - "step": 1054000 - }, - { - "epoch": 6.85675271474088, - "grad_norm": 0.71484375, - "learning_rate": 0.0017257298914103646, - "loss": 1.4575, - "step": 1054500 - }, - { - "epoch": 6.86000390142402, - "grad_norm": 0.6953125, - "learning_rate": 0.0017255998439430391, - "loss": 1.4588, - "step": 1055000 - }, - { - "epoch": 6.8632550881071595, - "grad_norm": 0.609375, - "learning_rate": 0.0017254697964757138, - "loss": 1.4612, - "step": 1055500 - }, - { - "epoch": 6.866506274790298, - "grad_norm": 0.63671875, - "learning_rate": 0.0017253397490083883, - "loss": 1.4574, - "step": 1056000 - }, - { - "epoch": 6.869757461473438, - "grad_norm": 0.71484375, - "learning_rate": 0.0017252097015410626, - "loss": 1.4544, - "step": 1056500 - }, - { - "epoch": 6.873008648156577, - "grad_norm": 0.76171875, - "learning_rate": 0.001725079654073737, - "loss": 1.4566, - "step": 1057000 - }, - { - "epoch": 6.876259834839717, - "grad_norm": 0.78515625, - "learning_rate": 0.0017249496066064115, - "loss": 1.4514, - "step": 1057500 - }, - { - "epoch": 6.879511021522855, - "grad_norm": 1.9453125, - "learning_rate": 0.0017248195591390858, - "loss": 1.4564, - "step": 1058000 - }, - { - "epoch": 6.882762208205995, - "grad_norm": 0.78515625, - "learning_rate": 0.0017246895116717603, - "loss": 1.452, - "step": 1058500 - }, - { - "epoch": 6.886013394889135, - "grad_norm": 1.0234375, - "learning_rate": 0.0017245594642044347, - "loss": 1.4514, - "step": 1059000 - }, - { - "epoch": 6.889264581572274, - "grad_norm": 0.6796875, - "learning_rate": 0.001724429416737109, - "loss": 1.4594, - "step": 1059500 - }, - { - "epoch": 6.8925157682554135, - "grad_norm": 0.9609375, - "learning_rate": 0.0017242993692697835, - "loss": 1.4545, - "step": 1060000 - }, - { - "epoch": 6.895766954938552, - "grad_norm": 0.625, - "learning_rate": 0.001724169321802458, - "loss": 1.4563, - "step": 1060500 - }, - { - "epoch": 6.899018141621692, - "grad_norm": 1.375, - "learning_rate": 0.0017240392743351322, - "loss": 1.4481, - "step": 1061000 - }, - { - "epoch": 6.902269328304831, - "grad_norm": 0.87109375, - "learning_rate": 0.0017239092268678067, - "loss": 1.4557, - "step": 1061500 - }, - { - "epoch": 6.905520514987971, - "grad_norm": 0.6484375, - "learning_rate": 0.0017237791794004812, - "loss": 1.4574, - "step": 1062000 - }, - { - "epoch": 6.90877170167111, - "grad_norm": 1.0, - "learning_rate": 0.0017236491319331554, - "loss": 1.4529, - "step": 1062500 - }, - { - "epoch": 6.912022888354249, - "grad_norm": 1.1875, - "learning_rate": 0.0017235190844658301, - "loss": 1.4563, - "step": 1063000 - }, - { - "epoch": 6.915274075037389, - "grad_norm": 0.69921875, - "learning_rate": 0.0017233890369985046, - "loss": 1.4571, - "step": 1063500 - }, - { - "epoch": 6.918525261720528, - "grad_norm": 
0.80859375, - "learning_rate": 0.001723258989531179, - "loss": 1.4568, - "step": 1064000 - }, - { - "epoch": 6.9217764484036675, - "grad_norm": 1.421875, - "learning_rate": 0.0017231289420638533, - "loss": 1.4533, - "step": 1064500 - }, - { - "epoch": 6.925027635086806, - "grad_norm": 0.8984375, - "learning_rate": 0.0017229988945965278, - "loss": 1.4548, - "step": 1065000 - }, - { - "epoch": 6.928278821769946, - "grad_norm": 0.87109375, - "learning_rate": 0.0017228688471292023, - "loss": 1.4596, - "step": 1065500 - }, - { - "epoch": 6.931530008453086, - "grad_norm": 0.9375, - "learning_rate": 0.0017227387996618766, - "loss": 1.4569, - "step": 1066000 - }, - { - "epoch": 6.934781195136225, - "grad_norm": 1.3828125, - "learning_rate": 0.001722608752194551, - "loss": 1.4575, - "step": 1066500 - }, - { - "epoch": 6.938032381819364, - "grad_norm": 0.75, - "learning_rate": 0.0017224787047272255, - "loss": 1.4544, - "step": 1067000 - }, - { - "epoch": 6.941283568502503, - "grad_norm": 0.78515625, - "learning_rate": 0.0017223486572598998, - "loss": 1.4564, - "step": 1067500 - }, - { - "epoch": 6.944534755185643, - "grad_norm": 0.84765625, - "learning_rate": 0.0017222186097925743, - "loss": 1.4541, - "step": 1068000 - }, - { - "epoch": 6.947785941868782, - "grad_norm": 0.80859375, - "learning_rate": 0.0017220885623252487, - "loss": 1.4547, - "step": 1068500 - }, - { - "epoch": 6.9510371285519215, - "grad_norm": 0.92578125, - "learning_rate": 0.001721958514857923, - "loss": 1.4605, - "step": 1069000 - }, - { - "epoch": 6.954288315235061, - "grad_norm": 2.390625, - "learning_rate": 0.0017218284673905975, - "loss": 1.4535, - "step": 1069500 - }, - { - "epoch": 6.9575395019182, - "grad_norm": 1.0703125, - "learning_rate": 0.001721698419923272, - "loss": 1.4535, - "step": 1070000 - }, - { - "epoch": 6.96079068860134, - "grad_norm": 0.62890625, - "learning_rate": 0.0017215683724559467, - "loss": 1.4582, - "step": 1070500 - }, - { - "epoch": 6.964041875284479, - "grad_norm": 0.734375, - "learning_rate": 0.001721438324988621, - "loss": 1.4554, - "step": 1071000 - }, - { - "epoch": 6.967293061967618, - "grad_norm": 0.97265625, - "learning_rate": 0.0017213082775212954, - "loss": 1.4507, - "step": 1071500 - }, - { - "epoch": 6.970544248650757, - "grad_norm": 0.75390625, - "learning_rate": 0.0017211782300539699, - "loss": 1.449, - "step": 1072000 - }, - { - "epoch": 6.973795435333897, - "grad_norm": 0.625, - "learning_rate": 0.0017210481825866441, - "loss": 1.4487, - "step": 1072500 - }, - { - "epoch": 6.977046622017037, - "grad_norm": 1.2265625, - "learning_rate": 0.0017209181351193186, - "loss": 1.4501, - "step": 1073000 - }, - { - "epoch": 6.9802978087001755, - "grad_norm": 1.40625, - "learning_rate": 0.001720788087651993, - "loss": 1.447, - "step": 1073500 - }, - { - "epoch": 6.983548995383315, - "grad_norm": 3.203125, - "learning_rate": 0.0017206580401846674, - "loss": 1.4483, - "step": 1074000 - }, - { - "epoch": 6.986800182066454, - "grad_norm": 1.0, - "learning_rate": 0.0017205279927173418, - "loss": 1.4508, - "step": 1074500 - }, - { - "epoch": 6.990051368749594, - "grad_norm": 0.80859375, - "learning_rate": 0.0017203979452500163, - "loss": 1.4445, - "step": 1075000 - }, - { - "epoch": 6.993302555432733, - "grad_norm": 0.69140625, - "learning_rate": 0.0017202678977826906, - "loss": 1.4496, - "step": 1075500 - }, - { - "epoch": 6.996553742115872, - "grad_norm": 0.734375, - "learning_rate": 0.001720137850315365, - "loss": 1.4451, - "step": 1076000 - }, - { - "epoch": 6.999804928799012, - "grad_norm": 
0.69140625, - "learning_rate": 0.0017200078028480395, - "loss": 1.4514, - "step": 1076500 - }, - { - "epoch": 7.0, - "eval_loss": 1.4298810958862305, - "eval_runtime": 0.5331, - "eval_samples_per_second": 1875.719, - "eval_steps_per_second": 30.011, - "step": 1076530 + "epoch": 24.0, + "eval_loss": 1.305156946182251, + "eval_runtime": 2.3203, + "eval_samples_per_second": 430.983, + "eval_steps_per_second": 0.431, + "step": 230688 } ], "logging_steps": 500, - "max_steps": 7689500, + "max_steps": 480600, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, - "total_flos": 1.3852790029150102e+19, - "train_batch_size": 64, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 3, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 5.304114629940989e+19, + "train_batch_size": 1024, "trial_name": null, "trial_params": null }