{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.001564945226917, "eval_steps": 500, "global_step": 160, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006259780907668232, "grad_norm": 1.98288817639941, "learning_rate": 5.000000000000001e-07, "loss": 0.6599, "step": 1 }, { "epoch": 0.012519561815336464, "grad_norm": 2.008513351833145, "learning_rate": 1.0000000000000002e-06, "loss": 0.6744, "step": 2 }, { "epoch": 0.018779342723004695, "grad_norm": 2.03144664277006, "learning_rate": 1.5e-06, "loss": 0.6721, "step": 3 }, { "epoch": 0.025039123630672927, "grad_norm": 1.9480725202469245, "learning_rate": 2.0000000000000003e-06, "loss": 0.6577, "step": 4 }, { "epoch": 0.03129890453834116, "grad_norm": 1.8678118004054254, "learning_rate": 2.5e-06, "loss": 0.6484, "step": 5 }, { "epoch": 0.03755868544600939, "grad_norm": 1.6583787538868422, "learning_rate": 3e-06, "loss": 0.6174, "step": 6 }, { "epoch": 0.04381846635367762, "grad_norm": 1.5614405714896737, "learning_rate": 3.5000000000000004e-06, "loss": 0.5896, "step": 7 }, { "epoch": 0.050078247261345854, "grad_norm": 0.5773143053283745, "learning_rate": 4.000000000000001e-06, "loss": 0.5557, "step": 8 }, { "epoch": 0.056338028169014086, "grad_norm": 0.3043811484340276, "learning_rate": 4.5e-06, "loss": 0.541, "step": 9 }, { "epoch": 0.06259780907668232, "grad_norm": 0.8131531353366078, "learning_rate": 5e-06, "loss": 0.5595, "step": 10 }, { "epoch": 0.06885758998435054, "grad_norm": 0.424180567084822, "learning_rate": 5.500000000000001e-06, "loss": 0.5427, "step": 11 }, { "epoch": 0.07511737089201878, "grad_norm": 0.2913041969769501, "learning_rate": 6e-06, "loss": 0.5274, "step": 12 }, { "epoch": 0.081377151799687, "grad_norm": 0.34524917385772347, "learning_rate": 6.5000000000000004e-06, "loss": 0.5337, "step": 13 }, { "epoch": 0.08763693270735524, "grad_norm": 0.36469195568794854, "learning_rate": 7.000000000000001e-06, "loss": 0.5279, "step": 14 }, { "epoch": 0.09389671361502347, "grad_norm": 0.35209082489157323, "learning_rate": 7.5e-06, "loss": 0.5296, "step": 15 }, { "epoch": 0.10015649452269171, "grad_norm": 0.28086156745404856, "learning_rate": 8.000000000000001e-06, "loss": 0.5319, "step": 16 }, { "epoch": 0.10641627543035993, "grad_norm": 0.5457849868763605, "learning_rate": 8.500000000000002e-06, "loss": 0.5199, "step": 17 }, { "epoch": 0.11267605633802817, "grad_norm": 0.264594169690208, "learning_rate": 9e-06, "loss": 0.5234, "step": 18 }, { "epoch": 0.1189358372456964, "grad_norm": 0.2472097021778676, "learning_rate": 9.5e-06, "loss": 0.5248, "step": 19 }, { "epoch": 0.12519561815336464, "grad_norm": 0.2560549908847749, "learning_rate": 1e-05, "loss": 0.5159, "step": 20 }, { "epoch": 0.13145539906103287, "grad_norm": 0.4101523009554862, "learning_rate": 1.05e-05, "loss": 0.5058, "step": 21 }, { "epoch": 0.13771517996870108, "grad_norm": 0.22290433425318873, "learning_rate": 1.1000000000000001e-05, "loss": 0.5099, "step": 22 }, { "epoch": 0.14397496087636932, "grad_norm": 0.2600145857043661, "learning_rate": 1.1500000000000002e-05, "loss": 0.5076, "step": 23 }, { "epoch": 0.15023474178403756, "grad_norm": 1.1584269063197106, "learning_rate": 1.2e-05, "loss": 0.5133, "step": 24 }, { "epoch": 0.1564945226917058, "grad_norm": 0.21303015786105067, "learning_rate": 1.25e-05, "loss": 0.5009, "step": 25 }, { "epoch": 0.162754303599374, "grad_norm": 2.5709430754104345, "learning_rate": 1.3000000000000001e-05, "loss": 0.5067, "step": 26 }, { "epoch": 0.16901408450704225, "grad_norm": 0.42260631876680255, "learning_rate": 1.3500000000000001e-05, "loss": 0.4951, "step": 27 }, { "epoch": 0.1752738654147105, "grad_norm": 0.2122989372030049, "learning_rate": 1.4000000000000001e-05, "loss": 0.4968, "step": 28 }, { "epoch": 0.18153364632237873, "grad_norm": 0.36382001881720555, "learning_rate": 1.45e-05, "loss": 0.5035, "step": 29 }, { "epoch": 0.18779342723004694, "grad_norm": 0.22094603076455596, "learning_rate": 1.5e-05, "loss": 0.5049, "step": 30 }, { "epoch": 0.19405320813771518, "grad_norm": 0.17188920546056902, "learning_rate": 1.55e-05, "loss": 0.4979, "step": 31 }, { "epoch": 0.20031298904538342, "grad_norm": 0.18515458685485783, "learning_rate": 1.6000000000000003e-05, "loss": 0.4916, "step": 32 }, { "epoch": 0.20657276995305165, "grad_norm": 0.783356101762532, "learning_rate": 1.65e-05, "loss": 0.4929, "step": 33 }, { "epoch": 0.21283255086071987, "grad_norm": 0.19059224326067628, "learning_rate": 1.7000000000000003e-05, "loss": 0.4945, "step": 34 }, { "epoch": 0.2190923317683881, "grad_norm": 0.2275442577977743, "learning_rate": 1.75e-05, "loss": 0.4936, "step": 35 }, { "epoch": 0.22535211267605634, "grad_norm": 0.24798149507141237, "learning_rate": 1.8e-05, "loss": 0.4898, "step": 36 }, { "epoch": 0.23161189358372458, "grad_norm": 0.20682357544778035, "learning_rate": 1.85e-05, "loss": 0.4888, "step": 37 }, { "epoch": 0.2378716744913928, "grad_norm": 0.19518819682961547, "learning_rate": 1.9e-05, "loss": 0.4899, "step": 38 }, { "epoch": 0.24413145539906103, "grad_norm": 0.18423871547579748, "learning_rate": 1.9500000000000003e-05, "loss": 0.4868, "step": 39 }, { "epoch": 0.25039123630672927, "grad_norm": 0.1714820355275791, "learning_rate": 2e-05, "loss": 0.4795, "step": 40 }, { "epoch": 0.2566510172143975, "grad_norm": 0.19187618384155788, "learning_rate": 2.05e-05, "loss": 0.4821, "step": 41 }, { "epoch": 0.26291079812206575, "grad_norm": 0.1422378326228944, "learning_rate": 2.1e-05, "loss": 0.4829, "step": 42 }, { "epoch": 0.26917057902973396, "grad_norm": 0.14724977757162294, "learning_rate": 2.15e-05, "loss": 0.4811, "step": 43 }, { "epoch": 0.27543035993740217, "grad_norm": 0.16077227738580077, "learning_rate": 2.2000000000000003e-05, "loss": 0.477, "step": 44 }, { "epoch": 0.28169014084507044, "grad_norm": 0.15993679259901028, "learning_rate": 2.25e-05, "loss": 0.4789, "step": 45 }, { "epoch": 0.28794992175273865, "grad_norm": 0.14385134377084383, "learning_rate": 2.3000000000000003e-05, "loss": 0.4641, "step": 46 }, { "epoch": 0.2942097026604069, "grad_norm": 0.14244559356804792, "learning_rate": 2.35e-05, "loss": 0.4767, "step": 47 }, { "epoch": 0.3004694835680751, "grad_norm": 0.1481660114240819, "learning_rate": 2.4e-05, "loss": 0.4759, "step": 48 }, { "epoch": 0.30672926447574334, "grad_norm": 0.14195363156015162, "learning_rate": 2.45e-05, "loss": 0.471, "step": 49 }, { "epoch": 0.3129890453834116, "grad_norm": 0.15220552720898642, "learning_rate": 2.5e-05, "loss": 0.4715, "step": 50 }, { "epoch": 0.3192488262910798, "grad_norm": 0.13409784658365015, "learning_rate": 2.5500000000000003e-05, "loss": 0.4692, "step": 51 }, { "epoch": 0.325508607198748, "grad_norm": 0.13766694658848178, "learning_rate": 2.6000000000000002e-05, "loss": 0.47, "step": 52 }, { "epoch": 0.3317683881064163, "grad_norm": 0.13097864679643595, "learning_rate": 2.6500000000000004e-05, "loss": 0.4651, "step": 53 }, { "epoch": 0.3380281690140845, "grad_norm": 0.13207003285729219, "learning_rate": 2.7000000000000002e-05, "loss": 0.4714, "step": 54 }, { "epoch": 0.3442879499217527, "grad_norm": 0.14128427173382038, "learning_rate": 2.7500000000000004e-05, "loss": 0.4719, "step": 55 }, { "epoch": 0.350547730829421, "grad_norm": 0.13599048333974484, "learning_rate": 2.8000000000000003e-05, "loss": 0.4657, "step": 56 }, { "epoch": 0.3568075117370892, "grad_norm": 0.1547358977814178, "learning_rate": 2.8499999999999998e-05, "loss": 0.4599, "step": 57 }, { "epoch": 0.36306729264475746, "grad_norm": 0.1357320992255676, "learning_rate": 2.9e-05, "loss": 0.4615, "step": 58 }, { "epoch": 0.36932707355242567, "grad_norm": 0.14465717873045295, "learning_rate": 2.95e-05, "loss": 0.4738, "step": 59 }, { "epoch": 0.3755868544600939, "grad_norm": 0.5900603203611421, "learning_rate": 3e-05, "loss": 0.4702, "step": 60 }, { "epoch": 0.38184663536776214, "grad_norm": 0.17729474902277623, "learning_rate": 3.05e-05, "loss": 0.4592, "step": 61 }, { "epoch": 0.38810641627543035, "grad_norm": 0.22055664690525556, "learning_rate": 3.1e-05, "loss": 0.47, "step": 62 }, { "epoch": 0.39436619718309857, "grad_norm": 0.22917133262033845, "learning_rate": 3.15e-05, "loss": 0.4668, "step": 63 }, { "epoch": 0.40062597809076683, "grad_norm": 0.23278911760289017, "learning_rate": 3.2000000000000005e-05, "loss": 0.4691, "step": 64 }, { "epoch": 0.40688575899843504, "grad_norm": 0.23911939507472177, "learning_rate": 3.2500000000000004e-05, "loss": 0.4662, "step": 65 }, { "epoch": 0.4131455399061033, "grad_norm": 0.19447041878105836, "learning_rate": 3.3e-05, "loss": 0.4633, "step": 66 }, { "epoch": 0.4194053208137715, "grad_norm": 0.17498726962496755, "learning_rate": 3.35e-05, "loss": 0.4654, "step": 67 }, { "epoch": 0.42566510172143973, "grad_norm": 0.24918375228266929, "learning_rate": 3.4000000000000007e-05, "loss": 0.477, "step": 68 }, { "epoch": 0.431924882629108, "grad_norm": 0.2850664865678729, "learning_rate": 3.45e-05, "loss": 0.4648, "step": 69 }, { "epoch": 0.4381846635367762, "grad_norm": 0.27562629972396513, "learning_rate": 3.5e-05, "loss": 0.4667, "step": 70 }, { "epoch": 0.4444444444444444, "grad_norm": 0.22637202856522412, "learning_rate": 3.55e-05, "loss": 0.4653, "step": 71 }, { "epoch": 0.4507042253521127, "grad_norm": 0.2295442026728235, "learning_rate": 3.6e-05, "loss": 0.4622, "step": 72 }, { "epoch": 0.4569640062597809, "grad_norm": 0.26572612655057165, "learning_rate": 3.65e-05, "loss": 0.4673, "step": 73 }, { "epoch": 0.46322378716744916, "grad_norm": 0.2496817546620412, "learning_rate": 3.7e-05, "loss": 0.4611, "step": 74 }, { "epoch": 0.4694835680751174, "grad_norm": 0.21430723659191686, "learning_rate": 3.7500000000000003e-05, "loss": 0.4637, "step": 75 }, { "epoch": 0.4757433489827856, "grad_norm": 0.1799606207168491, "learning_rate": 3.8e-05, "loss": 0.4612, "step": 76 }, { "epoch": 0.48200312989045385, "grad_norm": 0.2329269891744439, "learning_rate": 3.85e-05, "loss": 0.4569, "step": 77 }, { "epoch": 0.48826291079812206, "grad_norm": 0.2859704851548014, "learning_rate": 3.9000000000000006e-05, "loss": 0.4677, "step": 78 }, { "epoch": 0.4945226917057903, "grad_norm": 0.3153100598444141, "learning_rate": 3.9500000000000005e-05, "loss": 0.465, "step": 79 }, { "epoch": 0.5007824726134585, "grad_norm": 0.3165950932566608, "learning_rate": 4e-05, "loss": 0.4755, "step": 80 }, { "epoch": 0.5070422535211268, "grad_norm": 0.3018577292754275, "learning_rate": 4.05e-05, "loss": 0.464, "step": 81 }, { "epoch": 0.513302034428795, "grad_norm": 0.39363558044861696, "learning_rate": 4.1e-05, "loss": 0.4701, "step": 82 }, { "epoch": 0.5195618153364632, "grad_norm": 0.44171413078007776, "learning_rate": 4.15e-05, "loss": 0.4697, "step": 83 }, { "epoch": 0.5258215962441315, "grad_norm": 0.4086449510625894, "learning_rate": 4.2e-05, "loss": 0.4611, "step": 84 }, { "epoch": 0.5320813771517997, "grad_norm": 0.3156689305434587, "learning_rate": 4.25e-05, "loss": 0.4633, "step": 85 }, { "epoch": 0.5383411580594679, "grad_norm": 0.37582415992669976, "learning_rate": 4.3e-05, "loss": 0.4689, "step": 86 }, { "epoch": 0.5446009389671361, "grad_norm": 0.3751728997948819, "learning_rate": 4.35e-05, "loss": 0.4658, "step": 87 }, { "epoch": 0.5508607198748043, "grad_norm": 0.2622604607003995, "learning_rate": 4.4000000000000006e-05, "loss": 0.4641, "step": 88 }, { "epoch": 0.5571205007824727, "grad_norm": 0.27806769516567914, "learning_rate": 4.4500000000000004e-05, "loss": 0.4689, "step": 89 }, { "epoch": 0.5633802816901409, "grad_norm": 0.37193892514568727, "learning_rate": 4.5e-05, "loss": 0.4645, "step": 90 }, { "epoch": 0.5696400625978091, "grad_norm": 0.319234610988282, "learning_rate": 4.55e-05, "loss": 0.4697, "step": 91 }, { "epoch": 0.5758998435054773, "grad_norm": 0.24391835650924631, "learning_rate": 4.600000000000001e-05, "loss": 0.4605, "step": 92 }, { "epoch": 0.5821596244131455, "grad_norm": 0.3860119064167233, "learning_rate": 4.6500000000000005e-05, "loss": 0.4721, "step": 93 }, { "epoch": 0.5884194053208138, "grad_norm": 0.43978262147491526, "learning_rate": 4.7e-05, "loss": 0.4692, "step": 94 }, { "epoch": 0.594679186228482, "grad_norm": 0.2869109051387356, "learning_rate": 4.75e-05, "loss": 0.4644, "step": 95 }, { "epoch": 0.6009389671361502, "grad_norm": 0.33046074741721215, "learning_rate": 4.8e-05, "loss": 0.4711, "step": 96 }, { "epoch": 0.6071987480438185, "grad_norm": 0.3874189152162858, "learning_rate": 4.85e-05, "loss": 0.4694, "step": 97 }, { "epoch": 0.6134585289514867, "grad_norm": 0.46318630797414556, "learning_rate": 4.9e-05, "loss": 0.4741, "step": 98 }, { "epoch": 0.6197183098591549, "grad_norm": 0.6037444606802089, "learning_rate": 4.9500000000000004e-05, "loss": 0.4754, "step": 99 }, { "epoch": 0.6259780907668232, "grad_norm": 0.5037059436389102, "learning_rate": 5e-05, "loss": 0.4739, "step": 100 }, { "epoch": 0.6322378716744914, "grad_norm": 0.5631190436137139, "learning_rate": 4.9997404092249336e-05, "loss": 0.4699, "step": 101 }, { "epoch": 0.6384976525821596, "grad_norm": 0.39119483297638863, "learning_rate": 4.998961690809628e-05, "loss": 0.4703, "step": 102 }, { "epoch": 0.6447574334898278, "grad_norm": 0.40196303529424704, "learning_rate": 4.997664006472579e-05, "loss": 0.4749, "step": 103 }, { "epoch": 0.651017214397496, "grad_norm": 0.3397733110278162, "learning_rate": 4.9958476257072914e-05, "loss": 0.4654, "step": 104 }, { "epoch": 0.6572769953051644, "grad_norm": 0.2670846226151608, "learning_rate": 4.993512925726319e-05, "loss": 0.4716, "step": 105 }, { "epoch": 0.6635367762128326, "grad_norm": 0.36681659702689784, "learning_rate": 4.990660391382923e-05, "loss": 0.4704, "step": 106 }, { "epoch": 0.6697965571205008, "grad_norm": 0.26058292855009557, "learning_rate": 4.987290615070385e-05, "loss": 0.4647, "step": 107 }, { "epoch": 0.676056338028169, "grad_norm": 0.25205128219384887, "learning_rate": 4.983404296598979e-05, "loss": 0.4725, "step": 108 }, { "epoch": 0.6823161189358372, "grad_norm": 0.3208687051782515, "learning_rate": 4.9790022430506463e-05, "loss": 0.471, "step": 109 }, { "epoch": 0.6885758998435054, "grad_norm": 0.2306209439140453, "learning_rate": 4.974085368611381e-05, "loss": 0.473, "step": 110 }, { "epoch": 0.6948356807511737, "grad_norm": 0.21458192536569118, "learning_rate": 4.968654694381379e-05, "loss": 0.4692, "step": 111 }, { "epoch": 0.701095461658842, "grad_norm": 0.24400329234341836, "learning_rate": 4.962711348162987e-05, "loss": 0.4742, "step": 112 }, { "epoch": 0.7073552425665102, "grad_norm": 0.5445701250609367, "learning_rate": 4.956256564226487e-05, "loss": 0.4677, "step": 113 }, { "epoch": 0.7136150234741784, "grad_norm": 0.2485591152431222, "learning_rate": 4.949291683053769e-05, "loss": 0.478, "step": 114 }, { "epoch": 0.7198748043818466, "grad_norm": 0.2683190648451619, "learning_rate": 4.941818151059956e-05, "loss": 0.468, "step": 115 }, { "epoch": 0.7261345852895149, "grad_norm": 0.17377296116604452, "learning_rate": 4.933837520293017e-05, "loss": 0.4682, "step": 116 }, { "epoch": 0.7323943661971831, "grad_norm": 0.19892874090328266, "learning_rate": 4.9253514481114535e-05, "loss": 0.4716, "step": 117 }, { "epoch": 0.7386541471048513, "grad_norm": 0.22470516800088272, "learning_rate": 4.91636169684011e-05, "loss": 0.4807, "step": 118 }, { "epoch": 0.7449139280125195, "grad_norm": 0.23033947133081567, "learning_rate": 4.906870133404187e-05, "loss": 0.4721, "step": 119 }, { "epoch": 0.7511737089201878, "grad_norm": 0.2764527709442302, "learning_rate": 4.896878728941531e-05, "loss": 0.4693, "step": 120 }, { "epoch": 0.7574334898278561, "grad_norm": 0.28746556965081915, "learning_rate": 4.8863895583932836e-05, "loss": 0.4767, "step": 121 }, { "epoch": 0.7636932707355243, "grad_norm": 0.32061574884194566, "learning_rate": 4.875404800072977e-05, "loss": 0.4643, "step": 122 }, { "epoch": 0.7699530516431925, "grad_norm": 0.34181281337669966, "learning_rate": 4.86392673521415e-05, "loss": 0.4602, "step": 123 }, { "epoch": 0.7762128325508607, "grad_norm": 0.30941984507586506, "learning_rate": 4.8519577474966074e-05, "loss": 0.4711, "step": 124 }, { "epoch": 0.7824726134585289, "grad_norm": 0.23600978038755785, "learning_rate": 4.839500322551386e-05, "loss": 0.4696, "step": 125 }, { "epoch": 0.7887323943661971, "grad_norm": 0.2577164285099203, "learning_rate": 4.8265570474445636e-05, "loss": 0.4644, "step": 126 }, { "epoch": 0.7949921752738655, "grad_norm": 0.27823451721774306, "learning_rate": 4.813130610139994e-05, "loss": 0.479, "step": 127 }, { "epoch": 0.8012519561815337, "grad_norm": 0.22061524932206344, "learning_rate": 4.7992237989410904e-05, "loss": 0.4711, "step": 128 }, { "epoch": 0.8075117370892019, "grad_norm": 0.20216340578684158, "learning_rate": 4.784839501911771e-05, "loss": 0.468, "step": 129 }, { "epoch": 0.8137715179968701, "grad_norm": 0.27542745611047786, "learning_rate": 4.7699807062766876e-05, "loss": 0.4754, "step": 130 }, { "epoch": 0.8200312989045383, "grad_norm": 0.21954180738847087, "learning_rate": 4.75465049780086e-05, "loss": 0.4595, "step": 131 }, { "epoch": 0.8262910798122066, "grad_norm": 0.19430624161738624, "learning_rate": 4.738852060148849e-05, "loss": 0.4747, "step": 132 }, { "epoch": 0.8325508607198748, "grad_norm": 0.1884671644058954, "learning_rate": 4.722588674223594e-05, "loss": 0.4748, "step": 133 }, { "epoch": 0.838810641627543, "grad_norm": 0.20913369047927102, "learning_rate": 4.7058637174850604e-05, "loss": 0.4653, "step": 134 }, { "epoch": 0.8450704225352113, "grad_norm": 0.19564021089464265, "learning_rate": 4.688680663248837e-05, "loss": 0.4644, "step": 135 }, { "epoch": 0.8513302034428795, "grad_norm": 0.17437877798570775, "learning_rate": 4.671043079964815e-05, "loss": 0.4666, "step": 136 }, { "epoch": 0.8575899843505478, "grad_norm": 0.18658537333186465, "learning_rate": 4.652954630476127e-05, "loss": 0.463, "step": 137 }, { "epoch": 0.863849765258216, "grad_norm": 0.1916983418252378, "learning_rate": 4.634419071258472e-05, "loss": 0.4801, "step": 138 }, { "epoch": 0.8701095461658842, "grad_norm": 0.18269150591223743, "learning_rate": 4.615440251639995e-05, "loss": 0.465, "step": 139 }, { "epoch": 0.8763693270735524, "grad_norm": 0.19124021712384207, "learning_rate": 4.5960221130018946e-05, "loss": 0.4624, "step": 140 }, { "epoch": 0.8826291079812206, "grad_norm": 0.17751289300487907, "learning_rate": 4.576168687959895e-05, "loss": 0.4667, "step": 141 }, { "epoch": 0.8888888888888888, "grad_norm": 0.16256598664527863, "learning_rate": 4.555884099526794e-05, "loss": 0.4724, "step": 142 }, { "epoch": 0.8951486697965572, "grad_norm": 0.17306660659668968, "learning_rate": 4.535172560256218e-05, "loss": 0.4764, "step": 143 }, { "epoch": 0.9014084507042254, "grad_norm": 0.15311694878287935, "learning_rate": 4.5140383713677916e-05, "loss": 0.4633, "step": 144 }, { "epoch": 0.9076682316118936, "grad_norm": 0.16327033693685952, "learning_rate": 4.492485921853894e-05, "loss": 0.4626, "step": 145 }, { "epoch": 0.9139280125195618, "grad_norm": 0.1577015015575217, "learning_rate": 4.4705196875681854e-05, "loss": 0.465, "step": 146 }, { "epoch": 0.92018779342723, "grad_norm": 0.14976303947345634, "learning_rate": 4.448144230296093e-05, "loss": 0.4732, "step": 147 }, { "epoch": 0.9264475743348983, "grad_norm": 0.1799041852337434, "learning_rate": 4.425364196807451e-05, "loss": 0.4638, "step": 148 }, { "epoch": 0.9327073552425665, "grad_norm": 0.25582934784311545, "learning_rate": 4.402184317891501e-05, "loss": 0.4687, "step": 149 }, { "epoch": 0.9389671361502347, "grad_norm": 0.14767269207211267, "learning_rate": 4.37860940737443e-05, "loss": 0.4622, "step": 150 }, { "epoch": 0.945226917057903, "grad_norm": 0.18510146862998086, "learning_rate": 4.354644361119672e-05, "loss": 0.4714, "step": 151 }, { "epoch": 0.9514866979655712, "grad_norm": 0.1834113544053396, "learning_rate": 4.330294156011172e-05, "loss": 0.4665, "step": 152 }, { "epoch": 0.9577464788732394, "grad_norm": 0.16106024098596552, "learning_rate": 4.305563848919824e-05, "loss": 0.4612, "step": 153 }, { "epoch": 0.9640062597809077, "grad_norm": 0.1582714001537092, "learning_rate": 4.2804585756532965e-05, "loss": 0.4656, "step": 154 }, { "epoch": 0.9702660406885759, "grad_norm": 0.1838011411088347, "learning_rate": 4.254983549889467e-05, "loss": 0.4585, "step": 155 }, { "epoch": 0.9765258215962441, "grad_norm": 0.22256207898681857, "learning_rate": 4.2291440620936796e-05, "loss": 0.4712, "step": 156 }, { "epoch": 0.9827856025039123, "grad_norm": 0.16842112143070276, "learning_rate": 4.2029454784200676e-05, "loss": 0.4691, "step": 157 }, { "epoch": 0.9890453834115805, "grad_norm": 0.15122887832488566, "learning_rate": 4.176393239597144e-05, "loss": 0.4778, "step": 158 }, { "epoch": 0.9953051643192489, "grad_norm": 0.1902639072378955, "learning_rate": 4.149492859797912e-05, "loss": 0.4688, "step": 159 }, { "epoch": 1.001564945226917, "grad_norm": 0.17200971150006397, "learning_rate": 4.122249925494726e-05, "loss": 0.464, "step": 160 } ], "logging_steps": 1, "max_steps": 318, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 16, "total_flos": 1.0356139229184e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }