{ "best_metric": 4.949131488800049, "best_model_checkpoint": "./results/models/checkpoint-33700", "epoch": 20.0, "eval_steps": 500, "global_step": 33700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.29673590504451036, "grad_norm": 0.197265625, "learning_rate": 0.0009940652818991099, "loss": 5.6032, "step": 500 }, { "epoch": 0.5934718100890207, "grad_norm": 0.1953125, "learning_rate": 0.0009881305637982197, "loss": 5.4199, "step": 1000 }, { "epoch": 0.8902077151335311, "grad_norm": 0.1982421875, "learning_rate": 0.0009821958456973294, "loss": 5.3653, "step": 1500 }, { "epoch": 1.0, "eval_loss": 5.317612171173096, "eval_runtime": 0.4163, "eval_samples_per_second": 2402.115, "eval_steps_per_second": 4.804, "step": 1685 }, { "epoch": 1.1869436201780414, "grad_norm": 0.1953125, "learning_rate": 0.0009762611275964391, "loss": 5.3193, "step": 2000 }, { "epoch": 1.4836795252225519, "grad_norm": 0.208984375, "learning_rate": 0.000970326409495549, "loss": 5.2869, "step": 2500 }, { "epoch": 1.7804154302670623, "grad_norm": 0.203125, "learning_rate": 0.0009643916913946587, "loss": 5.2619, "step": 3000 }, { "epoch": 2.0, "eval_loss": 5.211864471435547, "eval_runtime": 0.4845, "eval_samples_per_second": 2063.824, "eval_steps_per_second": 4.128, "step": 3370 }, { "epoch": 2.077151335311573, "grad_norm": 0.197265625, "learning_rate": 0.0009584569732937686, "loss": 5.2314, "step": 3500 }, { "epoch": 2.373887240356083, "grad_norm": 0.2197265625, "learning_rate": 0.0009525222551928784, "loss": 5.2019, "step": 4000 }, { "epoch": 2.6706231454005933, "grad_norm": 0.21875, "learning_rate": 0.0009465875370919882, "loss": 5.1854, "step": 4500 }, { "epoch": 2.9673590504451037, "grad_norm": 0.228515625, "learning_rate": 0.0009406528189910979, "loss": 5.1734, "step": 5000 }, { "epoch": 3.0, "eval_loss": 5.1604228019714355, "eval_runtime": 0.5733, "eval_samples_per_second": 1744.297, "eval_steps_per_second": 3.489, "step": 5055 }, { "epoch": 3.264094955489614, "grad_norm": 0.2294921875, "learning_rate": 0.0009347181008902077, "loss": 5.1376, "step": 5500 }, { "epoch": 3.5608308605341246, "grad_norm": 0.2314453125, "learning_rate": 0.0009287833827893175, "loss": 5.1259, "step": 6000 }, { "epoch": 3.857566765578635, "grad_norm": 0.2265625, "learning_rate": 0.0009228486646884273, "loss": 5.1134, "step": 6500 }, { "epoch": 4.0, "eval_loss": 5.109052658081055, "eval_runtime": 0.6878, "eval_samples_per_second": 1453.883, "eval_steps_per_second": 2.908, "step": 6740 }, { "epoch": 4.154302670623146, "grad_norm": 0.2470703125, "learning_rate": 0.0009169139465875371, "loss": 5.0915, "step": 7000 }, { "epoch": 4.451038575667655, "grad_norm": 0.25, "learning_rate": 0.0009109792284866469, "loss": 5.0718, "step": 7500 }, { "epoch": 4.747774480712166, "grad_norm": 0.25, "learning_rate": 0.0009050445103857568, "loss": 5.066, "step": 8000 }, { "epoch": 5.0, "eval_loss": 5.074661731719971, "eval_runtime": 0.5054, "eval_samples_per_second": 1978.542, "eval_steps_per_second": 3.957, "step": 8425 }, { "epoch": 5.044510385756676, "grad_norm": 0.2470703125, "learning_rate": 0.0008991097922848664, "loss": 5.0542, "step": 8500 }, { "epoch": 5.341246290801187, "grad_norm": 0.26171875, "learning_rate": 0.0008931750741839763, "loss": 5.023, "step": 9000 }, { "epoch": 5.637982195845697, "grad_norm": 0.259765625, "learning_rate": 0.0008872403560830861, "loss": 5.0233, "step": 9500 }, { "epoch": 5.9347181008902075, "grad_norm": 0.265625, "learning_rate": 0.0008813056379821959, "loss": 5.0197, "step": 10000 }, { "epoch": 6.0, "eval_loss": 5.054934501647949, "eval_runtime": 0.6462, "eval_samples_per_second": 1547.607, "eval_steps_per_second": 3.095, "step": 10110 }, { "epoch": 6.231454005934718, "grad_norm": 0.271484375, "learning_rate": 0.0008753709198813057, "loss": 4.9826, "step": 10500 }, { "epoch": 6.528189910979228, "grad_norm": 0.271484375, "learning_rate": 0.0008694362017804155, "loss": 4.984, "step": 11000 }, { "epoch": 6.824925816023739, "grad_norm": 0.271484375, "learning_rate": 0.0008635014836795252, "loss": 4.982, "step": 11500 }, { "epoch": 7.0, "eval_loss": 5.03799295425415, "eval_runtime": 0.4792, "eval_samples_per_second": 2086.994, "eval_steps_per_second": 4.174, "step": 11795 }, { "epoch": 7.121661721068249, "grad_norm": 0.287109375, "learning_rate": 0.000857566765578635, "loss": 4.9661, "step": 12000 }, { "epoch": 7.4183976261127595, "grad_norm": 0.2890625, "learning_rate": 0.0008516320474777448, "loss": 4.9489, "step": 12500 }, { "epoch": 7.71513353115727, "grad_norm": 0.294921875, "learning_rate": 0.0008456973293768546, "loss": 4.9486, "step": 13000 }, { "epoch": 8.0, "eval_loss": 5.01511287689209, "eval_runtime": 0.7849, "eval_samples_per_second": 1273.986, "eval_steps_per_second": 2.548, "step": 13480 }, { "epoch": 8.011869436201781, "grad_norm": 0.294921875, "learning_rate": 0.0008397626112759644, "loss": 4.9437, "step": 13500 }, { "epoch": 8.308605341246292, "grad_norm": 0.322265625, "learning_rate": 0.0008338278931750742, "loss": 4.9108, "step": 14000 }, { "epoch": 8.605341246290802, "grad_norm": 0.310546875, "learning_rate": 0.000827893175074184, "loss": 4.9207, "step": 14500 }, { "epoch": 8.90207715133531, "grad_norm": 0.30859375, "learning_rate": 0.0008219584569732938, "loss": 4.9149, "step": 15000 }, { "epoch": 9.0, "eval_loss": 5.001404285430908, "eval_runtime": 0.5146, "eval_samples_per_second": 1943.422, "eval_steps_per_second": 3.887, "step": 15165 }, { "epoch": 9.198813056379821, "grad_norm": 0.310546875, "learning_rate": 0.0008160237388724035, "loss": 4.8919, "step": 15500 }, { "epoch": 9.495548961424332, "grad_norm": 0.32421875, "learning_rate": 0.0008100890207715134, "loss": 4.8883, "step": 16000 }, { "epoch": 9.792284866468842, "grad_norm": 0.330078125, "learning_rate": 0.0008041543026706232, "loss": 4.8952, "step": 16500 }, { "epoch": 10.0, "eval_loss": 4.996912002563477, "eval_runtime": 0.6039, "eval_samples_per_second": 1655.995, "eval_steps_per_second": 3.312, "step": 16850 }, { "epoch": 10.089020771513352, "grad_norm": 0.3125, "learning_rate": 0.000798219584569733, "loss": 4.8771, "step": 17000 }, { "epoch": 10.385756676557863, "grad_norm": 0.326171875, "learning_rate": 0.0007922848664688428, "loss": 4.8607, "step": 17500 }, { "epoch": 10.682492581602373, "grad_norm": 0.34375, "learning_rate": 0.0007863501483679525, "loss": 4.87, "step": 18000 }, { "epoch": 10.979228486646884, "grad_norm": 0.333984375, "learning_rate": 0.0007804154302670623, "loss": 4.868, "step": 18500 }, { "epoch": 11.0, "eval_loss": 4.980271339416504, "eval_runtime": 0.5082, "eval_samples_per_second": 1967.91, "eval_steps_per_second": 3.936, "step": 18535 }, { "epoch": 11.275964391691394, "grad_norm": 0.341796875, "learning_rate": 0.0007744807121661721, "loss": 4.8319, "step": 19000 }, { "epoch": 11.572700296735905, "grad_norm": 0.373046875, "learning_rate": 0.000768545994065282, "loss": 4.8425, "step": 19500 }, { "epoch": 11.869436201780415, "grad_norm": 0.349609375, "learning_rate": 0.0007626112759643917, "loss": 4.8469, "step": 20000 }, { "epoch": 12.0, "eval_loss": 4.969499111175537, "eval_runtime": 0.5498, "eval_samples_per_second": 1818.897, "eval_steps_per_second": 3.638, "step": 20220 }, { "epoch": 12.166172106824925, "grad_norm": 0.365234375, "learning_rate": 0.0007566765578635016, "loss": 4.8272, "step": 20500 }, { "epoch": 12.462908011869436, "grad_norm": 0.3515625, "learning_rate": 0.0007507418397626113, "loss": 4.8171, "step": 21000 }, { "epoch": 12.759643916913946, "grad_norm": 0.33984375, "learning_rate": 0.0007448071216617211, "loss": 4.8272, "step": 21500 }, { "epoch": 13.0, "eval_loss": 4.971672058105469, "eval_runtime": 0.5373, "eval_samples_per_second": 1861.169, "eval_steps_per_second": 3.722, "step": 21905 }, { "epoch": 13.056379821958457, "grad_norm": 0.37109375, "learning_rate": 0.0007388724035608308, "loss": 4.8221, "step": 22000 }, { "epoch": 13.353115727002967, "grad_norm": 0.375, "learning_rate": 0.0007329376854599407, "loss": 4.7935, "step": 22500 }, { "epoch": 13.649851632047477, "grad_norm": 0.3671875, "learning_rate": 0.0007270029673590504, "loss": 4.8096, "step": 23000 }, { "epoch": 13.946587537091988, "grad_norm": 0.384765625, "learning_rate": 0.0007210682492581603, "loss": 4.8121, "step": 23500 }, { "epoch": 14.0, "eval_loss": 4.967980861663818, "eval_runtime": 0.5226, "eval_samples_per_second": 1913.569, "eval_steps_per_second": 3.827, "step": 23590 }, { "epoch": 14.243323442136498, "grad_norm": 0.3828125, "learning_rate": 0.0007151335311572701, "loss": 4.7805, "step": 24000 }, { "epoch": 14.540059347181009, "grad_norm": 0.376953125, "learning_rate": 0.0007091988130563798, "loss": 4.7851, "step": 24500 }, { "epoch": 14.836795252225519, "grad_norm": 0.388671875, "learning_rate": 0.0007032640949554896, "loss": 4.7942, "step": 25000 }, { "epoch": 15.0, "eval_loss": 4.957317352294922, "eval_runtime": 0.4875, "eval_samples_per_second": 2051.234, "eval_steps_per_second": 4.102, "step": 25275 }, { "epoch": 15.13353115727003, "grad_norm": 0.388671875, "learning_rate": 0.0006973293768545994, "loss": 4.778, "step": 25500 }, { "epoch": 15.43026706231454, "grad_norm": 0.416015625, "learning_rate": 0.0006913946587537093, "loss": 4.7668, "step": 26000 }, { "epoch": 15.72700296735905, "grad_norm": 0.392578125, "learning_rate": 0.000685459940652819, "loss": 4.7775, "step": 26500 }, { "epoch": 16.0, "eval_loss": 4.964081287384033, "eval_runtime": 0.5374, "eval_samples_per_second": 1860.926, "eval_steps_per_second": 3.722, "step": 26960 }, { "epoch": 16.023738872403563, "grad_norm": 0.40234375, "learning_rate": 0.0006795252225519289, "loss": 4.7739, "step": 27000 }, { "epoch": 16.320474777448073, "grad_norm": 0.416015625, "learning_rate": 0.0006735905044510386, "loss": 4.7483, "step": 27500 }, { "epoch": 16.617210682492583, "grad_norm": 0.396484375, "learning_rate": 0.0006676557863501484, "loss": 4.7586, "step": 28000 }, { "epoch": 16.91394658753709, "grad_norm": 0.390625, "learning_rate": 0.0006617210682492581, "loss": 4.7701, "step": 28500 }, { "epoch": 17.0, "eval_loss": 4.9522247314453125, "eval_runtime": 0.4981, "eval_samples_per_second": 2007.758, "eval_steps_per_second": 4.016, "step": 28645 }, { "epoch": 17.2106824925816, "grad_norm": 0.427734375, "learning_rate": 0.000655786350148368, "loss": 4.735, "step": 29000 }, { "epoch": 17.50741839762611, "grad_norm": 0.408203125, "learning_rate": 0.0006498516320474777, "loss": 4.7469, "step": 29500 }, { "epoch": 17.80415430267062, "grad_norm": 0.4140625, "learning_rate": 0.0006439169139465876, "loss": 4.7499, "step": 30000 }, { "epoch": 18.0, "eval_loss": 4.961427688598633, "eval_runtime": 0.4675, "eval_samples_per_second": 2139.244, "eval_steps_per_second": 4.278, "step": 30330 }, { "epoch": 18.100890207715132, "grad_norm": 0.4140625, "learning_rate": 0.0006379821958456973, "loss": 4.7358, "step": 30500 }, { "epoch": 18.397626112759642, "grad_norm": 0.396484375, "learning_rate": 0.0006320474777448071, "loss": 4.7299, "step": 31000 }, { "epoch": 18.694362017804153, "grad_norm": 0.419921875, "learning_rate": 0.0006261127596439168, "loss": 4.735, "step": 31500 }, { "epoch": 18.991097922848663, "grad_norm": 0.4375, "learning_rate": 0.0006201780415430267, "loss": 4.7428, "step": 32000 }, { "epoch": 19.0, "eval_loss": 4.956191062927246, "eval_runtime": 0.4912, "eval_samples_per_second": 2035.99, "eval_steps_per_second": 4.072, "step": 32015 }, { "epoch": 19.287833827893174, "grad_norm": 0.447265625, "learning_rate": 0.0006142433234421366, "loss": 4.7161, "step": 32500 }, { "epoch": 19.584569732937684, "grad_norm": 0.427734375, "learning_rate": 0.0006083086053412463, "loss": 4.7258, "step": 33000 }, { "epoch": 19.881305637982194, "grad_norm": 0.4296875, "learning_rate": 0.0006023738872403562, "loss": 4.7254, "step": 33500 }, { "epoch": 20.0, "eval_loss": 4.949131488800049, "eval_runtime": 0.4629, "eval_samples_per_second": 2160.419, "eval_steps_per_second": 4.321, "step": 33700 } ], "logging_steps": 500, "max_steps": 84250, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.76236489769554e+16, "train_batch_size": 512, "trial_name": null, "trial_params": null }