{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 14109, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0425260472039124, "grad_norm": 4.5295610427856445, "learning_rate": 4.929123254660146e-05, "loss": 3.196, "step": 200 }, { "epoch": 0.0850520944078248, "grad_norm": 3.153637170791626, "learning_rate": 4.858246509320292e-05, "loss": 2.0357, "step": 400 }, { "epoch": 0.1275781416117372, "grad_norm": 2.833491802215576, "learning_rate": 4.7873697639804386e-05, "loss": 1.9407, "step": 600 }, { "epoch": 0.1701041888156496, "grad_norm": 3.3437952995300293, "learning_rate": 4.7164930186405845e-05, "loss": 1.8486, "step": 800 }, { "epoch": 0.212630236019562, "grad_norm": 2.2692761421203613, "learning_rate": 4.64561627330073e-05, "loss": 1.8338, "step": 1000 }, { "epoch": 0.2551562832234744, "grad_norm": 9.267280578613281, "learning_rate": 4.574739527960876e-05, "loss": 1.8259, "step": 1200 }, { "epoch": 0.2976823304273868, "grad_norm": 2.5437700748443604, "learning_rate": 4.503862782621022e-05, "loss": 1.7474, "step": 1400 }, { "epoch": 0.3402083776312992, "grad_norm": 3.454941511154175, "learning_rate": 4.432986037281168e-05, "loss": 1.7185, "step": 1600 }, { "epoch": 0.3827344248352116, "grad_norm": 2.709567070007324, "learning_rate": 4.3621092919413145e-05, "loss": 1.7468, "step": 1800 }, { "epoch": 0.425260472039124, "grad_norm": 2.4916956424713135, "learning_rate": 4.2912325466014604e-05, "loss": 1.7277, "step": 2000 }, { "epoch": 0.4677865192430364, "grad_norm": 2.3599932193756104, "learning_rate": 4.220355801261606e-05, "loss": 1.6742, "step": 2200 }, { "epoch": 0.5103125664469488, "grad_norm": 2.518554449081421, "learning_rate": 4.149479055921752e-05, "loss": 1.6232, "step": 2400 }, { "epoch": 0.5528386136508612, "grad_norm": 2.202854633331299, "learning_rate": 4.078602310581898e-05, "loss": 1.6431, "step": 2600 }, { "epoch": 0.5953646608547736, "grad_norm": 2.0834789276123047, "learning_rate": 4.007725565242044e-05, "loss": 1.6275, "step": 2800 }, { "epoch": 0.637890708058686, "grad_norm": 1.998606562614441, "learning_rate": 3.9368488199021905e-05, "loss": 1.6399, "step": 3000 }, { "epoch": 0.6804167552625984, "grad_norm": 2.2863314151763916, "learning_rate": 3.8659720745623364e-05, "loss": 1.631, "step": 3200 }, { "epoch": 0.7229428024665108, "grad_norm": 2.4040656089782715, "learning_rate": 3.795095329222482e-05, "loss": 1.5873, "step": 3400 }, { "epoch": 0.7654688496704232, "grad_norm": 2.490069627761841, "learning_rate": 3.724218583882628e-05, "loss": 1.6236, "step": 3600 }, { "epoch": 0.8079948968743356, "grad_norm": 2.4205753803253174, "learning_rate": 3.653341838542774e-05, "loss": 1.57, "step": 3800 }, { "epoch": 0.850520944078248, "grad_norm": 3.0913541316986084, "learning_rate": 3.58246509320292e-05, "loss": 1.5535, "step": 4000 }, { "epoch": 0.8930469912821604, "grad_norm": 2.7764530181884766, "learning_rate": 3.5115883478630665e-05, "loss": 1.6216, "step": 4200 }, { "epoch": 0.9355730384860728, "grad_norm": 2.6188108921051025, "learning_rate": 3.4407116025232124e-05, "loss": 1.5819, "step": 4400 }, { "epoch": 0.9780990856899852, "grad_norm": 2.5243659019470215, "learning_rate": 3.369834857183358e-05, "loss": 1.5277, "step": 4600 }, { "epoch": 1.0206251328938976, "grad_norm": 2.1120309829711914, "learning_rate": 3.298958111843504e-05, "loss": 1.5052, "step": 4800 }, { "epoch": 1.06315118009781, "grad_norm": 2.532235622406006, "learning_rate": 
3.22808136650365e-05, "loss": 1.4648, "step": 5000 }, { "epoch": 1.1056772273017224, "grad_norm": 2.924020767211914, "learning_rate": 3.157204621163796e-05, "loss": 1.4526, "step": 5200 }, { "epoch": 1.1482032745056348, "grad_norm": 3.1454060077667236, "learning_rate": 3.0863278758239425e-05, "loss": 1.4659, "step": 5400 }, { "epoch": 1.1907293217095472, "grad_norm": 2.3973867893218994, "learning_rate": 3.0154511304840884e-05, "loss": 1.4622, "step": 5600 }, { "epoch": 1.2332553689134595, "grad_norm": 2.5608162879943848, "learning_rate": 2.9445743851442343e-05, "loss": 1.4919, "step": 5800 }, { "epoch": 1.275781416117372, "grad_norm": 1.8345551490783691, "learning_rate": 2.87369763980438e-05, "loss": 1.4644, "step": 6000 }, { "epoch": 1.3183074633212843, "grad_norm": 2.1043267250061035, "learning_rate": 2.8028208944645264e-05, "loss": 1.4662, "step": 6200 }, { "epoch": 1.3608335105251967, "grad_norm": 1.9993553161621094, "learning_rate": 2.7319441491246722e-05, "loss": 1.4558, "step": 6400 }, { "epoch": 1.4033595577291091, "grad_norm": 2.4795050621032715, "learning_rate": 2.661067403784818e-05, "loss": 1.4381, "step": 6600 }, { "epoch": 1.4458856049330215, "grad_norm": 2.0630886554718018, "learning_rate": 2.5901906584449644e-05, "loss": 1.4244, "step": 6800 }, { "epoch": 1.488411652136934, "grad_norm": 2.489701509475708, "learning_rate": 2.5193139131051102e-05, "loss": 1.4263, "step": 7000 }, { "epoch": 1.5309376993408463, "grad_norm": 2.2150392532348633, "learning_rate": 2.448437167765256e-05, "loss": 1.4437, "step": 7200 }, { "epoch": 1.5734637465447587, "grad_norm": 2.3309507369995117, "learning_rate": 2.3775604224254023e-05, "loss": 1.437, "step": 7400 }, { "epoch": 1.6159897937486711, "grad_norm": 2.4965591430664062, "learning_rate": 2.3066836770855482e-05, "loss": 1.4509, "step": 7600 }, { "epoch": 1.6585158409525835, "grad_norm": 2.379479169845581, "learning_rate": 2.235806931745694e-05, "loss": 1.3942, "step": 7800 }, { "epoch": 1.701041888156496, "grad_norm": 2.083732843399048, "learning_rate": 2.1649301864058403e-05, "loss": 1.3781, "step": 8000 }, { "epoch": 1.7435679353604083, "grad_norm": 2.258941411972046, "learning_rate": 2.0940534410659862e-05, "loss": 1.3947, "step": 8200 }, { "epoch": 1.7860939825643207, "grad_norm": 2.069312334060669, "learning_rate": 2.023176695726132e-05, "loss": 1.448, "step": 8400 }, { "epoch": 1.8286200297682331, "grad_norm": 2.0143864154815674, "learning_rate": 1.9522999503862783e-05, "loss": 1.387, "step": 8600 }, { "epoch": 1.8711460769721455, "grad_norm": 2.7671608924865723, "learning_rate": 1.8814232050464242e-05, "loss": 1.4122, "step": 8800 }, { "epoch": 1.913672124176058, "grad_norm": 2.4909543991088867, "learning_rate": 1.8105464597065704e-05, "loss": 1.4364, "step": 9000 }, { "epoch": 1.9561981713799703, "grad_norm": 2.471282720565796, "learning_rate": 1.7396697143667166e-05, "loss": 1.4177, "step": 9200 }, { "epoch": 1.9987242185838827, "grad_norm": 3.3425190448760986, "learning_rate": 1.6687929690268625e-05, "loss": 1.4231, "step": 9400 }, { "epoch": 2.041250265787795, "grad_norm": 2.7562530040740967, "learning_rate": 1.5979162236870084e-05, "loss": 1.3759, "step": 9600 }, { "epoch": 2.0837763129917075, "grad_norm": 2.857525587081909, "learning_rate": 1.5270394783471546e-05, "loss": 1.3467, "step": 9800 }, { "epoch": 2.12630236019562, "grad_norm": 2.1692278385162354, "learning_rate": 1.4561627330073005e-05, "loss": 1.3397, "step": 10000 }, { "epoch": 2.1688284073995323, "grad_norm": 2.8106327056884766, "learning_rate": 
1.3852859876674466e-05, "loss": 1.3203, "step": 10200 }, { "epoch": 2.2113544546034447, "grad_norm": 2.509658098220825, "learning_rate": 1.3144092423275925e-05, "loss": 1.3108, "step": 10400 }, { "epoch": 2.253880501807357, "grad_norm": 2.2616686820983887, "learning_rate": 1.2435324969877383e-05, "loss": 1.3394, "step": 10600 }, { "epoch": 2.2964065490112695, "grad_norm": 2.6901018619537354, "learning_rate": 1.1726557516478844e-05, "loss": 1.3267, "step": 10800 }, { "epoch": 2.338932596215182, "grad_norm": 3.918454647064209, "learning_rate": 1.1017790063080303e-05, "loss": 1.3223, "step": 11000 }, { "epoch": 2.3814586434190943, "grad_norm": 2.4760806560516357, "learning_rate": 1.0309022609681763e-05, "loss": 1.3549, "step": 11200 }, { "epoch": 2.4239846906230067, "grad_norm": 2.3667821884155273, "learning_rate": 9.600255156283224e-06, "loss": 1.3398, "step": 11400 }, { "epoch": 2.466510737826919, "grad_norm": 2.600486993789673, "learning_rate": 8.891487702884683e-06, "loss": 1.3195, "step": 11600 }, { "epoch": 2.5090367850308315, "grad_norm": 2.0121264457702637, "learning_rate": 8.182720249486143e-06, "loss": 1.3308, "step": 11800 }, { "epoch": 2.551562832234744, "grad_norm": 2.0753839015960693, "learning_rate": 7.473952796087604e-06, "loss": 1.3607, "step": 12000 }, { "epoch": 2.5940888794386563, "grad_norm": 2.37463641166687, "learning_rate": 6.7651853426890634e-06, "loss": 1.3247, "step": 12200 }, { "epoch": 2.6366149266425687, "grad_norm": 2.016892910003662, "learning_rate": 6.056417889290524e-06, "loss": 1.2809, "step": 12400 }, { "epoch": 2.679140973846481, "grad_norm": 2.9185993671417236, "learning_rate": 5.3476504358919845e-06, "loss": 1.3227, "step": 12600 }, { "epoch": 2.7216670210503935, "grad_norm": 3.1016061305999756, "learning_rate": 4.638882982493444e-06, "loss": 1.3224, "step": 12800 }, { "epoch": 2.764193068254306, "grad_norm": 1.890191912651062, "learning_rate": 3.930115529094904e-06, "loss": 1.3527, "step": 13000 }, { "epoch": 2.8067191154582183, "grad_norm": 2.897411346435547, "learning_rate": 3.221348075696364e-06, "loss": 1.3423, "step": 13200 }, { "epoch": 2.8492451626621307, "grad_norm": 2.7592458724975586, "learning_rate": 2.512580622297824e-06, "loss": 1.3389, "step": 13400 }, { "epoch": 2.891771209866043, "grad_norm": 2.054417610168457, "learning_rate": 1.8038131688992842e-06, "loss": 1.3143, "step": 13600 }, { "epoch": 2.9342972570699555, "grad_norm": 2.7590255737304688, "learning_rate": 1.0950457155007443e-06, "loss": 1.3156, "step": 13800 }, { "epoch": 2.976823304273868, "grad_norm": 2.5642201900482178, "learning_rate": 3.862782621022043e-07, "loss": 1.3214, "step": 14000 } ], "logging_steps": 200, "max_steps": 14109, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3685981372416000.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }