ggml-org
/

stories15M_MOE

Text Generation

text-generation-inference

Inference Endpoints

Model card Files Files and versions Community

stories15M_MOE / moe_shakespeare15M /checkpoint-400 /trainer_state.json

ngxson's picture

ngxson HF staff

train

83f7dbb 2 months ago

history blame contribute delete

No virus

3.69 kB

	{
	"best_metric": null,
	"best_model_checkpoint": null,
	"epoch": 333.0,
	"eval_steps": 500,
	"global_step": 400,
	"is_hyper_param_search": false,
	"is_local_process_zero": true,
	"is_world_process_zero": true,
	"log_history": [
	{
	"epoch": 17.0,
	"grad_norm": 6.09660530090332,
	"learning_rate": 5e-06,
	"loss": 2.5103,
	"step": 20
	},
	{
	"epoch": 33.0,
	"grad_norm": 1.7365801334381104,
	"learning_rate": 1e-05,
	"loss": 2.1374,
	"step": 40
	},
	{
	"epoch": 50.0,
	"grad_norm": 1.1849949359893799,
	"learning_rate": 1.5e-05,
	"loss": 2.0202,
	"step": 60
	},
	{
	"epoch": 67.0,
	"grad_norm": 0.8566276431083679,
	"learning_rate": 2e-05,
	"loss": 1.8209,
	"step": 80
	},
	{
	"epoch": 83.0,
	"grad_norm": 0.733796238899231,
	"learning_rate": 2.5e-05,
	"loss": 1.5589,
	"step": 100
	},
	{
	"epoch": 100.0,
	"grad_norm": 1.1494934558868408,
	"learning_rate": 2.375e-05,
	"loss": 1.4981,
	"step": 120
	},
	{
	"epoch": 117.0,
	"grad_norm": 1.0989091396331787,
	"learning_rate": 2.25e-05,
	"loss": 1.3532,
	"step": 140
	},
	{
	"epoch": 133.0,
	"grad_norm": 1.2396273612976074,
	"learning_rate": 2.125e-05,
	"loss": 1.1437,
	"step": 160
	},
	{
	"epoch": 150.0,
	"grad_norm": 1.9112813472747803,
	"learning_rate": 2e-05,
	"loss": 1.0889,
	"step": 180
	},
	{
	"epoch": 167.0,
	"grad_norm": 1.9788281917572021,
	"learning_rate": 1.8750000000000002e-05,
	"loss": 0.9825,
	"step": 200
	},
	{
	"epoch": 183.0,
	"grad_norm": 2.772813081741333,
	"learning_rate": 1.75e-05,
	"loss": 0.832,
	"step": 220
	},
	{
	"epoch": 200.0,
	"grad_norm": 2.384052038192749,
	"learning_rate": 1.6250000000000002e-05,
	"loss": 0.7939,
	"step": 240
	},
	{
	"epoch": 217.0,
	"grad_norm": 2.423220634460449,
	"learning_rate": 1.5e-05,
	"loss": 0.7084,
	"step": 260
	},
	{
	"epoch": 233.0,
	"grad_norm": 2.2296969890594482,
	"learning_rate": 1.3750000000000002e-05,
	"loss": 0.5999,
	"step": 280
	},
	{
	"epoch": 250.0,
	"grad_norm": 3.0638062953948975,
	"learning_rate": 1.25e-05,
	"loss": 0.5756,
	"step": 300
	},
	{
	"epoch": 267.0,
	"grad_norm": 3.0690948963165283,
	"learning_rate": 1.125e-05,
	"loss": 0.5252,
	"step": 320
	},
	{
	"epoch": 283.0,
	"grad_norm": 2.1726977825164795,
	"learning_rate": 1e-05,
	"loss": 0.4515,
	"step": 340
	},
	{
	"epoch": 300.0,
	"grad_norm": 2.711531639099121,
	"learning_rate": 8.75e-06,
	"loss": 0.4445,
	"step": 360
	},
	{
	"epoch": 317.0,
	"grad_norm": 2.583651542663574,
	"learning_rate": 7.5e-06,
	"loss": 0.4179,
	"step": 380
	},
	{
	"epoch": 333.0,
	"grad_norm": 2.864351511001587,
	"learning_rate": 6.25e-06,
	"loss": 0.363,
	"step": 400
	}
	],
	"logging_steps": 20,
	"max_steps": 500,
	"num_input_tokens_seen": 0,
	"num_train_epochs": 500,
	"save_steps": 100,
	"stateful_callbacks": {
	"TrainerControl": {
	"args": {
	"should_epoch_stop": false,
	"should_evaluate": false,
	"should_log": false,
	"should_save": true,
	"should_training_stop": false
	},
	"attributes": {}
	}
	},
	"total_flos": 1439972197574400.0,
	"train_batch_size": 50,
	"trial_name": null,
	"trial_params": null
	}