answer / trainer_state.json
hawkling's picture
Model save
9ec1f88 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 884,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 3.014728639868492,
"learning_rate": 0.0001,
"loss": 3.3438,
"step": 1
},
{
"epoch": 0.01,
"grad_norm": 5.07497563513899,
"learning_rate": 0.0001,
"loss": 3.4844,
"step": 2
},
{
"epoch": 0.01,
"grad_norm": 3.0683045636143302,
"learning_rate": 0.0001,
"loss": 3.125,
"step": 3
},
{
"epoch": 0.02,
"grad_norm": 3.341314586764068,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 4
},
{
"epoch": 0.02,
"grad_norm": 4.1805565704564955,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 5
},
{
"epoch": 0.03,
"grad_norm": 4.244329296572686,
"learning_rate": 0.0001,
"loss": 3.6562,
"step": 6
},
{
"epoch": 0.03,
"grad_norm": 4.595711743609493,
"learning_rate": 0.0001,
"loss": 3.7344,
"step": 7
},
{
"epoch": 0.04,
"grad_norm": 3.4778447381697863,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 8
},
{
"epoch": 0.04,
"grad_norm": 3.9898264241527004,
"learning_rate": 0.0001,
"loss": 3.4219,
"step": 9
},
{
"epoch": 0.05,
"grad_norm": 4.542104616744462,
"learning_rate": 0.0001,
"loss": 3.2656,
"step": 10
},
{
"epoch": 0.05,
"grad_norm": 3.4277356413883804,
"learning_rate": 0.0001,
"loss": 3.2188,
"step": 11
},
{
"epoch": 0.05,
"grad_norm": 3.2437025610823427,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 12
},
{
"epoch": 0.06,
"grad_norm": 4.305702485290415,
"learning_rate": 0.0001,
"loss": 3.1406,
"step": 13
},
{
"epoch": 0.06,
"grad_norm": 2.7399497184119506,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 14
},
{
"epoch": 0.07,
"grad_norm": 3.203877548043482,
"learning_rate": 0.0001,
"loss": 3.3281,
"step": 15
},
{
"epoch": 0.07,
"grad_norm": 2.7598651340996145,
"learning_rate": 0.0001,
"loss": 3.4531,
"step": 16
},
{
"epoch": 0.08,
"grad_norm": 2.8794034742734076,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 17
},
{
"epoch": 0.08,
"grad_norm": 2.716863791265328,
"learning_rate": 0.0001,
"loss": 3.2812,
"step": 18
},
{
"epoch": 0.09,
"grad_norm": 2.6209919053501953,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 19
},
{
"epoch": 0.09,
"grad_norm": 5.648879316727216,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 20
},
{
"epoch": 0.1,
"grad_norm": 3.110008991960499,
"learning_rate": 0.0001,
"loss": 3.2188,
"step": 21
},
{
"epoch": 0.1,
"grad_norm": 2.9713952473944705,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 22
},
{
"epoch": 0.1,
"grad_norm": 3.7018343790938353,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 23
},
{
"epoch": 0.11,
"grad_norm": 3.4131899574811118,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 24
},
{
"epoch": 0.11,
"grad_norm": 3.298790786105439,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 25
},
{
"epoch": 0.12,
"grad_norm": 2.5966196850680148,
"learning_rate": 0.0001,
"loss": 3.2031,
"step": 26
},
{
"epoch": 0.12,
"grad_norm": 3.875902211805855,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 27
},
{
"epoch": 0.13,
"grad_norm": 2.765858168009748,
"learning_rate": 0.0001,
"loss": 3.2031,
"step": 28
},
{
"epoch": 0.13,
"grad_norm": 3.070259667441601,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 29
},
{
"epoch": 0.14,
"grad_norm": 3.3005916640548,
"learning_rate": 0.0001,
"loss": 3.25,
"step": 30
},
{
"epoch": 0.14,
"grad_norm": 2.7606571016769332,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 31
},
{
"epoch": 0.14,
"grad_norm": 2.1981380713510323,
"learning_rate": 0.0001,
"loss": 3.1406,
"step": 32
},
{
"epoch": 0.15,
"grad_norm": 3.5672449144444,
"learning_rate": 0.0001,
"loss": 3.3125,
"step": 33
},
{
"epoch": 0.15,
"grad_norm": 2.646603965279102,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 34
},
{
"epoch": 0.16,
"grad_norm": 3.2248743875212385,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 35
},
{
"epoch": 0.16,
"grad_norm": 2.8724583239645565,
"learning_rate": 0.0001,
"loss": 3.3438,
"step": 36
},
{
"epoch": 0.17,
"grad_norm": 3.5297221873351754,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 37
},
{
"epoch": 0.17,
"grad_norm": 2.5275460394914675,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 38
},
{
"epoch": 0.18,
"grad_norm": 2.358392438696745,
"learning_rate": 0.0001,
"loss": 3.2344,
"step": 39
},
{
"epoch": 0.18,
"grad_norm": 2.55912275345912,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 40
},
{
"epoch": 0.19,
"grad_norm": 4.363728235132881,
"learning_rate": 0.0001,
"loss": 3.2344,
"step": 41
},
{
"epoch": 0.19,
"grad_norm": 4.175630702744953,
"learning_rate": 0.0001,
"loss": 3.1719,
"step": 42
},
{
"epoch": 0.19,
"grad_norm": 2.805357918124218,
"learning_rate": 0.0001,
"loss": 3.0156,
"step": 43
},
{
"epoch": 0.2,
"grad_norm": 2.993779953713576,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 44
},
{
"epoch": 0.2,
"grad_norm": 3.039174383300467,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 45
},
{
"epoch": 0.21,
"grad_norm": 3.4583649396718203,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 46
},
{
"epoch": 0.21,
"grad_norm": 2.400384244467908,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 47
},
{
"epoch": 0.22,
"grad_norm": 2.9016353799852213,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 48
},
{
"epoch": 0.22,
"grad_norm": 3.12493864757068,
"learning_rate": 0.0001,
"loss": 3.0156,
"step": 49
},
{
"epoch": 0.23,
"grad_norm": 2.8938775159043613,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 50
},
{
"epoch": 0.23,
"grad_norm": 4.890651173471067,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 51
},
{
"epoch": 0.24,
"grad_norm": 3.614938570857581,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 52
},
{
"epoch": 0.24,
"grad_norm": 2.969969594116141,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 53
},
{
"epoch": 0.24,
"grad_norm": 2.7352993614346293,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 54
},
{
"epoch": 0.25,
"grad_norm": 3.1359556182594974,
"learning_rate": 0.0001,
"loss": 3.2188,
"step": 55
},
{
"epoch": 0.25,
"grad_norm": 2.843122039034664,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 56
},
{
"epoch": 0.26,
"grad_norm": 3.3288764649012377,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 57
},
{
"epoch": 0.26,
"grad_norm": 2.8165528057214244,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 58
},
{
"epoch": 0.27,
"grad_norm": 2.5191401715518427,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 59
},
{
"epoch": 0.27,
"grad_norm": 2.1379077630223975,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 60
},
{
"epoch": 0.28,
"grad_norm": 2.124369818711383,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 61
},
{
"epoch": 0.28,
"grad_norm": 4.132972808670283,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 62
},
{
"epoch": 0.29,
"grad_norm": 2.9614040202740735,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 63
},
{
"epoch": 0.29,
"grad_norm": 2.8972234185293777,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 64
},
{
"epoch": 0.29,
"grad_norm": 2.6678504925494466,
"learning_rate": 0.0001,
"loss": 3.0781,
"step": 65
},
{
"epoch": 0.3,
"grad_norm": 3.911760340298777,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 66
},
{
"epoch": 0.3,
"grad_norm": 2.522073404974725,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 67
},
{
"epoch": 0.31,
"grad_norm": 2.873975472150959,
"learning_rate": 0.0001,
"loss": 3.0781,
"step": 68
},
{
"epoch": 0.31,
"grad_norm": 6.248241805309187,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 69
},
{
"epoch": 0.32,
"grad_norm": 3.734654438703655,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 70
},
{
"epoch": 0.32,
"grad_norm": 2.914708142756069,
"learning_rate": 0.0001,
"loss": 3.4688,
"step": 71
},
{
"epoch": 0.33,
"grad_norm": 3.0704529793274205,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 72
},
{
"epoch": 0.33,
"grad_norm": 2.7714788446837417,
"learning_rate": 0.0001,
"loss": 3.2344,
"step": 73
},
{
"epoch": 0.33,
"grad_norm": 2.1777578325193923,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 74
},
{
"epoch": 0.34,
"grad_norm": 2.7344906874924058,
"learning_rate": 0.0001,
"loss": 3.1719,
"step": 75
},
{
"epoch": 0.34,
"grad_norm": 2.8946934280105747,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 76
},
{
"epoch": 0.35,
"grad_norm": 3.1542104061738323,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 77
},
{
"epoch": 0.35,
"grad_norm": 2.762285352941279,
"learning_rate": 0.0001,
"loss": 3.3125,
"step": 78
},
{
"epoch": 0.36,
"grad_norm": 3.5687661240845703,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 79
},
{
"epoch": 0.36,
"grad_norm": 4.354199830221798,
"learning_rate": 0.0001,
"loss": 3.2344,
"step": 80
},
{
"epoch": 0.37,
"grad_norm": 3.982271568285038,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 81
},
{
"epoch": 0.37,
"grad_norm": 2.6629483529030193,
"learning_rate": 0.0001,
"loss": 3.2656,
"step": 82
},
{
"epoch": 0.38,
"grad_norm": 3.097410495508935,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 83
},
{
"epoch": 0.38,
"grad_norm": 3.4293559919795342,
"learning_rate": 0.0001,
"loss": 3.1875,
"step": 84
},
{
"epoch": 0.38,
"grad_norm": 3.016797304151073,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 85
},
{
"epoch": 0.39,
"grad_norm": 2.6266954910547033,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 86
},
{
"epoch": 0.39,
"grad_norm": 2.691102686844325,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 87
},
{
"epoch": 0.4,
"grad_norm": 3.0573753479866896,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 88
},
{
"epoch": 0.4,
"grad_norm": 3.248776325364024,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 89
},
{
"epoch": 0.41,
"grad_norm": 2.684435268310462,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 90
},
{
"epoch": 0.41,
"grad_norm": 2.9459506953365486,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 91
},
{
"epoch": 0.42,
"grad_norm": 3.6777239852969212,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 92
},
{
"epoch": 0.42,
"grad_norm": 3.687978088563062,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 93
},
{
"epoch": 0.43,
"grad_norm": 3.625427110996173,
"learning_rate": 0.0001,
"loss": 3.1875,
"step": 94
},
{
"epoch": 0.43,
"grad_norm": 3.364572491761355,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 95
},
{
"epoch": 0.43,
"grad_norm": 4.003287810433365,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 96
},
{
"epoch": 0.44,
"grad_norm": 3.7602829324063642,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 97
},
{
"epoch": 0.44,
"grad_norm": 2.8680654062417377,
"learning_rate": 0.0001,
"loss": 2.4688,
"step": 98
},
{
"epoch": 0.45,
"grad_norm": 2.3377680005099393,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 99
},
{
"epoch": 0.45,
"grad_norm": 4.080756825063626,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 100
},
{
"epoch": 0.46,
"grad_norm": 3.5824032205107357,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 101
},
{
"epoch": 0.46,
"grad_norm": 3.572874096937035,
"learning_rate": 0.0001,
"loss": 3.0469,
"step": 102
},
{
"epoch": 0.47,
"grad_norm": 3.173554273811993,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 103
},
{
"epoch": 0.47,
"grad_norm": 3.374295066463168,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 104
},
{
"epoch": 0.48,
"grad_norm": 3.546675683646743,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 105
},
{
"epoch": 0.48,
"grad_norm": 2.8610685415267687,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 106
},
{
"epoch": 0.48,
"grad_norm": 3.739045406962109,
"learning_rate": 0.0001,
"loss": 3.25,
"step": 107
},
{
"epoch": 0.49,
"grad_norm": 2.963531109735375,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 108
},
{
"epoch": 0.49,
"grad_norm": 3.269922121525073,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 109
},
{
"epoch": 0.5,
"grad_norm": 3.1227369643458402,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 110
},
{
"epoch": 0.5,
"grad_norm": 3.0619044281415593,
"learning_rate": 0.0001,
"loss": 3.1719,
"step": 111
},
{
"epoch": 0.51,
"grad_norm": 3.502108277442133,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 112
},
{
"epoch": 0.51,
"grad_norm": 3.4217887133458893,
"learning_rate": 0.0001,
"loss": 3.0469,
"step": 113
},
{
"epoch": 0.52,
"grad_norm": 4.071458514694356,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 114
},
{
"epoch": 0.52,
"grad_norm": 3.8820655057810574,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 115
},
{
"epoch": 0.52,
"grad_norm": 2.425526328303125,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 116
},
{
"epoch": 0.53,
"grad_norm": 2.5725735012774393,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 117
},
{
"epoch": 0.53,
"grad_norm": 2.570192135060652,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 118
},
{
"epoch": 0.54,
"grad_norm": 4.518786389858919,
"learning_rate": 0.0001,
"loss": 3.3906,
"step": 119
},
{
"epoch": 0.54,
"grad_norm": 2.781462280343676,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 120
},
{
"epoch": 0.55,
"grad_norm": 4.0169315946259525,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 121
},
{
"epoch": 0.55,
"grad_norm": 3.3904299413281547,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 122
},
{
"epoch": 0.56,
"grad_norm": 2.767985407126416,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 123
},
{
"epoch": 0.56,
"grad_norm": 3.350209228567863,
"learning_rate": 0.0001,
"loss": 3.0156,
"step": 124
},
{
"epoch": 0.57,
"grad_norm": 3.7716820325330063,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 125
},
{
"epoch": 0.57,
"grad_norm": 2.6172676918246065,
"learning_rate": 0.0001,
"loss": 3.1406,
"step": 126
},
{
"epoch": 0.57,
"grad_norm": 2.8615732602702617,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 127
},
{
"epoch": 0.58,
"grad_norm": 3.1977593475641566,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 128
},
{
"epoch": 0.58,
"grad_norm": 3.7044980154353344,
"learning_rate": 0.0001,
"loss": 3.0781,
"step": 129
},
{
"epoch": 0.59,
"grad_norm": 2.4708896342997333,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 130
},
{
"epoch": 0.59,
"grad_norm": 2.973855966147942,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 131
},
{
"epoch": 0.6,
"grad_norm": 2.7119122676528393,
"learning_rate": 0.0001,
"loss": 2.3594,
"step": 132
},
{
"epoch": 0.6,
"grad_norm": 2.6702654024783024,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 133
},
{
"epoch": 0.61,
"grad_norm": 3.7431706355808334,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 134
},
{
"epoch": 0.61,
"grad_norm": 5.077035030726395,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 135
},
{
"epoch": 0.62,
"grad_norm": 3.690111157612484,
"learning_rate": 0.0001,
"loss": 3.0469,
"step": 136
},
{
"epoch": 0.62,
"grad_norm": 2.9422207378285203,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 137
},
{
"epoch": 0.62,
"grad_norm": 4.791349492274715,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 138
},
{
"epoch": 0.63,
"grad_norm": 2.5077421362522108,
"learning_rate": 0.0001,
"loss": 3.0781,
"step": 139
},
{
"epoch": 0.63,
"grad_norm": 3.1415791107859667,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 140
},
{
"epoch": 0.64,
"grad_norm": 3.655269108842213,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 141
},
{
"epoch": 0.64,
"grad_norm": 3.1238567068777745,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 142
},
{
"epoch": 0.65,
"grad_norm": 2.6177206881858,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 143
},
{
"epoch": 0.65,
"grad_norm": 2.843884596485035,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 144
},
{
"epoch": 0.66,
"grad_norm": 4.63778601469586,
"learning_rate": 0.0001,
"loss": 3.4062,
"step": 145
},
{
"epoch": 0.66,
"grad_norm": 2.2470659627608582,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 146
},
{
"epoch": 0.67,
"grad_norm": 2.597522080096179,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 147
},
{
"epoch": 0.67,
"grad_norm": 3.3910575990800633,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 148
},
{
"epoch": 0.67,
"grad_norm": 3.0899424465475698,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 149
},
{
"epoch": 0.68,
"grad_norm": 2.8859452606243186,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 150
},
{
"epoch": 0.68,
"grad_norm": 4.272675820285552,
"learning_rate": 0.0001,
"loss": 3.125,
"step": 151
},
{
"epoch": 0.69,
"grad_norm": 3.258577402368087,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 152
},
{
"epoch": 0.69,
"grad_norm": 4.136534066435656,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 153
},
{
"epoch": 0.7,
"grad_norm": 3.503047467842312,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 154
},
{
"epoch": 0.7,
"grad_norm": 3.2423987815492694,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 155
},
{
"epoch": 0.71,
"grad_norm": 3.4377222090009774,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 156
},
{
"epoch": 0.71,
"grad_norm": 3.1680307697838144,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 157
},
{
"epoch": 0.71,
"grad_norm": 3.7903929164222423,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 158
},
{
"epoch": 0.72,
"grad_norm": 3.9152200948164633,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 159
},
{
"epoch": 0.72,
"grad_norm": 3.148242879456836,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 160
},
{
"epoch": 0.73,
"grad_norm": 3.2016489522036387,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 161
},
{
"epoch": 0.73,
"grad_norm": 2.6290757877359283,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 162
},
{
"epoch": 0.74,
"grad_norm": 3.3331963669130316,
"learning_rate": 0.0001,
"loss": 3.1406,
"step": 163
},
{
"epoch": 0.74,
"grad_norm": 3.7860541087442985,
"learning_rate": 0.0001,
"loss": 3.3906,
"step": 164
},
{
"epoch": 0.75,
"grad_norm": 2.6724816282540425,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 165
},
{
"epoch": 0.75,
"grad_norm": 2.7967393894511416,
"learning_rate": 0.0001,
"loss": 3.375,
"step": 166
},
{
"epoch": 0.76,
"grad_norm": 3.3937925854061146,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 167
},
{
"epoch": 0.76,
"grad_norm": 4.384654503841222,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 168
},
{
"epoch": 0.76,
"grad_norm": 3.9320332448378066,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 169
},
{
"epoch": 0.77,
"grad_norm": 3.8025261019973815,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 170
},
{
"epoch": 0.77,
"grad_norm": 2.7469268151387083,
"learning_rate": 0.0001,
"loss": 3.3438,
"step": 171
},
{
"epoch": 0.78,
"grad_norm": 2.728521201067668,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 172
},
{
"epoch": 0.78,
"grad_norm": 2.945552120230828,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 173
},
{
"epoch": 0.79,
"grad_norm": 3.5474098558831555,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 174
},
{
"epoch": 0.79,
"grad_norm": 2.682553203483424,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 175
},
{
"epoch": 0.8,
"grad_norm": 3.454442321478634,
"learning_rate": 0.0001,
"loss": 3.2031,
"step": 176
},
{
"epoch": 0.8,
"grad_norm": 3.4711419275650197,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 177
},
{
"epoch": 0.81,
"grad_norm": 2.96972797034164,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 178
},
{
"epoch": 0.81,
"grad_norm": 3.241804222438612,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 179
},
{
"epoch": 0.81,
"grad_norm": 2.591008613141231,
"learning_rate": 0.0001,
"loss": 3.125,
"step": 180
},
{
"epoch": 0.82,
"grad_norm": 3.1836736658810905,
"learning_rate": 0.0001,
"loss": 2.4531,
"step": 181
},
{
"epoch": 0.82,
"grad_norm": 2.6696567939120377,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 182
},
{
"epoch": 0.83,
"grad_norm": 4.153665896714593,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 183
},
{
"epoch": 0.83,
"grad_norm": 3.632729399655711,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 184
},
{
"epoch": 0.84,
"grad_norm": 2.3690150169249673,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 185
},
{
"epoch": 0.84,
"grad_norm": 3.606188859835758,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 186
},
{
"epoch": 0.85,
"grad_norm": 2.8610059159346135,
"learning_rate": 0.0001,
"loss": 3.0781,
"step": 187
},
{
"epoch": 0.85,
"grad_norm": 3.162526657575551,
"learning_rate": 0.0001,
"loss": 3.25,
"step": 188
},
{
"epoch": 0.86,
"grad_norm": 3.3799373047880463,
"learning_rate": 0.0001,
"loss": 3.1719,
"step": 189
},
{
"epoch": 0.86,
"grad_norm": 3.636158335840892,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 190
},
{
"epoch": 0.86,
"grad_norm": 2.628330891784058,
"learning_rate": 0.0001,
"loss": 2.1719,
"step": 191
},
{
"epoch": 0.87,
"grad_norm": 3.1369675986549543,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 192
},
{
"epoch": 0.87,
"grad_norm": 2.9451234006615112,
"learning_rate": 0.0001,
"loss": 2.4375,
"step": 193
},
{
"epoch": 0.88,
"grad_norm": 3.1049660335818894,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 194
},
{
"epoch": 0.88,
"grad_norm": 2.746393657093893,
"learning_rate": 0.0001,
"loss": 3.1406,
"step": 195
},
{
"epoch": 0.89,
"grad_norm": 3.313442015397048,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 196
},
{
"epoch": 0.89,
"grad_norm": 3.5237073828607777,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 197
},
{
"epoch": 0.9,
"grad_norm": 3.993671122520398,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 198
},
{
"epoch": 0.9,
"grad_norm": 3.7104137607661096,
"learning_rate": 0.0001,
"loss": 3.0156,
"step": 199
},
{
"epoch": 0.9,
"grad_norm": 2.63732458277305,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 200
},
{
"epoch": 0.91,
"grad_norm": 2.637164098858904,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 201
},
{
"epoch": 0.91,
"grad_norm": 2.7067823222285443,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 202
},
{
"epoch": 0.92,
"grad_norm": 3.20499440039486,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 203
},
{
"epoch": 0.92,
"grad_norm": 3.390167457388061,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 204
},
{
"epoch": 0.93,
"grad_norm": 2.9755718114652785,
"learning_rate": 0.0001,
"loss": 3.25,
"step": 205
},
{
"epoch": 0.93,
"grad_norm": 2.9858024159992387,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 206
},
{
"epoch": 0.94,
"grad_norm": 3.49719960170014,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 207
},
{
"epoch": 0.94,
"grad_norm": 2.760481519803184,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 208
},
{
"epoch": 0.95,
"grad_norm": 3.5361985653549035,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 209
},
{
"epoch": 0.95,
"grad_norm": 2.7189516787553702,
"learning_rate": 0.0001,
"loss": 3.1719,
"step": 210
},
{
"epoch": 0.95,
"grad_norm": 2.7444161167117778,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 211
},
{
"epoch": 0.96,
"grad_norm": 4.178779298351167,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 212
},
{
"epoch": 0.96,
"grad_norm": 5.1382962163861245,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 213
},
{
"epoch": 0.97,
"grad_norm": 4.777877381510249,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 214
},
{
"epoch": 0.97,
"grad_norm": 4.483777499230272,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 215
},
{
"epoch": 0.98,
"grad_norm": 2.6505694947027414,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 216
},
{
"epoch": 0.98,
"grad_norm": 2.8266381222956296,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 217
},
{
"epoch": 0.99,
"grad_norm": 2.7454666967878922,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 218
},
{
"epoch": 0.99,
"grad_norm": 3.2359889427504243,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 219
},
{
"epoch": 1.0,
"grad_norm": 3.2617408981943914,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 220
},
{
"epoch": 1.0,
"grad_norm": 3.761832594055263,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 221
},
{
"epoch": 1.0,
"grad_norm": 3.429839617150036,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 222
},
{
"epoch": 1.01,
"grad_norm": 3.472012057501413,
"learning_rate": 0.0001,
"loss": 3.1406,
"step": 223
},
{
"epoch": 1.01,
"grad_norm": 4.347859361063026,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 224
},
{
"epoch": 1.02,
"grad_norm": 3.7603120152326337,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 225
},
{
"epoch": 1.02,
"grad_norm": 3.141198806856796,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 226
},
{
"epoch": 1.03,
"grad_norm": 2.27575220665026,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 227
},
{
"epoch": 1.03,
"grad_norm": 3.292652786163134,
"learning_rate": 0.0001,
"loss": 3.0156,
"step": 228
},
{
"epoch": 1.04,
"grad_norm": 2.8521097618658215,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 229
},
{
"epoch": 1.04,
"grad_norm": 3.5683909346483116,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 230
},
{
"epoch": 1.05,
"grad_norm": 2.048666439224908,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 231
},
{
"epoch": 1.05,
"grad_norm": 3.226473060669876,
"learning_rate": 0.0001,
"loss": 3.0469,
"step": 232
},
{
"epoch": 1.05,
"grad_norm": 2.560468630545825,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 233
},
{
"epoch": 1.06,
"grad_norm": 2.8610577465558875,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 234
},
{
"epoch": 1.06,
"grad_norm": 3.276597524431846,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 235
},
{
"epoch": 1.07,
"grad_norm": 3.11183393883472,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 236
},
{
"epoch": 1.07,
"grad_norm": 2.731944444220084,
"learning_rate": 0.0001,
"loss": 3.0469,
"step": 237
},
{
"epoch": 1.08,
"grad_norm": 3.439580204033313,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 238
},
{
"epoch": 1.08,
"grad_norm": 3.850548595357673,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 239
},
{
"epoch": 1.09,
"grad_norm": 3.032256388909145,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 240
},
{
"epoch": 1.09,
"grad_norm": 2.816746313771955,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 241
},
{
"epoch": 1.1,
"grad_norm": 3.555308458291825,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 242
},
{
"epoch": 1.1,
"grad_norm": 3.1232590262180104,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 243
},
{
"epoch": 1.1,
"grad_norm": 2.841004178819089,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 244
},
{
"epoch": 1.11,
"grad_norm": 3.169879937448116,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 245
},
{
"epoch": 1.11,
"grad_norm": 3.756621242224477,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 246
},
{
"epoch": 1.12,
"grad_norm": 3.291591115732283,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 247
},
{
"epoch": 1.12,
"grad_norm": 3.9491635337617876,
"learning_rate": 0.0001,
"loss": 3.2031,
"step": 248
},
{
"epoch": 1.13,
"grad_norm": 4.5454750211170705,
"learning_rate": 0.0001,
"loss": 2.4219,
"step": 249
},
{
"epoch": 1.13,
"grad_norm": 3.1194188990231173,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 250
},
{
"epoch": 1.14,
"grad_norm": 3.1708709674001154,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 251
},
{
"epoch": 1.14,
"grad_norm": 3.769197611076157,
"learning_rate": 0.0001,
"loss": 2.4062,
"step": 252
},
{
"epoch": 1.14,
"grad_norm": 3.1034571175550734,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 253
},
{
"epoch": 1.15,
"grad_norm": 3.3624038577718443,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 254
},
{
"epoch": 1.15,
"grad_norm": 3.695977386674938,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 255
},
{
"epoch": 1.16,
"grad_norm": 3.520629660314111,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 256
},
{
"epoch": 1.16,
"grad_norm": 3.6643635174993214,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 257
},
{
"epoch": 1.17,
"grad_norm": 3.6400728396728788,
"learning_rate": 0.0001,
"loss": 2.3281,
"step": 258
},
{
"epoch": 1.17,
"grad_norm": 3.1938129371502533,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 259
},
{
"epoch": 1.18,
"grad_norm": 4.070065400459323,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 260
},
{
"epoch": 1.18,
"grad_norm": 3.8599798135960857,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 261
},
{
"epoch": 1.19,
"grad_norm": 4.0800675518707425,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 262
},
{
"epoch": 1.19,
"grad_norm": 2.88070935560625,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 263
},
{
"epoch": 1.19,
"grad_norm": 3.6143748192147958,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 264
},
{
"epoch": 1.2,
"grad_norm": 4.033368015293004,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 265
},
{
"epoch": 1.2,
"grad_norm": 3.1096713230086754,
"learning_rate": 0.0001,
"loss": 3.1562,
"step": 266
},
{
"epoch": 1.21,
"grad_norm": 3.53880392811516,
"learning_rate": 0.0001,
"loss": 2.2188,
"step": 267
},
{
"epoch": 1.21,
"grad_norm": 3.9253800416766644,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 268
},
{
"epoch": 1.22,
"grad_norm": 3.4594381842770394,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 269
},
{
"epoch": 1.22,
"grad_norm": 3.868070923795578,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 270
},
{
"epoch": 1.23,
"grad_norm": 3.234829607238394,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 271
},
{
"epoch": 1.23,
"grad_norm": 3.823186245574893,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 272
},
{
"epoch": 1.24,
"grad_norm": 3.0994313335043002,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 273
},
{
"epoch": 1.24,
"grad_norm": 3.6618700341670536,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 274
},
{
"epoch": 1.24,
"grad_norm": 4.245984922959924,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 275
},
{
"epoch": 1.25,
"grad_norm": 3.413612437224495,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 276
},
{
"epoch": 1.25,
"grad_norm": 3.6451564739213547,
"learning_rate": 0.0001,
"loss": 3.0156,
"step": 277
},
{
"epoch": 1.26,
"grad_norm": 4.166476678137405,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 278
},
{
"epoch": 1.26,
"grad_norm": 3.724493167963291,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 279
},
{
"epoch": 1.27,
"grad_norm": 3.395942991582062,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 280
},
{
"epoch": 1.27,
"grad_norm": 3.5527811358540693,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 281
},
{
"epoch": 1.28,
"grad_norm": 5.687097991739583,
"learning_rate": 0.0001,
"loss": 3.2188,
"step": 282
},
{
"epoch": 1.28,
"grad_norm": 4.6245839813569045,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 283
},
{
"epoch": 1.29,
"grad_norm": 4.001745122577989,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 284
},
{
"epoch": 1.29,
"grad_norm": 4.2065745171783675,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 285
},
{
"epoch": 1.29,
"grad_norm": 3.763842873535832,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 286
},
{
"epoch": 1.3,
"grad_norm": 3.4297749721988753,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 287
},
{
"epoch": 1.3,
"grad_norm": 5.112060409623577,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 288
},
{
"epoch": 1.31,
"grad_norm": 4.765001988028911,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 289
},
{
"epoch": 1.31,
"grad_norm": 4.025477764290351,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 290
},
{
"epoch": 1.32,
"grad_norm": 3.533207269511524,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 291
},
{
"epoch": 1.32,
"grad_norm": 3.894629067717557,
"learning_rate": 0.0001,
"loss": 2.3438,
"step": 292
},
{
"epoch": 1.33,
"grad_norm": 3.8265063420175554,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 293
},
{
"epoch": 1.33,
"grad_norm": 5.387518896603435,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 294
},
{
"epoch": 1.33,
"grad_norm": 3.462770499090204,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 295
},
{
"epoch": 1.34,
"grad_norm": 3.999081454001175,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 296
},
{
"epoch": 1.34,
"grad_norm": 3.26791217269049,
"learning_rate": 0.0001,
"loss": 3.0156,
"step": 297
},
{
"epoch": 1.35,
"grad_norm": 4.01576665487724,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 298
},
{
"epoch": 1.35,
"grad_norm": 3.4357297956359223,
"learning_rate": 0.0001,
"loss": 3.2188,
"step": 299
},
{
"epoch": 1.36,
"grad_norm": 3.991044885131129,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 300
},
{
"epoch": 1.36,
"grad_norm": 3.2208290782529088,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 301
},
{
"epoch": 1.37,
"grad_norm": 3.5358550835047375,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 302
},
{
"epoch": 1.37,
"grad_norm": 3.25374956312475,
"learning_rate": 0.0001,
"loss": 3.1719,
"step": 303
},
{
"epoch": 1.38,
"grad_norm": 3.260985818723821,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 304
},
{
"epoch": 1.38,
"grad_norm": 3.9257270591651534,
"learning_rate": 0.0001,
"loss": 3.125,
"step": 305
},
{
"epoch": 1.38,
"grad_norm": 3.167203120292308,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 306
},
{
"epoch": 1.39,
"grad_norm": 4.694459033055643,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 307
},
{
"epoch": 1.39,
"grad_norm": 3.6064245290727452,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 308
},
{
"epoch": 1.4,
"grad_norm": 3.4765920276238136,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 309
},
{
"epoch": 1.4,
"grad_norm": 4.168438398393131,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 310
},
{
"epoch": 1.41,
"grad_norm": 4.641284910447617,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 311
},
{
"epoch": 1.41,
"grad_norm": 3.165019356899751,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 312
},
{
"epoch": 1.42,
"grad_norm": 6.293667183946401,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 313
},
{
"epoch": 1.42,
"grad_norm": 3.8581829580543574,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 314
},
{
"epoch": 1.43,
"grad_norm": 4.0595835955067505,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 315
},
{
"epoch": 1.43,
"grad_norm": 3.804012147939095,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 316
},
{
"epoch": 1.43,
"grad_norm": 2.8757598500054873,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 317
},
{
"epoch": 1.44,
"grad_norm": 3.473056721459923,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 318
},
{
"epoch": 1.44,
"grad_norm": 4.030824580859273,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 319
},
{
"epoch": 1.45,
"grad_norm": 3.7575238271559557,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 320
},
{
"epoch": 1.45,
"grad_norm": 4.319465423783974,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 321
},
{
"epoch": 1.46,
"grad_norm": 3.4270491578278315,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 322
},
{
"epoch": 1.46,
"grad_norm": 4.597605413378048,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 323
},
{
"epoch": 1.47,
"grad_norm": 5.242593092794404,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 324
},
{
"epoch": 1.47,
"grad_norm": 3.5690314241373007,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 325
},
{
"epoch": 1.48,
"grad_norm": 3.170604037406789,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 326
},
{
"epoch": 1.48,
"grad_norm": 4.6838516697328,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 327
},
{
"epoch": 1.48,
"grad_norm": 3.2080896344291854,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 328
},
{
"epoch": 1.49,
"grad_norm": 5.326504169409197,
"learning_rate": 0.0001,
"loss": 3.0156,
"step": 329
},
{
"epoch": 1.49,
"grad_norm": 3.2789255992893596,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 330
},
{
"epoch": 1.5,
"grad_norm": 3.085516640743533,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 331
},
{
"epoch": 1.5,
"grad_norm": 4.194716526521928,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 332
},
{
"epoch": 1.51,
"grad_norm": 3.2525926179968034,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 333
},
{
"epoch": 1.51,
"grad_norm": 3.0977142194617313,
"learning_rate": 0.0001,
"loss": 2.5469,
"step": 334
},
{
"epoch": 1.52,
"grad_norm": 3.760665526282257,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 335
},
{
"epoch": 1.52,
"grad_norm": 3.7818001961714445,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 336
},
{
"epoch": 1.52,
"grad_norm": 3.6609563888659973,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 337
},
{
"epoch": 1.53,
"grad_norm": 3.982276940149541,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 338
},
{
"epoch": 1.53,
"grad_norm": 2.8042721280801484,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 339
},
{
"epoch": 1.54,
"grad_norm": 3.2615173671408555,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 340
},
{
"epoch": 1.54,
"grad_norm": 3.6914782762730924,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 341
},
{
"epoch": 1.55,
"grad_norm": 4.899453093729484,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 342
},
{
"epoch": 1.55,
"grad_norm": 4.291638036327811,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 343
},
{
"epoch": 1.56,
"grad_norm": 2.6028433980069865,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 344
},
{
"epoch": 1.56,
"grad_norm": 2.7549032124651736,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 345
},
{
"epoch": 1.57,
"grad_norm": 3.352486062521882,
"learning_rate": 0.0001,
"loss": 3.1562,
"step": 346
},
{
"epoch": 1.57,
"grad_norm": 2.8708894259256845,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 347
},
{
"epoch": 1.57,
"grad_norm": 5.037035733284411,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 348
},
{
"epoch": 1.58,
"grad_norm": 3.594206741562048,
"learning_rate": 0.0001,
"loss": 2.3438,
"step": 349
},
{
"epoch": 1.58,
"grad_norm": 5.159307734311026,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 350
},
{
"epoch": 1.59,
"grad_norm": 5.9233278621953644,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 351
},
{
"epoch": 1.59,
"grad_norm": 4.0715030156474254,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 352
},
{
"epoch": 1.6,
"grad_norm": 3.044655552777047,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 353
},
{
"epoch": 1.6,
"grad_norm": 3.7734627420835953,
"learning_rate": 0.0001,
"loss": 2.5781,
"step": 354
},
{
"epoch": 1.61,
"grad_norm": 5.180693081625301,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 355
},
{
"epoch": 1.61,
"grad_norm": 4.3219223924038745,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 356
},
{
"epoch": 1.62,
"grad_norm": 4.142269921917345,
"learning_rate": 0.0001,
"loss": 3.0469,
"step": 357
},
{
"epoch": 1.62,
"grad_norm": 3.3735403369190893,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 358
},
{
"epoch": 1.62,
"grad_norm": 3.5239894283719653,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 359
},
{
"epoch": 1.63,
"grad_norm": 3.8828475802810245,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 360
},
{
"epoch": 1.63,
"grad_norm": 4.693796163234429,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 361
},
{
"epoch": 1.64,
"grad_norm": 3.4075539488803925,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 362
},
{
"epoch": 1.64,
"grad_norm": 3.664547612895459,
"learning_rate": 0.0001,
"loss": 3.2188,
"step": 363
},
{
"epoch": 1.65,
"grad_norm": 5.938694208857688,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 364
},
{
"epoch": 1.65,
"grad_norm": 3.409013748608653,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 365
},
{
"epoch": 1.66,
"grad_norm": 4.4658543768768935,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 366
},
{
"epoch": 1.66,
"grad_norm": 3.371693408259016,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 367
},
{
"epoch": 1.67,
"grad_norm": 4.559788858289379,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 368
},
{
"epoch": 1.67,
"grad_norm": 3.628331869322542,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 369
},
{
"epoch": 1.67,
"grad_norm": 4.39455926251329,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 370
},
{
"epoch": 1.68,
"grad_norm": 3.853396943835766,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 371
},
{
"epoch": 1.68,
"grad_norm": 3.9765219333825494,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 372
},
{
"epoch": 1.69,
"grad_norm": 3.061152055123867,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 373
},
{
"epoch": 1.69,
"grad_norm": 4.618483012917297,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 374
},
{
"epoch": 1.7,
"grad_norm": 3.3260259398876393,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 375
},
{
"epoch": 1.7,
"grad_norm": 3.3538962131850387,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 376
},
{
"epoch": 1.71,
"grad_norm": 5.8575854324075864,
"learning_rate": 0.0001,
"loss": 2.2344,
"step": 377
},
{
"epoch": 1.71,
"grad_norm": 4.168971058411045,
"learning_rate": 0.0001,
"loss": 3.125,
"step": 378
},
{
"epoch": 1.71,
"grad_norm": 3.6920069784108116,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 379
},
{
"epoch": 1.72,
"grad_norm": 4.170712811496067,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 380
},
{
"epoch": 1.72,
"grad_norm": 4.793007744358583,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 381
},
{
"epoch": 1.73,
"grad_norm": 4.025355549587713,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 382
},
{
"epoch": 1.73,
"grad_norm": 3.4870718987013296,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 383
},
{
"epoch": 1.74,
"grad_norm": 3.3667477898409186,
"learning_rate": 0.0001,
"loss": 2.2969,
"step": 384
},
{
"epoch": 1.74,
"grad_norm": 3.7242254076840884,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 385
},
{
"epoch": 1.75,
"grad_norm": 3.1840545349531495,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 386
},
{
"epoch": 1.75,
"grad_norm": 4.823197200047613,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 387
},
{
"epoch": 1.76,
"grad_norm": 3.988386839545984,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 388
},
{
"epoch": 1.76,
"grad_norm": 5.057343299049697,
"learning_rate": 0.0001,
"loss": 2.5781,
"step": 389
},
{
"epoch": 1.76,
"grad_norm": 5.161242480912821,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 390
},
{
"epoch": 1.77,
"grad_norm": 4.480668734671967,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 391
},
{
"epoch": 1.77,
"grad_norm": 3.9365190921112907,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 392
},
{
"epoch": 1.78,
"grad_norm": 4.662495774692652,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 393
},
{
"epoch": 1.78,
"grad_norm": 3.5240918262495318,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 394
},
{
"epoch": 1.79,
"grad_norm": 4.526636196082364,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 395
},
{
"epoch": 1.79,
"grad_norm": 4.469697798668837,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 396
},
{
"epoch": 1.8,
"grad_norm": 5.4459922507695495,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 397
},
{
"epoch": 1.8,
"grad_norm": 3.236562942073936,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 398
},
{
"epoch": 1.81,
"grad_norm": 3.770016835883372,
"learning_rate": 0.0001,
"loss": 3.0156,
"step": 399
},
{
"epoch": 1.81,
"grad_norm": 3.9591980759103653,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 400
},
{
"epoch": 1.81,
"grad_norm": 7.860080427566192,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 401
},
{
"epoch": 1.82,
"grad_norm": 3.5735897118338418,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 402
},
{
"epoch": 1.82,
"grad_norm": 3.253545354981559,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 403
},
{
"epoch": 1.83,
"grad_norm": 3.079257326273105,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 404
},
{
"epoch": 1.83,
"grad_norm": 4.47108730912994,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 405
},
{
"epoch": 1.84,
"grad_norm": 4.921590616999781,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 406
},
{
"epoch": 1.84,
"grad_norm": 3.270607047698539,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 407
},
{
"epoch": 1.85,
"grad_norm": 4.207173315577373,
"learning_rate": 0.0001,
"loss": 2.4375,
"step": 408
},
{
"epoch": 1.85,
"grad_norm": 3.7871838368285267,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 409
},
{
"epoch": 1.86,
"grad_norm": 4.936529474935983,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 410
},
{
"epoch": 1.86,
"grad_norm": 3.8957693898330037,
"learning_rate": 0.0001,
"loss": 3.0469,
"step": 411
},
{
"epoch": 1.86,
"grad_norm": 4.960143029053001,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 412
},
{
"epoch": 1.87,
"grad_norm": 4.03955454269378,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 413
},
{
"epoch": 1.87,
"grad_norm": 8.311598719730528,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 414
},
{
"epoch": 1.88,
"grad_norm": 4.078913705129825,
"learning_rate": 0.0001,
"loss": 2.375,
"step": 415
},
{
"epoch": 1.88,
"grad_norm": 3.2725183823603006,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 416
},
{
"epoch": 1.89,
"grad_norm": 5.107210559438947,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 417
},
{
"epoch": 1.89,
"grad_norm": 4.293656342570625,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 418
},
{
"epoch": 1.9,
"grad_norm": 3.5007632189651408,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 419
},
{
"epoch": 1.9,
"grad_norm": 2.870288914226923,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 420
},
{
"epoch": 1.9,
"grad_norm": 4.013292523806265,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 421
},
{
"epoch": 1.91,
"grad_norm": 2.944258225094142,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 422
},
{
"epoch": 1.91,
"grad_norm": 5.0008572219675,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 423
},
{
"epoch": 1.92,
"grad_norm": 3.4668064948996435,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 424
},
{
"epoch": 1.92,
"grad_norm": 3.282414339798479,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 425
},
{
"epoch": 1.93,
"grad_norm": 4.3660030510317736,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 426
},
{
"epoch": 1.93,
"grad_norm": 4.277759844521119,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 427
},
{
"epoch": 1.94,
"grad_norm": 6.495737721951466,
"learning_rate": 0.0001,
"loss": 2.5469,
"step": 428
},
{
"epoch": 1.94,
"grad_norm": 3.6749746067665052,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 429
},
{
"epoch": 1.95,
"grad_norm": 3.8285972640178647,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 430
},
{
"epoch": 1.95,
"grad_norm": 4.329624774415669,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 431
},
{
"epoch": 1.95,
"grad_norm": 3.5664735497666173,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 432
},
{
"epoch": 1.96,
"grad_norm": 3.814296312027705,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 433
},
{
"epoch": 1.96,
"grad_norm": 3.255543154666781,
"learning_rate": 0.0001,
"loss": 3.125,
"step": 434
},
{
"epoch": 1.97,
"grad_norm": 6.098448774362813,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 435
},
{
"epoch": 1.97,
"grad_norm": 4.594773365704486,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 436
},
{
"epoch": 1.98,
"grad_norm": 4.256945975023405,
"learning_rate": 0.0001,
"loss": 2.5781,
"step": 437
},
{
"epoch": 1.98,
"grad_norm": 3.45198248082701,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 438
},
{
"epoch": 1.99,
"grad_norm": 4.645343343544797,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 439
},
{
"epoch": 1.99,
"grad_norm": 2.8494264914506013,
"learning_rate": 0.0001,
"loss": 2.4531,
"step": 440
},
{
"epoch": 2.0,
"grad_norm": 3.2371106907573353,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 441
},
{
"epoch": 2.0,
"grad_norm": 4.351841237094819,
"learning_rate": 0.0001,
"loss": 2.5469,
"step": 442
},
{
"epoch": 2.0,
"grad_norm": 4.885380513377714,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 443
},
{
"epoch": 2.01,
"grad_norm": 4.141226131557114,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 444
},
{
"epoch": 2.01,
"grad_norm": 3.571384859469851,
"learning_rate": 0.0001,
"loss": 3.2031,
"step": 445
},
{
"epoch": 2.02,
"grad_norm": 4.143653406347033,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 446
},
{
"epoch": 2.02,
"grad_norm": 3.376258580527233,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 447
},
{
"epoch": 2.03,
"grad_norm": 5.041611047731054,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 448
},
{
"epoch": 2.03,
"grad_norm": 2.7510995579910897,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 449
},
{
"epoch": 2.04,
"grad_norm": 4.794901656709599,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 450
},
{
"epoch": 2.04,
"grad_norm": 3.2071039166350794,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 451
},
{
"epoch": 2.05,
"grad_norm": 3.6160535680470733,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 452
},
{
"epoch": 2.05,
"grad_norm": 3.4951696700796053,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 453
},
{
"epoch": 2.05,
"grad_norm": 4.052957529710831,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 454
},
{
"epoch": 2.06,
"grad_norm": 4.5548596168461035,
"learning_rate": 0.0001,
"loss": 3.125,
"step": 455
},
{
"epoch": 2.06,
"grad_norm": 4.692304645656616,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 456
},
{
"epoch": 2.07,
"grad_norm": 4.251726697426816,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 457
},
{
"epoch": 2.07,
"grad_norm": 5.055218343110248,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 458
},
{
"epoch": 2.08,
"grad_norm": 4.0514218848787475,
"learning_rate": 0.0001,
"loss": 2.4062,
"step": 459
},
{
"epoch": 2.08,
"grad_norm": 5.471343199254358,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 460
},
{
"epoch": 2.09,
"grad_norm": 4.364540022104307,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 461
},
{
"epoch": 2.09,
"grad_norm": 4.347289897028965,
"learning_rate": 0.0001,
"loss": 2.375,
"step": 462
},
{
"epoch": 2.1,
"grad_norm": 3.320864844261629,
"learning_rate": 0.0001,
"loss": 2.1719,
"step": 463
},
{
"epoch": 2.1,
"grad_norm": 4.277320044723722,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 464
},
{
"epoch": 2.1,
"grad_norm": 5.2453592445245745,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 465
},
{
"epoch": 2.11,
"grad_norm": 5.520158779920925,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 466
},
{
"epoch": 2.11,
"grad_norm": 5.09919550518767,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 467
},
{
"epoch": 2.12,
"grad_norm": 5.7882890121116075,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 468
},
{
"epoch": 2.12,
"grad_norm": 4.166784108590436,
"learning_rate": 0.0001,
"loss": 2.3906,
"step": 469
},
{
"epoch": 2.13,
"grad_norm": 5.532799561595931,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 470
},
{
"epoch": 2.13,
"grad_norm": 4.199106278412939,
"learning_rate": 0.0001,
"loss": 2.4531,
"step": 471
},
{
"epoch": 2.14,
"grad_norm": 5.148294228201523,
"learning_rate": 0.0001,
"loss": 2.4844,
"step": 472
},
{
"epoch": 2.14,
"grad_norm": 6.06473338860914,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 473
},
{
"epoch": 2.14,
"grad_norm": 4.4020944879221435,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 474
},
{
"epoch": 2.15,
"grad_norm": 6.37168824745839,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 475
},
{
"epoch": 2.15,
"grad_norm": 6.2089065293516335,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 476
},
{
"epoch": 2.16,
"grad_norm": 5.9677659863997174,
"learning_rate": 0.0001,
"loss": 2.3281,
"step": 477
},
{
"epoch": 2.16,
"grad_norm": 4.535209473651245,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 478
},
{
"epoch": 2.17,
"grad_norm": 4.758021060904884,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 479
},
{
"epoch": 2.17,
"grad_norm": 4.6744206823178605,
"learning_rate": 0.0001,
"loss": 2.2969,
"step": 480
},
{
"epoch": 2.18,
"grad_norm": 4.748502122462707,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 481
},
{
"epoch": 2.18,
"grad_norm": 3.8572636341399447,
"learning_rate": 0.0001,
"loss": 2.4688,
"step": 482
},
{
"epoch": 2.19,
"grad_norm": 4.399471923346186,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 483
},
{
"epoch": 2.19,
"grad_norm": 3.646671632648375,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 484
},
{
"epoch": 2.19,
"grad_norm": 4.00567569136169,
"learning_rate": 0.0001,
"loss": 3.1562,
"step": 485
},
{
"epoch": 2.2,
"grad_norm": 3.65464670971521,
"learning_rate": 0.0001,
"loss": 2.3906,
"step": 486
},
{
"epoch": 2.2,
"grad_norm": 3.91732048961442,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 487
},
{
"epoch": 2.21,
"grad_norm": 5.317268932471553,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 488
},
{
"epoch": 2.21,
"grad_norm": 5.341302649630214,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 489
},
{
"epoch": 2.22,
"grad_norm": 4.383122101856098,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 490
},
{
"epoch": 2.22,
"grad_norm": 3.8860693666995063,
"learning_rate": 0.0001,
"loss": 2.4844,
"step": 491
},
{
"epoch": 2.23,
"grad_norm": 4.364787342162316,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 492
},
{
"epoch": 2.23,
"grad_norm": 4.470798240244141,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 493
},
{
"epoch": 2.24,
"grad_norm": 6.104282384032258,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 494
},
{
"epoch": 2.24,
"grad_norm": 5.006875450612742,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 495
},
{
"epoch": 2.24,
"grad_norm": 5.331678950721802,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 496
},
{
"epoch": 2.25,
"grad_norm": 4.04281058614812,
"learning_rate": 0.0001,
"loss": 2.3594,
"step": 497
},
{
"epoch": 2.25,
"grad_norm": 5.345612164193795,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 498
},
{
"epoch": 2.26,
"grad_norm": 5.341596960250008,
"learning_rate": 0.0001,
"loss": 2.3438,
"step": 499
},
{
"epoch": 2.26,
"grad_norm": 5.739876231264038,
"learning_rate": 0.0001,
"loss": 2.4062,
"step": 500
},
{
"epoch": 2.27,
"grad_norm": 5.142026268720496,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 501
},
{
"epoch": 2.27,
"grad_norm": 4.15681316206977,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 502
},
{
"epoch": 2.28,
"grad_norm": 4.817970447202398,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 503
},
{
"epoch": 2.28,
"grad_norm": 8.308610641023103,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 504
},
{
"epoch": 2.29,
"grad_norm": 5.00252311689909,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 505
},
{
"epoch": 2.29,
"grad_norm": 4.380395189338173,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 506
},
{
"epoch": 2.29,
"grad_norm": 4.670956184899664,
"learning_rate": 0.0001,
"loss": 3.1719,
"step": 507
},
{
"epoch": 2.3,
"grad_norm": 4.621499756596695,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 508
},
{
"epoch": 2.3,
"grad_norm": 6.29671870323029,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 509
},
{
"epoch": 2.31,
"grad_norm": 4.932336285101389,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 510
},
{
"epoch": 2.31,
"grad_norm": 4.3245121078513655,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 511
},
{
"epoch": 2.32,
"grad_norm": 7.030111858054767,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 512
},
{
"epoch": 2.32,
"grad_norm": 4.89566917127695,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 513
},
{
"epoch": 2.33,
"grad_norm": 4.195486069953785,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 514
},
{
"epoch": 2.33,
"grad_norm": 5.136743915157165,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 515
},
{
"epoch": 2.33,
"grad_norm": 3.948875651747143,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 516
},
{
"epoch": 2.34,
"grad_norm": 3.231795023359959,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 517
},
{
"epoch": 2.34,
"grad_norm": 4.9551613025707475,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 518
},
{
"epoch": 2.35,
"grad_norm": 5.865415887327459,
"learning_rate": 0.0001,
"loss": 2.4688,
"step": 519
},
{
"epoch": 2.35,
"grad_norm": 4.7737465818996885,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 520
},
{
"epoch": 2.36,
"grad_norm": 6.733184400927877,
"learning_rate": 0.0001,
"loss": 2.125,
"step": 521
},
{
"epoch": 2.36,
"grad_norm": 5.290951724234347,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 522
},
{
"epoch": 2.37,
"grad_norm": 4.585822054902166,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 523
},
{
"epoch": 2.37,
"grad_norm": 5.845655683014917,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 524
},
{
"epoch": 2.38,
"grad_norm": 4.058548342806645,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 525
},
{
"epoch": 2.38,
"grad_norm": 4.730863583042205,
"learning_rate": 0.0001,
"loss": 2.3125,
"step": 526
},
{
"epoch": 2.38,
"grad_norm": 3.9460443083159125,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 527
},
{
"epoch": 2.39,
"grad_norm": 5.985763082632476,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 528
},
{
"epoch": 2.39,
"grad_norm": 4.034177525824554,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 529
},
{
"epoch": 2.4,
"grad_norm": 6.0743244504041,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 530
},
{
"epoch": 2.4,
"grad_norm": 6.144275879007958,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 531
},
{
"epoch": 2.41,
"grad_norm": 5.208959046160335,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 532
},
{
"epoch": 2.41,
"grad_norm": 4.505033267544332,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 533
},
{
"epoch": 2.42,
"grad_norm": 7.682902684774122,
"learning_rate": 0.0001,
"loss": 2.3594,
"step": 534
},
{
"epoch": 2.42,
"grad_norm": 5.4360470024869905,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 535
},
{
"epoch": 2.43,
"grad_norm": 5.211817151957024,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 536
},
{
"epoch": 2.43,
"grad_norm": 4.959284674849101,
"learning_rate": 0.0001,
"loss": 2.4688,
"step": 537
},
{
"epoch": 2.43,
"grad_norm": 5.4968430538522215,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 538
},
{
"epoch": 2.44,
"grad_norm": 4.1351746678748755,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 539
},
{
"epoch": 2.44,
"grad_norm": 5.320761577416053,
"learning_rate": 0.0001,
"loss": 3.1406,
"step": 540
},
{
"epoch": 2.45,
"grad_norm": 6.655636255576953,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 541
},
{
"epoch": 2.45,
"grad_norm": 4.458839149948617,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 542
},
{
"epoch": 2.46,
"grad_norm": 4.972007553913938,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 543
},
{
"epoch": 2.46,
"grad_norm": 3.5505879907694755,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 544
},
{
"epoch": 2.47,
"grad_norm": 5.0186604630551255,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 545
},
{
"epoch": 2.47,
"grad_norm": 4.717499667281885,
"learning_rate": 0.0001,
"loss": 3.0938,
"step": 546
},
{
"epoch": 2.48,
"grad_norm": 4.38634574908589,
"learning_rate": 0.0001,
"loss": 2.3125,
"step": 547
},
{
"epoch": 2.48,
"grad_norm": 5.633061705506686,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 548
},
{
"epoch": 2.48,
"grad_norm": 5.028007311949011,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 549
},
{
"epoch": 2.49,
"grad_norm": 6.111968054796198,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 550
},
{
"epoch": 2.49,
"grad_norm": 4.563137449239842,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 551
},
{
"epoch": 2.5,
"grad_norm": 4.540680149103331,
"learning_rate": 0.0001,
"loss": 2.9844,
"step": 552
},
{
"epoch": 2.5,
"grad_norm": 7.279079756719769,
"learning_rate": 0.0001,
"loss": 2.4375,
"step": 553
},
{
"epoch": 2.51,
"grad_norm": 3.9211600361134695,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 554
},
{
"epoch": 2.51,
"grad_norm": 4.376188709334436,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 555
},
{
"epoch": 2.52,
"grad_norm": 4.7536996299053405,
"learning_rate": 0.0001,
"loss": 1.9062,
"step": 556
},
{
"epoch": 2.52,
"grad_norm": 4.730154548981628,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 557
},
{
"epoch": 2.52,
"grad_norm": 4.483638601741794,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 558
},
{
"epoch": 2.53,
"grad_norm": 5.017903669312139,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 559
},
{
"epoch": 2.53,
"grad_norm": 4.497681177312601,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 560
},
{
"epoch": 2.54,
"grad_norm": 3.6938673030692635,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 561
},
{
"epoch": 2.54,
"grad_norm": 4.627269663966374,
"learning_rate": 0.0001,
"loss": 3.0156,
"step": 562
},
{
"epoch": 2.55,
"grad_norm": 6.028249115854368,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 563
},
{
"epoch": 2.55,
"grad_norm": 5.895270257080288,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 564
},
{
"epoch": 2.56,
"grad_norm": 5.782086264164467,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 565
},
{
"epoch": 2.56,
"grad_norm": 4.832572929404261,
"learning_rate": 0.0001,
"loss": 2.25,
"step": 566
},
{
"epoch": 2.57,
"grad_norm": 4.777919636172623,
"learning_rate": 0.0001,
"loss": 2.5469,
"step": 567
},
{
"epoch": 2.57,
"grad_norm": 5.844834596317805,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 568
},
{
"epoch": 2.57,
"grad_norm": 4.3943966123305165,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 569
},
{
"epoch": 2.58,
"grad_norm": 4.3493264028288285,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 570
},
{
"epoch": 2.58,
"grad_norm": 4.579129290042329,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 571
},
{
"epoch": 2.59,
"grad_norm": 5.106305124633349,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 572
},
{
"epoch": 2.59,
"grad_norm": 3.8121132157014146,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 573
},
{
"epoch": 2.6,
"grad_norm": 7.674035029680783,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 574
},
{
"epoch": 2.6,
"grad_norm": 4.0155468016806255,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 575
},
{
"epoch": 2.61,
"grad_norm": 4.535174372431354,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 576
},
{
"epoch": 2.61,
"grad_norm": 7.713765819686499,
"learning_rate": 0.0001,
"loss": 2.375,
"step": 577
},
{
"epoch": 2.62,
"grad_norm": 6.992348998956431,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 578
},
{
"epoch": 2.62,
"grad_norm": 5.366316115730502,
"learning_rate": 0.0001,
"loss": 3.2969,
"step": 579
},
{
"epoch": 2.62,
"grad_norm": 4.628717491988879,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 580
},
{
"epoch": 2.63,
"grad_norm": 3.736490314420957,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 581
},
{
"epoch": 2.63,
"grad_norm": 3.270159000979569,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 582
},
{
"epoch": 2.64,
"grad_norm": 4.209971095000819,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 583
},
{
"epoch": 2.64,
"grad_norm": 5.904260931835911,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 584
},
{
"epoch": 2.65,
"grad_norm": 4.56368296212557,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 585
},
{
"epoch": 2.65,
"grad_norm": 4.356143809551981,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 586
},
{
"epoch": 2.66,
"grad_norm": 5.192460484725346,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 587
},
{
"epoch": 2.66,
"grad_norm": 4.970184066855408,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 588
},
{
"epoch": 2.67,
"grad_norm": 3.668363909507462,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 589
},
{
"epoch": 2.67,
"grad_norm": 4.975508877974084,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 590
},
{
"epoch": 2.67,
"grad_norm": 4.426312606397656,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 591
},
{
"epoch": 2.68,
"grad_norm": 4.871684486582093,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 592
},
{
"epoch": 2.68,
"grad_norm": 3.8454122192166054,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 593
},
{
"epoch": 2.69,
"grad_norm": 6.046171861939742,
"learning_rate": 0.0001,
"loss": 2.5781,
"step": 594
},
{
"epoch": 2.69,
"grad_norm": 4.236762915206439,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 595
},
{
"epoch": 2.7,
"grad_norm": 4.754038548279519,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 596
},
{
"epoch": 2.7,
"grad_norm": 5.3018074744010235,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 597
},
{
"epoch": 2.71,
"grad_norm": 5.138289855563257,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 598
},
{
"epoch": 2.71,
"grad_norm": 5.478633422952733,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 599
},
{
"epoch": 2.71,
"grad_norm": 6.855786567331759,
"learning_rate": 0.0001,
"loss": 2.4844,
"step": 600
},
{
"epoch": 2.72,
"grad_norm": 4.265407093713752,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 601
},
{
"epoch": 2.72,
"grad_norm": 5.036701430071399,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 602
},
{
"epoch": 2.73,
"grad_norm": 4.849399765693369,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 603
},
{
"epoch": 2.73,
"grad_norm": 6.244463203398971,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 604
},
{
"epoch": 2.74,
"grad_norm": 4.911627851437594,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 605
},
{
"epoch": 2.74,
"grad_norm": 5.648660311489794,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 606
},
{
"epoch": 2.75,
"grad_norm": 5.510436851938899,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 607
},
{
"epoch": 2.75,
"grad_norm": 5.73305511033946,
"learning_rate": 0.0001,
"loss": 2.4062,
"step": 608
},
{
"epoch": 2.76,
"grad_norm": 5.035663640250028,
"learning_rate": 0.0001,
"loss": 2.375,
"step": 609
},
{
"epoch": 2.76,
"grad_norm": 5.706713259761806,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 610
},
{
"epoch": 2.76,
"grad_norm": 3.9388716131805177,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 611
},
{
"epoch": 2.77,
"grad_norm": 5.208282554862349,
"learning_rate": 0.0001,
"loss": 2.2656,
"step": 612
},
{
"epoch": 2.77,
"grad_norm": 4.030323203128877,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 613
},
{
"epoch": 2.78,
"grad_norm": 4.982573254410871,
"learning_rate": 0.0001,
"loss": 2.4062,
"step": 614
},
{
"epoch": 2.78,
"grad_norm": 4.282555681716531,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 615
},
{
"epoch": 2.79,
"grad_norm": 5.254295196214892,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 616
},
{
"epoch": 2.79,
"grad_norm": 6.428623089456714,
"learning_rate": 0.0001,
"loss": 2.4062,
"step": 617
},
{
"epoch": 2.8,
"grad_norm": 4.767141810934083,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 618
},
{
"epoch": 2.8,
"grad_norm": 5.419475658643534,
"learning_rate": 0.0001,
"loss": 3.125,
"step": 619
},
{
"epoch": 2.81,
"grad_norm": 5.472135282653432,
"learning_rate": 0.0001,
"loss": 3.0469,
"step": 620
},
{
"epoch": 2.81,
"grad_norm": 6.675110370195235,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 621
},
{
"epoch": 2.81,
"grad_norm": 4.366266038957683,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 622
},
{
"epoch": 2.82,
"grad_norm": 3.896695556817304,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 623
},
{
"epoch": 2.82,
"grad_norm": 6.3050055621586605,
"learning_rate": 0.0001,
"loss": 2.4844,
"step": 624
},
{
"epoch": 2.83,
"grad_norm": 4.491540640178512,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 625
},
{
"epoch": 2.83,
"grad_norm": 6.046498345815951,
"learning_rate": 0.0001,
"loss": 2.4844,
"step": 626
},
{
"epoch": 2.84,
"grad_norm": 6.403430130057227,
"learning_rate": 0.0001,
"loss": 3.0781,
"step": 627
},
{
"epoch": 2.84,
"grad_norm": 5.363921266764386,
"learning_rate": 0.0001,
"loss": 2.1875,
"step": 628
},
{
"epoch": 2.85,
"grad_norm": 3.827451350516171,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 629
},
{
"epoch": 2.85,
"grad_norm": 5.50903534823105,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 630
},
{
"epoch": 2.86,
"grad_norm": 4.175065236833921,
"learning_rate": 0.0001,
"loss": 2.4062,
"step": 631
},
{
"epoch": 2.86,
"grad_norm": 5.037111092373576,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 632
},
{
"epoch": 2.86,
"grad_norm": 4.6903714504924725,
"learning_rate": 0.0001,
"loss": 2.4531,
"step": 633
},
{
"epoch": 2.87,
"grad_norm": 5.433323796199888,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 634
},
{
"epoch": 2.87,
"grad_norm": 6.2352792612834,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 635
},
{
"epoch": 2.88,
"grad_norm": 5.2711186382571515,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 636
},
{
"epoch": 2.88,
"grad_norm": 7.882069923983431,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 637
},
{
"epoch": 2.89,
"grad_norm": 4.487399648438269,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 638
},
{
"epoch": 2.89,
"grad_norm": 4.582740219582226,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 639
},
{
"epoch": 2.9,
"grad_norm": 5.253149476772823,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 640
},
{
"epoch": 2.9,
"grad_norm": 4.717261080461104,
"learning_rate": 0.0001,
"loss": 2.4531,
"step": 641
},
{
"epoch": 2.9,
"grad_norm": 4.170314259960067,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 642
},
{
"epoch": 2.91,
"grad_norm": 4.9179394418307,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 643
},
{
"epoch": 2.91,
"grad_norm": 5.211586048763502,
"learning_rate": 0.0001,
"loss": 2.4375,
"step": 644
},
{
"epoch": 2.92,
"grad_norm": 4.138246002031018,
"learning_rate": 0.0001,
"loss": 2.2656,
"step": 645
},
{
"epoch": 2.92,
"grad_norm": 4.040019263183381,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 646
},
{
"epoch": 2.93,
"grad_norm": 5.016054162159075,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 647
},
{
"epoch": 2.93,
"grad_norm": 4.160869140788706,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 648
},
{
"epoch": 2.94,
"grad_norm": 4.319009327023701,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 649
},
{
"epoch": 2.94,
"grad_norm": 4.788179201751919,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 650
},
{
"epoch": 2.95,
"grad_norm": 4.081130439995906,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 651
},
{
"epoch": 2.95,
"grad_norm": 5.50642508525935,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 652
},
{
"epoch": 2.95,
"grad_norm": 5.585953512031478,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 653
},
{
"epoch": 2.96,
"grad_norm": 5.31807917503442,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 654
},
{
"epoch": 2.96,
"grad_norm": 3.59491039054475,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 655
},
{
"epoch": 2.97,
"grad_norm": 4.4689533270014135,
"learning_rate": 0.0001,
"loss": 3.2656,
"step": 656
},
{
"epoch": 2.97,
"grad_norm": 4.968753065799484,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 657
},
{
"epoch": 2.98,
"grad_norm": 5.528728807749279,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 658
},
{
"epoch": 2.98,
"grad_norm": 5.035149173943463,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 659
},
{
"epoch": 2.99,
"grad_norm": 5.1939238070569935,
"learning_rate": 0.0001,
"loss": 2.4688,
"step": 660
},
{
"epoch": 2.99,
"grad_norm": 7.036562828613889,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 661
},
{
"epoch": 3.0,
"grad_norm": 6.297262433586043,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 662
},
{
"epoch": 3.0,
"grad_norm": 5.58699604459073,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 663
},
{
"epoch": 3.0,
"grad_norm": 4.092479681857709,
"learning_rate": 0.0001,
"loss": 3.1094,
"step": 664
},
{
"epoch": 3.01,
"grad_norm": 5.358390139052102,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 665
},
{
"epoch": 3.01,
"grad_norm": 5.280630772569093,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 666
},
{
"epoch": 3.02,
"grad_norm": 4.491504971073838,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 667
},
{
"epoch": 3.02,
"grad_norm": 4.969900611469726,
"learning_rate": 0.0001,
"loss": 2.0312,
"step": 668
},
{
"epoch": 3.03,
"grad_norm": 5.26280232160558,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 669
},
{
"epoch": 3.03,
"grad_norm": 4.34447668238551,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 670
},
{
"epoch": 3.04,
"grad_norm": 4.233433870902845,
"learning_rate": 0.0001,
"loss": 2.1719,
"step": 671
},
{
"epoch": 3.04,
"grad_norm": 4.471039084021881,
"learning_rate": 0.0001,
"loss": 2.2969,
"step": 672
},
{
"epoch": 3.05,
"grad_norm": 5.759691256699786,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 673
},
{
"epoch": 3.05,
"grad_norm": 5.222531825557149,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 674
},
{
"epoch": 3.05,
"grad_norm": 4.719114936711103,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 675
},
{
"epoch": 3.06,
"grad_norm": 5.322770709604783,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 676
},
{
"epoch": 3.06,
"grad_norm": 4.411446154538555,
"learning_rate": 0.0001,
"loss": 2.4688,
"step": 677
},
{
"epoch": 3.07,
"grad_norm": 4.64030284036272,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 678
},
{
"epoch": 3.07,
"grad_norm": 4.138101002033864,
"learning_rate": 0.0001,
"loss": 2.4375,
"step": 679
},
{
"epoch": 3.08,
"grad_norm": 5.086493549779005,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 680
},
{
"epoch": 3.08,
"grad_norm": 3.930014385181362,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 681
},
{
"epoch": 3.09,
"grad_norm": 4.729385169407767,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 682
},
{
"epoch": 3.09,
"grad_norm": 3.9189890006972665,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 683
},
{
"epoch": 3.1,
"grad_norm": 5.903640555457615,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 684
},
{
"epoch": 3.1,
"grad_norm": 6.904659388771962,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 685
},
{
"epoch": 3.1,
"grad_norm": 8.989355711263677,
"learning_rate": 0.0001,
"loss": 2.0,
"step": 686
},
{
"epoch": 3.11,
"grad_norm": 4.7695501889288145,
"learning_rate": 0.0001,
"loss": 2.4375,
"step": 687
},
{
"epoch": 3.11,
"grad_norm": 5.69647405964174,
"learning_rate": 0.0001,
"loss": 2.5469,
"step": 688
},
{
"epoch": 3.12,
"grad_norm": 6.100773782892059,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 689
},
{
"epoch": 3.12,
"grad_norm": 5.686663254617055,
"learning_rate": 0.0001,
"loss": 2.0469,
"step": 690
},
{
"epoch": 3.13,
"grad_norm": 6.571330530953064,
"learning_rate": 0.0001,
"loss": 2.5781,
"step": 691
},
{
"epoch": 3.13,
"grad_norm": 5.718821223772928,
"learning_rate": 0.0001,
"loss": 2.4062,
"step": 692
},
{
"epoch": 3.14,
"grad_norm": 4.373723630863873,
"learning_rate": 0.0001,
"loss": 2.4688,
"step": 693
},
{
"epoch": 3.14,
"grad_norm": 5.085229478433662,
"learning_rate": 0.0001,
"loss": 2.8594,
"step": 694
},
{
"epoch": 3.14,
"grad_norm": 7.021754317383601,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 695
},
{
"epoch": 3.15,
"grad_norm": 6.04687595749246,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 696
},
{
"epoch": 3.15,
"grad_norm": 6.788165163318837,
"learning_rate": 0.0001,
"loss": 3.0469,
"step": 697
},
{
"epoch": 3.16,
"grad_norm": 5.561311682085884,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 698
},
{
"epoch": 3.16,
"grad_norm": 4.608936689327367,
"learning_rate": 0.0001,
"loss": 2.4219,
"step": 699
},
{
"epoch": 3.17,
"grad_norm": 5.968780759931178,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 700
},
{
"epoch": 3.17,
"grad_norm": 5.103044298583068,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 701
},
{
"epoch": 3.18,
"grad_norm": 6.45236411621874,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 702
},
{
"epoch": 3.18,
"grad_norm": 4.8281182531637095,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 703
},
{
"epoch": 3.19,
"grad_norm": 4.389882657051867,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 704
},
{
"epoch": 3.19,
"grad_norm": 4.962384851428117,
"learning_rate": 0.0001,
"loss": 2.4688,
"step": 705
},
{
"epoch": 3.19,
"grad_norm": 4.982045946774825,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 706
},
{
"epoch": 3.2,
"grad_norm": 6.41107777745504,
"learning_rate": 0.0001,
"loss": 2.1406,
"step": 707
},
{
"epoch": 3.2,
"grad_norm": 5.090681442306407,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 708
},
{
"epoch": 3.21,
"grad_norm": 5.165328647890187,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 709
},
{
"epoch": 3.21,
"grad_norm": 4.705626239369721,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 710
},
{
"epoch": 3.22,
"grad_norm": 7.677966149358577,
"learning_rate": 0.0001,
"loss": 2.3281,
"step": 711
},
{
"epoch": 3.22,
"grad_norm": 5.659659609102272,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 712
},
{
"epoch": 3.23,
"grad_norm": 4.766715309460445,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 713
},
{
"epoch": 3.23,
"grad_norm": 5.570413898672718,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 714
},
{
"epoch": 3.24,
"grad_norm": 6.482364017731351,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 715
},
{
"epoch": 3.24,
"grad_norm": 7.676974890118262,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 716
},
{
"epoch": 3.24,
"grad_norm": 7.269455563092696,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 717
},
{
"epoch": 3.25,
"grad_norm": 7.4750489121354695,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 718
},
{
"epoch": 3.25,
"grad_norm": 5.096852230759679,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 719
},
{
"epoch": 3.26,
"grad_norm": 9.551421687804485,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 720
},
{
"epoch": 3.26,
"grad_norm": 8.308410042464198,
"learning_rate": 0.0001,
"loss": 2.3438,
"step": 721
},
{
"epoch": 3.27,
"grad_norm": 7.249812504415762,
"learning_rate": 0.0001,
"loss": 2.4688,
"step": 722
},
{
"epoch": 3.27,
"grad_norm": 5.625497642517863,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 723
},
{
"epoch": 3.28,
"grad_norm": 5.528162218971885,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 724
},
{
"epoch": 3.28,
"grad_norm": 5.97119760259338,
"learning_rate": 0.0001,
"loss": 2.5781,
"step": 725
},
{
"epoch": 3.29,
"grad_norm": 5.0051682876370736,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 726
},
{
"epoch": 3.29,
"grad_norm": 6.153680263773301,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 727
},
{
"epoch": 3.29,
"grad_norm": 12.408064278903167,
"learning_rate": 0.0001,
"loss": 2.2188,
"step": 728
},
{
"epoch": 3.3,
"grad_norm": 6.967676512014587,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 729
},
{
"epoch": 3.3,
"grad_norm": 5.596141752113706,
"learning_rate": 0.0001,
"loss": 2.375,
"step": 730
},
{
"epoch": 3.31,
"grad_norm": 6.15457256616309,
"learning_rate": 0.0001,
"loss": 2.4375,
"step": 731
},
{
"epoch": 3.31,
"grad_norm": 5.7616559342316584,
"learning_rate": 0.0001,
"loss": 2.4844,
"step": 732
},
{
"epoch": 3.32,
"grad_norm": 5.9891271652349936,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 733
},
{
"epoch": 3.32,
"grad_norm": 6.652398069643558,
"learning_rate": 0.0001,
"loss": 2.5781,
"step": 734
},
{
"epoch": 3.33,
"grad_norm": 6.356574056404722,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 735
},
{
"epoch": 3.33,
"grad_norm": 4.593278190996901,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 736
},
{
"epoch": 3.33,
"grad_norm": 6.0605354589139635,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 737
},
{
"epoch": 3.34,
"grad_norm": 5.70706586433816,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 738
},
{
"epoch": 3.34,
"grad_norm": 4.893608346712496,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 739
},
{
"epoch": 3.35,
"grad_norm": 6.088522365219184,
"learning_rate": 0.0001,
"loss": 2.4375,
"step": 740
},
{
"epoch": 3.35,
"grad_norm": 5.431869278014897,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 741
},
{
"epoch": 3.36,
"grad_norm": 5.596003857292665,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 742
},
{
"epoch": 3.36,
"grad_norm": 5.97698909669504,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 743
},
{
"epoch": 3.37,
"grad_norm": 5.805355147115389,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 744
},
{
"epoch": 3.37,
"grad_norm": 5.6494434657268515,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 745
},
{
"epoch": 3.38,
"grad_norm": 6.6389660268244945,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 746
},
{
"epoch": 3.38,
"grad_norm": 5.25191874182185,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 747
},
{
"epoch": 3.38,
"grad_norm": 4.828238204533096,
"learning_rate": 0.0001,
"loss": 2.3281,
"step": 748
},
{
"epoch": 3.39,
"grad_norm": 5.71616111332261,
"learning_rate": 0.0001,
"loss": 2.2656,
"step": 749
},
{
"epoch": 3.39,
"grad_norm": 5.33198279309472,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 750
},
{
"epoch": 3.4,
"grad_norm": 6.676847054708395,
"learning_rate": 0.0001,
"loss": 2.4531,
"step": 751
},
{
"epoch": 3.4,
"grad_norm": 6.987831720140158,
"learning_rate": 0.0001,
"loss": 2.4375,
"step": 752
},
{
"epoch": 3.41,
"grad_norm": 5.246383776912542,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 753
},
{
"epoch": 3.41,
"grad_norm": 6.2816140273248235,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 754
},
{
"epoch": 3.42,
"grad_norm": 7.924891850428938,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 755
},
{
"epoch": 3.42,
"grad_norm": 6.172178443029792,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 756
},
{
"epoch": 3.43,
"grad_norm": 4.599788973324431,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 757
},
{
"epoch": 3.43,
"grad_norm": 4.9529853584549075,
"learning_rate": 0.0001,
"loss": 2.2656,
"step": 758
},
{
"epoch": 3.43,
"grad_norm": 6.319742548596082,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 759
},
{
"epoch": 3.44,
"grad_norm": 7.205622577272047,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 760
},
{
"epoch": 3.44,
"grad_norm": 9.18574662601195,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 761
},
{
"epoch": 3.45,
"grad_norm": 6.956078746670551,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 762
},
{
"epoch": 3.45,
"grad_norm": 5.0104025328964115,
"learning_rate": 0.0001,
"loss": 2.2656,
"step": 763
},
{
"epoch": 3.46,
"grad_norm": 5.418890629156982,
"learning_rate": 0.0001,
"loss": 2.375,
"step": 764
},
{
"epoch": 3.46,
"grad_norm": 5.404410173821834,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 765
},
{
"epoch": 3.47,
"grad_norm": 6.568137897267971,
"learning_rate": 0.0001,
"loss": 2.1719,
"step": 766
},
{
"epoch": 3.47,
"grad_norm": 5.578470702623748,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 767
},
{
"epoch": 3.48,
"grad_norm": 6.235890736509007,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 768
},
{
"epoch": 3.48,
"grad_norm": 5.114739271673889,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 769
},
{
"epoch": 3.48,
"grad_norm": 5.631412619020986,
"learning_rate": 0.0001,
"loss": 2.375,
"step": 770
},
{
"epoch": 3.49,
"grad_norm": 6.0619101459204305,
"learning_rate": 0.0001,
"loss": 2.2344,
"step": 771
},
{
"epoch": 3.49,
"grad_norm": 6.715082841558436,
"learning_rate": 0.0001,
"loss": 2.3125,
"step": 772
},
{
"epoch": 3.5,
"grad_norm": 7.162414120906969,
"learning_rate": 0.0001,
"loss": 2.1406,
"step": 773
},
{
"epoch": 3.5,
"grad_norm": 6.361814355074576,
"learning_rate": 0.0001,
"loss": 2.375,
"step": 774
},
{
"epoch": 3.51,
"grad_norm": 5.233411385933616,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 775
},
{
"epoch": 3.51,
"grad_norm": 5.773944915421188,
"learning_rate": 0.0001,
"loss": 2.6094,
"step": 776
},
{
"epoch": 3.52,
"grad_norm": 6.980243552018795,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 777
},
{
"epoch": 3.52,
"grad_norm": 5.40460163775666,
"learning_rate": 0.0001,
"loss": 2.9688,
"step": 778
},
{
"epoch": 3.52,
"grad_norm": 8.48311753833824,
"learning_rate": 0.0001,
"loss": 2.2031,
"step": 779
},
{
"epoch": 3.53,
"grad_norm": 5.876697470240219,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 780
},
{
"epoch": 3.53,
"grad_norm": 6.706613026904167,
"learning_rate": 0.0001,
"loss": 3.1875,
"step": 781
},
{
"epoch": 3.54,
"grad_norm": 7.2764400516796846,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 782
},
{
"epoch": 3.54,
"grad_norm": 6.094306397049338,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 783
},
{
"epoch": 3.55,
"grad_norm": 6.278005609224948,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 784
},
{
"epoch": 3.55,
"grad_norm": 5.3086920461585185,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 785
},
{
"epoch": 3.56,
"grad_norm": 5.684767409346208,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 786
},
{
"epoch": 3.56,
"grad_norm": 5.9289557000568704,
"learning_rate": 0.0001,
"loss": 2.6875,
"step": 787
},
{
"epoch": 3.57,
"grad_norm": 5.5746989859790945,
"learning_rate": 0.0001,
"loss": 2.1719,
"step": 788
},
{
"epoch": 3.57,
"grad_norm": 9.301668372300803,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 789
},
{
"epoch": 3.57,
"grad_norm": 6.513123479044599,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 790
},
{
"epoch": 3.58,
"grad_norm": 4.777033577512223,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 791
},
{
"epoch": 3.58,
"grad_norm": 4.457114397241286,
"learning_rate": 0.0001,
"loss": 3.0,
"step": 792
},
{
"epoch": 3.59,
"grad_norm": 6.689527406548995,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 793
},
{
"epoch": 3.59,
"grad_norm": 6.983770238015634,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 794
},
{
"epoch": 3.6,
"grad_norm": 6.802187681181655,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 795
},
{
"epoch": 3.6,
"grad_norm": 7.033033169580362,
"learning_rate": 0.0001,
"loss": 2.3594,
"step": 796
},
{
"epoch": 3.61,
"grad_norm": 6.26808782701995,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 797
},
{
"epoch": 3.61,
"grad_norm": 6.005513689302465,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 798
},
{
"epoch": 3.62,
"grad_norm": 7.767577684581086,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 799
},
{
"epoch": 3.62,
"grad_norm": 8.120626893479068,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 800
},
{
"epoch": 3.62,
"grad_norm": 7.580438309650423,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 801
},
{
"epoch": 3.63,
"grad_norm": 6.601396559848888,
"learning_rate": 0.0001,
"loss": 3.2031,
"step": 802
},
{
"epoch": 3.63,
"grad_norm": 5.589941827621371,
"learning_rate": 0.0001,
"loss": 2.7656,
"step": 803
},
{
"epoch": 3.64,
"grad_norm": 7.904399035942282,
"learning_rate": 0.0001,
"loss": 2.4531,
"step": 804
},
{
"epoch": 3.64,
"grad_norm": 4.281379429844708,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 805
},
{
"epoch": 3.65,
"grad_norm": 6.356739367521081,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 806
},
{
"epoch": 3.65,
"grad_norm": 6.96355742617709,
"learning_rate": 0.0001,
"loss": 2.5469,
"step": 807
},
{
"epoch": 3.66,
"grad_norm": 7.726157531308813,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 808
},
{
"epoch": 3.66,
"grad_norm": 4.480569716014928,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 809
},
{
"epoch": 3.67,
"grad_norm": 5.8390738745875055,
"learning_rate": 0.0001,
"loss": 3.0312,
"step": 810
},
{
"epoch": 3.67,
"grad_norm": 6.615258635722885,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 811
},
{
"epoch": 3.67,
"grad_norm": 7.391480221820631,
"learning_rate": 0.0001,
"loss": 2.2812,
"step": 812
},
{
"epoch": 3.68,
"grad_norm": 5.191253582921654,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 813
},
{
"epoch": 3.68,
"grad_norm": 5.4670864088124045,
"learning_rate": 0.0001,
"loss": 3.125,
"step": 814
},
{
"epoch": 3.69,
"grad_norm": 5.624287979287397,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 815
},
{
"epoch": 3.69,
"grad_norm": 6.313425088933223,
"learning_rate": 0.0001,
"loss": 2.5938,
"step": 816
},
{
"epoch": 3.7,
"grad_norm": 6.818221819002743,
"learning_rate": 0.0001,
"loss": 1.7891,
"step": 817
},
{
"epoch": 3.7,
"grad_norm": 6.464530097139393,
"learning_rate": 0.0001,
"loss": 2.75,
"step": 818
},
{
"epoch": 3.71,
"grad_norm": 5.69636118536784,
"learning_rate": 0.0001,
"loss": 2.7969,
"step": 819
},
{
"epoch": 3.71,
"grad_norm": 6.3130472400300635,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 820
},
{
"epoch": 3.71,
"grad_norm": 6.164694991173136,
"learning_rate": 0.0001,
"loss": 2.25,
"step": 821
},
{
"epoch": 3.72,
"grad_norm": 7.070295165751384,
"learning_rate": 0.0001,
"loss": 1.9844,
"step": 822
},
{
"epoch": 3.72,
"grad_norm": 5.154099348952669,
"learning_rate": 0.0001,
"loss": 2.9062,
"step": 823
},
{
"epoch": 3.73,
"grad_norm": 4.925492637519167,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 824
},
{
"epoch": 3.73,
"grad_norm": 6.390984018933043,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 825
},
{
"epoch": 3.74,
"grad_norm": 6.9138596085917285,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 826
},
{
"epoch": 3.74,
"grad_norm": 5.369883533158641,
"learning_rate": 0.0001,
"loss": 2.8125,
"step": 827
},
{
"epoch": 3.75,
"grad_norm": 7.456914473954453,
"learning_rate": 0.0001,
"loss": 2.7812,
"step": 828
},
{
"epoch": 3.75,
"grad_norm": 5.791370397982031,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 829
},
{
"epoch": 3.76,
"grad_norm": 4.473564456526789,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 830
},
{
"epoch": 3.76,
"grad_norm": 6.377472917090085,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 831
},
{
"epoch": 3.76,
"grad_norm": 7.061870345826034,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 832
},
{
"epoch": 3.77,
"grad_norm": 7.475056565446695,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 833
},
{
"epoch": 3.77,
"grad_norm": 5.943474193334696,
"learning_rate": 0.0001,
"loss": 2.0781,
"step": 834
},
{
"epoch": 3.78,
"grad_norm": 5.786165179983016,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 835
},
{
"epoch": 3.78,
"grad_norm": 6.540859565742491,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 836
},
{
"epoch": 3.79,
"grad_norm": 6.852178210980113,
"learning_rate": 0.0001,
"loss": 2.2969,
"step": 837
},
{
"epoch": 3.79,
"grad_norm": 6.427660937563824,
"learning_rate": 0.0001,
"loss": 3.0469,
"step": 838
},
{
"epoch": 3.8,
"grad_norm": 7.404832954994649,
"learning_rate": 0.0001,
"loss": 2.875,
"step": 839
},
{
"epoch": 3.8,
"grad_norm": 5.15547466947869,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 840
},
{
"epoch": 3.81,
"grad_norm": 9.284304247251512,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 841
},
{
"epoch": 3.81,
"grad_norm": 6.44211346051645,
"learning_rate": 0.0001,
"loss": 2.4062,
"step": 842
},
{
"epoch": 3.81,
"grad_norm": 7.407204225068616,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 843
},
{
"epoch": 3.82,
"grad_norm": 6.536030004163721,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 844
},
{
"epoch": 3.82,
"grad_norm": 5.664816059826595,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 845
},
{
"epoch": 3.83,
"grad_norm": 7.074285974063112,
"learning_rate": 0.0001,
"loss": 2.3281,
"step": 846
},
{
"epoch": 3.83,
"grad_norm": 4.930926666384628,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 847
},
{
"epoch": 3.84,
"grad_norm": 5.663530450660167,
"learning_rate": 0.0001,
"loss": 3.0625,
"step": 848
},
{
"epoch": 3.84,
"grad_norm": 5.635849522363945,
"learning_rate": 0.0001,
"loss": 2.9375,
"step": 849
},
{
"epoch": 3.85,
"grad_norm": 8.186385957751863,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 850
},
{
"epoch": 3.85,
"grad_norm": 5.9945008515729254,
"learning_rate": 0.0001,
"loss": 2.5312,
"step": 851
},
{
"epoch": 3.86,
"grad_norm": 5.472155827201553,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 852
},
{
"epoch": 3.86,
"grad_norm": 9.14877945827067,
"learning_rate": 0.0001,
"loss": 2.5781,
"step": 853
},
{
"epoch": 3.86,
"grad_norm": 6.100888417967896,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 854
},
{
"epoch": 3.87,
"grad_norm": 7.028201495974476,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 855
},
{
"epoch": 3.87,
"grad_norm": 7.158110848813972,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 856
},
{
"epoch": 3.88,
"grad_norm": 5.9746126891715985,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 857
},
{
"epoch": 3.88,
"grad_norm": 6.194895879605958,
"learning_rate": 0.0001,
"loss": 2.2812,
"step": 858
},
{
"epoch": 3.89,
"grad_norm": 5.7717568499039755,
"learning_rate": 0.0001,
"loss": 2.7031,
"step": 859
},
{
"epoch": 3.89,
"grad_norm": 7.816780702683526,
"learning_rate": 0.0001,
"loss": 2.8906,
"step": 860
},
{
"epoch": 3.9,
"grad_norm": 6.419624396463192,
"learning_rate": 0.0001,
"loss": 2.5625,
"step": 861
},
{
"epoch": 3.9,
"grad_norm": 7.185649890506834,
"learning_rate": 0.0001,
"loss": 2.8438,
"step": 862
},
{
"epoch": 3.9,
"grad_norm": 5.6908416343601935,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 863
},
{
"epoch": 3.91,
"grad_norm": 4.859486347190332,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 864
},
{
"epoch": 3.91,
"grad_norm": 6.285213197221704,
"learning_rate": 0.0001,
"loss": 2.625,
"step": 865
},
{
"epoch": 3.92,
"grad_norm": 6.738698450535726,
"learning_rate": 0.0001,
"loss": 2.9219,
"step": 866
},
{
"epoch": 3.92,
"grad_norm": 5.9454993899618875,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 867
},
{
"epoch": 3.93,
"grad_norm": 8.998641819830656,
"learning_rate": 0.0001,
"loss": 2.0625,
"step": 868
},
{
"epoch": 3.93,
"grad_norm": 7.536207240118511,
"learning_rate": 0.0001,
"loss": 2.5,
"step": 869
},
{
"epoch": 3.94,
"grad_norm": 5.971690600825191,
"learning_rate": 0.0001,
"loss": 2.6406,
"step": 870
},
{
"epoch": 3.94,
"grad_norm": 4.808198125624626,
"learning_rate": 0.0001,
"loss": 2.6719,
"step": 871
},
{
"epoch": 3.95,
"grad_norm": 5.456368344689776,
"learning_rate": 0.0001,
"loss": 2.3438,
"step": 872
},
{
"epoch": 3.95,
"grad_norm": 5.88624229180227,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 873
},
{
"epoch": 3.95,
"grad_norm": 6.792604392226326,
"learning_rate": 0.0001,
"loss": 2.9531,
"step": 874
},
{
"epoch": 3.96,
"grad_norm": 8.842989968926629,
"learning_rate": 0.0001,
"loss": 2.4219,
"step": 875
},
{
"epoch": 3.96,
"grad_norm": 6.812604406395962,
"learning_rate": 0.0001,
"loss": 2.3281,
"step": 876
},
{
"epoch": 3.97,
"grad_norm": 8.040123847392087,
"learning_rate": 0.0001,
"loss": 2.7188,
"step": 877
},
{
"epoch": 3.97,
"grad_norm": 5.413687121021301,
"learning_rate": 0.0001,
"loss": 2.8281,
"step": 878
},
{
"epoch": 3.98,
"grad_norm": 5.85591948084603,
"learning_rate": 0.0001,
"loss": 2.6562,
"step": 879
},
{
"epoch": 3.98,
"grad_norm": 7.731647984076021,
"learning_rate": 0.0001,
"loss": 2.2656,
"step": 880
},
{
"epoch": 3.99,
"grad_norm": 5.3911022010810346,
"learning_rate": 0.0001,
"loss": 2.5781,
"step": 881
},
{
"epoch": 3.99,
"grad_norm": 7.088453541216916,
"learning_rate": 0.0001,
"loss": 2.4688,
"step": 882
},
{
"epoch": 4.0,
"grad_norm": 6.873463198730782,
"learning_rate": 0.0001,
"loss": 2.7344,
"step": 883
},
{
"epoch": 4.0,
"grad_norm": 8.452510173369372,
"learning_rate": 0.0001,
"loss": 2.5156,
"step": 884
},
{
"epoch": 4.0,
"step": 884,
"total_flos": 17145224232960.0,
"train_loss": 2.7685502686651584,
"train_runtime": 2858.5803,
"train_samples_per_second": 2.47,
"train_steps_per_second": 0.309
}
],
"logging_steps": 1.0,
"max_steps": 884,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 500,
"total_flos": 17145224232960.0,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}