|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9977335435553804,
  "eval_steps": 500,
  "global_step": 40500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.012317698068584943,
      "grad_norm": 2.049720048904419,
      "learning_rate": 4.938534686637761e-05,
      "loss": 1.4868,
      "step": 500
    },
    {
      "epoch": 0.024635396137169887,
      "grad_norm": 1.9058018922805786,
      "learning_rate": 4.8769461962948364e-05,
      "loss": 1.1702,
      "step": 1000
    },
    {
      "epoch": 0.03695309420575483,
      "grad_norm": 1.4330534934997559,
      "learning_rate": 4.815357705951912e-05,
      "loss": 1.0684,
      "step": 1500
    },
    {
      "epoch": 0.049270792274339774,
      "grad_norm": 2.9333412647247314,
      "learning_rate": 4.753769215608987e-05,
      "loss": 1.0331,
      "step": 2000
    },
    {
      "epoch": 0.06158849034292471,
      "grad_norm": 1.674055576324463,
      "learning_rate": 4.6921807252660625e-05,
      "loss": 1.0139,
      "step": 2500
    },
    {
      "epoch": 0.07390618841150966,
      "grad_norm": 1.7064428329467773,
      "learning_rate": 4.630592234923138e-05,
      "loss": 0.987,
      "step": 3000
    },
    {
      "epoch": 0.0862238864800946,
      "grad_norm": 1.4718812704086304,
      "learning_rate": 4.569003744580213e-05,
      "loss": 0.9881,
      "step": 3500
    },
    {
      "epoch": 0.09854158454867955,
      "grad_norm": 1.326068639755249,
      "learning_rate": 4.507415254237288e-05,
      "loss": 0.9511,
      "step": 4000
    },
    {
      "epoch": 0.11085928261726448,
      "grad_norm": 1.724241852760315,
      "learning_rate": 4.4458267638943634e-05,
      "loss": 0.952,
      "step": 4500
    },
    {
      "epoch": 0.12317698068584942,
      "grad_norm": 0.20040610432624817,
      "learning_rate": 4.384361450532125e-05,
      "loss": 0.9575,
      "step": 5000
    },
    {
      "epoch": 0.13549467875443438,
      "grad_norm": 1.6827597618103027,
      "learning_rate": 4.3227729601892e-05,
      "loss": 0.9589,
      "step": 5500
    },
    {
      "epoch": 0.14781237682301931,
      "grad_norm": 0.9528223276138306,
      "learning_rate": 4.2611844698462756e-05,
      "loss": 0.9464,
      "step": 6000
    },
    {
      "epoch": 0.16013007489160425,
      "grad_norm": 1.1192400455474854,
      "learning_rate": 4.199595979503351e-05,
      "loss": 0.9395,
      "step": 6500
    },
    {
      "epoch": 0.1724477729601892,
      "grad_norm": 1.5803415775299072,
      "learning_rate": 4.138007489160426e-05,
      "loss": 0.9376,
      "step": 7000
    },
    {
      "epoch": 0.18476547102877414,
      "grad_norm": 1.0329970121383667,
      "learning_rate": 4.076542175798187e-05,
      "loss": 0.9247,
      "step": 7500
    },
    {
      "epoch": 0.1970831690973591,
      "grad_norm": 1.0181173086166382,
      "learning_rate": 4.014953685455262e-05,
      "loss": 0.9469,
      "step": 8000
    },
    {
      "epoch": 0.20940086716594403,
      "grad_norm": 0.837753176689148,
      "learning_rate": 3.953365195112337e-05,
      "loss": 0.9017,
      "step": 8500
    },
    {
      "epoch": 0.22171856523452896,
      "grad_norm": 1.228780746459961,
      "learning_rate": 3.891776704769413e-05,
      "loss": 0.9006,
      "step": 9000
    },
    {
      "epoch": 0.23403626330311392,
      "grad_norm": 1.6748690605163574,
      "learning_rate": 3.830188214426488e-05,
      "loss": 0.9312,
      "step": 9500
    },
    {
      "epoch": 0.24635396137169885,
      "grad_norm": 2.5065925121307373,
      "learning_rate": 3.7687229010642495e-05,
      "loss": 0.9102,
      "step": 10000
    },
    {
      "epoch": 0.2586716594402838,
      "grad_norm": 0.4429062008857727,
      "learning_rate": 3.70725758770201e-05,
      "loss": 0.9185,
      "step": 10500
    },
    {
      "epoch": 0.27098935750886877,
      "grad_norm": 0.8518178462982178,
      "learning_rate": 3.6456690973590856e-05,
      "loss": 0.9158,
      "step": 11000
    },
    {
      "epoch": 0.28330705557745367,
      "grad_norm": 1.952348232269287,
      "learning_rate": 3.584080607016161e-05,
      "loss": 0.8735,
      "step": 11500
    },
    {
      "epoch": 0.29562475364603863,
      "grad_norm": 0.9509502053260803,
      "learning_rate": 3.5224921166732364e-05,
      "loss": 0.8721,
      "step": 12000
    },
    {
      "epoch": 0.3079424517146236,
      "grad_norm": 0.9900819659233093,
      "learning_rate": 3.460903626330311e-05,
      "loss": 0.9015,
      "step": 12500
    },
    {
      "epoch": 0.3202601497832085,
      "grad_norm": 1.6706575155258179,
      "learning_rate": 3.399315135987387e-05,
      "loss": 0.9053,
      "step": 13000
    },
    {
      "epoch": 0.33257784785179345,
      "grad_norm": 2.245882272720337,
      "learning_rate": 3.3377266456444625e-05,
      "loss": 0.9008,
      "step": 13500
    },
    {
      "epoch": 0.3448955459203784,
      "grad_norm": 1.2105801105499268,
      "learning_rate": 3.276138155301537e-05,
      "loss": 0.8859,
      "step": 14000
    },
    {
      "epoch": 0.35721324398896337,
      "grad_norm": 1.136516809463501,
      "learning_rate": 3.214672841939299e-05,
      "loss": 0.8922,
      "step": 14500
    },
    {
      "epoch": 0.3695309420575483,
      "grad_norm": 1.4451056718826294,
      "learning_rate": 3.1530843515963734e-05,
      "loss": 0.8907,
      "step": 15000
    },
    {
      "epoch": 0.38184864012613323,
      "grad_norm": 0.8413623571395874,
      "learning_rate": 3.091495861253449e-05,
      "loss": 0.8929,
      "step": 15500
    },
    {
      "epoch": 0.3941663381947182,
      "grad_norm": 1.7711334228515625,
      "learning_rate": 3.029907370910524e-05,
      "loss": 0.8792,
      "step": 16000
    },
    {
      "epoch": 0.4064840362633031,
      "grad_norm": 1.3520532846450806,
      "learning_rate": 2.9683188805676e-05,
      "loss": 0.8809,
      "step": 16500
    },
    {
      "epoch": 0.41880173433188805,
      "grad_norm": 1.8900405168533325,
      "learning_rate": 2.906853567205361e-05,
      "loss": 0.8834,
      "step": 17000
    },
    {
      "epoch": 0.431119432400473,
      "grad_norm": 1.377387523651123,
      "learning_rate": 2.845265076862436e-05,
      "loss": 0.8902,
      "step": 17500
    },
    {
      "epoch": 0.4434371304690579,
      "grad_norm": 1.2132008075714111,
      "learning_rate": 2.7836765865195114e-05,
      "loss": 0.8607,
      "step": 18000
    },
    {
      "epoch": 0.4557548285376429,
      "grad_norm": 0.94814532995224,
      "learning_rate": 2.7220880961765865e-05,
      "loss": 0.88,
      "step": 18500
    },
    {
      "epoch": 0.46807252660622783,
      "grad_norm": 0.8629463315010071,
      "learning_rate": 2.6606227828143476e-05,
      "loss": 0.8609,
      "step": 19000
    },
    {
      "epoch": 0.4803902246748128,
      "grad_norm": 1.1995203495025635,
      "learning_rate": 2.599034292471423e-05,
      "loss": 0.8778,
      "step": 19500
    },
    {
      "epoch": 0.4927079227433977,
      "grad_norm": 1.2757550477981567,
      "learning_rate": 2.5374458021284987e-05,
      "loss": 0.853,
      "step": 20000
    },
    {
      "epoch": 0.5050256208119827,
      "grad_norm": 1.4950288534164429,
      "learning_rate": 2.4758573117855734e-05,
      "loss": 0.8688,
      "step": 20500
    },
    {
      "epoch": 0.5173433188805676,
      "grad_norm": 0.997870147228241,
      "learning_rate": 2.4143919984233348e-05,
      "loss": 0.8738,
      "step": 21000
    },
    {
      "epoch": 0.5296610169491526,
      "grad_norm": 1.2419720888137817,
      "learning_rate": 2.3528035080804102e-05,
      "loss": 0.8929,
      "step": 21500
    },
    {
      "epoch": 0.5419787150177375,
      "grad_norm": 1.197637915611267,
      "learning_rate": 2.2912150177374853e-05,
      "loss": 0.8848,
      "step": 22000
    },
    {
      "epoch": 0.5542964130863224,
      "grad_norm": 1.501338005065918,
      "learning_rate": 2.2296265273945606e-05,
      "loss": 0.8543,
      "step": 22500
    },
    {
      "epoch": 0.5666141111549073,
      "grad_norm": 1.2377902269363403,
      "learning_rate": 2.1681612140323217e-05,
      "loss": 0.8402,
      "step": 23000
    },
    {
      "epoch": 0.5789318092234923,
      "grad_norm": 0.6151154637336731,
      "learning_rate": 2.106572723689397e-05,
      "loss": 0.872,
      "step": 23500
    },
    {
      "epoch": 0.5912495072920773,
      "grad_norm": 0.8272470235824585,
      "learning_rate": 2.044984233346472e-05,
      "loss": 0.8524,
      "step": 24000
    },
    {
      "epoch": 0.6035672053606622,
      "grad_norm": 1.718947410583496,
      "learning_rate": 1.9833957430035476e-05,
      "loss": 0.8404,
      "step": 24500
    },
    {
      "epoch": 0.6158849034292472,
      "grad_norm": 1.5924681425094604,
      "learning_rate": 1.921807252660623e-05,
      "loss": 0.868,
      "step": 25000
    },
    {
      "epoch": 0.6282026014978321,
      "grad_norm": 1.3491289615631104,
      "learning_rate": 1.860341939298384e-05,
      "loss": 0.8521,
      "step": 25500
    },
    {
      "epoch": 0.640520299566417,
      "grad_norm": 1.2171722650527954,
      "learning_rate": 1.798753448955459e-05,
      "loss": 0.8312,
      "step": 26000
    },
    {
      "epoch": 0.6528379976350019,
      "grad_norm": 1.0879688262939453,
      "learning_rate": 1.7371649586125348e-05,
      "loss": 0.8471,
      "step": 26500
    },
    {
      "epoch": 0.6651556957035869,
      "grad_norm": 0.9785297513008118,
      "learning_rate": 1.67557646826961e-05,
      "loss": 0.8571,
      "step": 27000
    },
    {
      "epoch": 0.6774733937721719,
      "grad_norm": 2.27364444732666,
      "learning_rate": 1.614111154907371e-05,
      "loss": 0.8548,
      "step": 27500
    },
    {
      "epoch": 0.6897910918407568,
      "grad_norm": 1.1297192573547363,
      "learning_rate": 1.552645841545132e-05,
      "loss": 0.8662,
      "step": 28000
    },
    {
      "epoch": 0.7021087899093418,
      "grad_norm": 1.4344604015350342,
      "learning_rate": 1.4910573512022074e-05,
      "loss": 0.8609,
      "step": 28500
    },
    {
      "epoch": 0.7144264879779267,
      "grad_norm": 0.8162903189659119,
      "learning_rate": 1.4294688608592826e-05,
      "loss": 0.8519,
      "step": 29000
    },
    {
      "epoch": 0.7267441860465116,
      "grad_norm": 1.4598755836486816,
      "learning_rate": 1.3678803705163579e-05,
      "loss": 0.8376,
      "step": 29500
    },
    {
      "epoch": 0.7390618841150965,
      "grad_norm": 0.9850552082061768,
      "learning_rate": 1.3062918801734334e-05,
      "loss": 0.842,
      "step": 30000
    },
    {
      "epoch": 0.7513795821836815,
      "grad_norm": 1.2781943082809448,
      "learning_rate": 1.2447033898305085e-05,
      "loss": 0.8539,
      "step": 30500
    },
    {
      "epoch": 0.7636972802522665,
      "grad_norm": 0.9877688884735107,
      "learning_rate": 1.1831148994875837e-05,
      "loss": 0.8439,
      "step": 31000
    },
    {
      "epoch": 0.7760149783208514,
      "grad_norm": 1.0702171325683594,
      "learning_rate": 1.121526409144659e-05,
      "loss": 0.8492,
      "step": 31500
    },
    {
      "epoch": 0.7883326763894364,
      "grad_norm": 1.0662543773651123,
      "learning_rate": 1.0600610957824203e-05,
      "loss": 0.8572,
      "step": 32000
    },
    {
      "epoch": 0.8006503744580212,
      "grad_norm": 1.029024600982666,
      "learning_rate": 9.985957824201814e-06,
      "loss": 0.8347,
      "step": 32500
    },
    {
      "epoch": 0.8129680725266062,
      "grad_norm": 0.965148389339447,
      "learning_rate": 9.370072920772566e-06,
      "loss": 0.8408,
      "step": 33000
    },
    {
      "epoch": 0.8252857705951911,
      "grad_norm": 1.441133975982666,
      "learning_rate": 8.754188017343319e-06,
      "loss": 0.8385,
      "step": 33500
    },
    {
      "epoch": 0.8376034686637761,
      "grad_norm": 1.6267979145050049,
      "learning_rate": 8.138303113914072e-06,
      "loss": 0.8426,
      "step": 34000
    },
    {
      "epoch": 0.8499211667323611,
      "grad_norm": 1.0745171308517456,
      "learning_rate": 7.522418210484825e-06,
      "loss": 0.8509,
      "step": 34500
    },
    {
      "epoch": 0.862238864800946,
      "grad_norm": 1.2497833967208862,
      "learning_rate": 6.906533307055578e-06,
      "loss": 0.868,
      "step": 35000
    },
    {
      "epoch": 0.874556562869531,
      "grad_norm": 1.1674834489822388,
      "learning_rate": 6.2918801734331886e-06,
      "loss": 0.8548,
      "step": 35500
    },
    {
      "epoch": 0.8868742609381158,
      "grad_norm": 1.369992733001709,
      "learning_rate": 5.6759952700039425e-06,
      "loss": 0.8662,
      "step": 36000
    },
    {
      "epoch": 0.8991919590067008,
      "grad_norm": 2.269826889038086,
      "learning_rate": 5.060110366574695e-06,
      "loss": 0.8518,
      "step": 36500
    },
    {
      "epoch": 0.9115096570752858,
      "grad_norm": 1.3908036947250366,
      "learning_rate": 4.444225463145448e-06,
      "loss": 0.8583,
      "step": 37000
    },
    {
      "epoch": 0.9238273551438707,
      "grad_norm": 0.907351553440094,
      "learning_rate": 3.8283405597162e-06,
      "loss": 0.8568,
      "step": 37500
    },
    {
      "epoch": 0.9361450532124557,
      "grad_norm": 1.2205787897109985,
      "learning_rate": 3.2124556562869534e-06,
      "loss": 0.86,
      "step": 38000
    },
    {
      "epoch": 0.9484627512810406,
      "grad_norm": 0.8388875722885132,
      "learning_rate": 2.596570752857706e-06,
      "loss": 0.8228,
      "step": 38500
    },
    {
      "epoch": 0.9607804493496256,
      "grad_norm": 1.2039369344711304,
      "learning_rate": 1.980685849428459e-06,
      "loss": 0.8462,
      "step": 39000
    },
    {
      "epoch": 0.9730981474182104,
      "grad_norm": 1.1439008712768555,
      "learning_rate": 1.3660327158060701e-06,
      "loss": 0.8669,
      "step": 39500
    },
    {
      "epoch": 0.9854158454867954,
      "grad_norm": 1.5806646347045898,
      "learning_rate": 7.50147812376823e-07,
      "loss": 0.8306,
      "step": 40000
    },
    {
      "epoch": 0.9977335435553804,
      "grad_norm": 0.8370537757873535,
      "learning_rate": 1.342629089475759e-07,
      "loss": 0.8485,
      "step": 40500
    }
  ],
  "logging_steps": 500,
  "max_steps": 40592,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 74657562624000.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}
|
|