|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 500, |
|
"global_step": 10178, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.039300451955197484, |
|
"grad_norm": 0.5243114233016968, |
|
"learning_rate": 7.861635220125787e-07, |
|
"loss": 1.9595, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.07860090391039497, |
|
"grad_norm": 0.5634641051292419, |
|
"learning_rate": 1.5723270440251573e-06, |
|
"loss": 1.9417, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.11790135586559246, |
|
"grad_norm": 0.5528621077537537, |
|
"learning_rate": 2.358490566037736e-06, |
|
"loss": 1.9052, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.15720180782078993, |
|
"grad_norm": 0.5267557501792908, |
|
"learning_rate": 3.1446540880503146e-06, |
|
"loss": 1.8587, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.19650225977598743, |
|
"grad_norm": 0.4375621974468231, |
|
"learning_rate": 3.930817610062894e-06, |
|
"loss": 1.802, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.23580271173118492, |
|
"grad_norm": 0.44920215010643005, |
|
"learning_rate": 4.716981132075472e-06, |
|
"loss": 1.7458, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.2751031636863824, |
|
"grad_norm": 0.5727487802505493, |
|
"learning_rate": 5.503144654088051e-06, |
|
"loss": 1.6559, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.31440361564157987, |
|
"grad_norm": 0.5727925300598145, |
|
"learning_rate": 6.289308176100629e-06, |
|
"loss": 1.5392, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.35370406759677736, |
|
"grad_norm": 0.5609890818595886, |
|
"learning_rate": 7.0754716981132075e-06, |
|
"loss": 1.3973, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.39300451955197485, |
|
"grad_norm": 0.5241280794143677, |
|
"learning_rate": 7.861635220125787e-06, |
|
"loss": 1.3599, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.43230497150717234, |
|
"grad_norm": 0.4475036859512329, |
|
"learning_rate": 8.647798742138366e-06, |
|
"loss": 1.3509, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.47160542346236983, |
|
"grad_norm": 0.46985116600990295, |
|
"learning_rate": 9.433962264150944e-06, |
|
"loss": 1.3268, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.5109058754175673, |
|
"grad_norm": 0.49204403162002563, |
|
"learning_rate": 1.0220125786163524e-05, |
|
"loss": 1.3113, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.5502063273727648, |
|
"grad_norm": 0.5724858045578003, |
|
"learning_rate": 1.1006289308176102e-05, |
|
"loss": 1.3017, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.5895067793279622, |
|
"grad_norm": 0.6059098839759827, |
|
"learning_rate": 1.179245283018868e-05, |
|
"loss": 1.3185, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.6288072312831597, |
|
"grad_norm": 0.6029092669487, |
|
"learning_rate": 1.2578616352201259e-05, |
|
"loss": 1.3167, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.6681076832383572, |
|
"grad_norm": 0.5854910612106323, |
|
"learning_rate": 1.3364779874213839e-05, |
|
"loss": 1.3188, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.7074081351935547, |
|
"grad_norm": 0.7484495043754578, |
|
"learning_rate": 1.4150943396226415e-05, |
|
"loss": 1.3073, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.7467085871487522, |
|
"grad_norm": 0.5744655132293701, |
|
"learning_rate": 1.4937106918238995e-05, |
|
"loss": 1.3014, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.7860090391039497, |
|
"grad_norm": 0.6052381992340088, |
|
"learning_rate": 1.5723270440251575e-05, |
|
"loss": 1.2955, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.8253094910591472, |
|
"grad_norm": 0.7050319910049438, |
|
"learning_rate": 1.650943396226415e-05, |
|
"loss": 1.3033, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.8646099430143447, |
|
"grad_norm": 0.596123456954956, |
|
"learning_rate": 1.729559748427673e-05, |
|
"loss": 1.3067, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.9039103949695422, |
|
"grad_norm": 0.618027925491333, |
|
"learning_rate": 1.8081761006289308e-05, |
|
"loss": 1.2855, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.9432108469247397, |
|
"grad_norm": 0.7311336994171143, |
|
"learning_rate": 1.8867924528301888e-05, |
|
"loss": 1.278, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.9825112988799372, |
|
"grad_norm": 0.6211506128311157, |
|
"learning_rate": 1.9654088050314464e-05, |
|
"loss": 1.3028, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.999803497740224, |
|
"eval_loss": 1.2847634553909302, |
|
"eval_runtime": 874.8058, |
|
"eval_samples_per_second": 4.363, |
|
"eval_steps_per_second": 0.728, |
|
"step": 2544 |
|
}, |
|
{ |
|
"epoch": 1.0218117508351345, |
|
"grad_norm": 0.7327952980995178, |
|
"learning_rate": 1.9999704794507125e-05, |
|
"loss": 1.2766, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.0611122027903321, |
|
"grad_norm": 0.6698949337005615, |
|
"learning_rate": 1.9997709221324418e-05, |
|
"loss": 1.2867, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 1.1004126547455295, |
|
"grad_norm": 0.6375765204429626, |
|
"learning_rate": 1.9993831411601573e-05, |
|
"loss": 1.2934, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 1.1397131067007271, |
|
"grad_norm": 0.7647634148597717, |
|
"learning_rate": 1.998807209540135e-05, |
|
"loss": 1.2868, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 1.1790135586559245, |
|
"grad_norm": 0.722791314125061, |
|
"learning_rate": 1.9980432357011672e-05, |
|
"loss": 1.2833, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 1.218314010611122, |
|
"grad_norm": 0.6894689202308655, |
|
"learning_rate": 1.9970913634741498e-05, |
|
"loss": 1.2675, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 1.2576144625663195, |
|
"grad_norm": 0.8223153352737427, |
|
"learning_rate": 1.995951772065004e-05, |
|
"loss": 1.2946, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 1.2969149145215169, |
|
"grad_norm": 0.7805577516555786, |
|
"learning_rate": 1.994624676020936e-05, |
|
"loss": 1.2843, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 1.3362153664767145, |
|
"grad_norm": 0.8491411209106445, |
|
"learning_rate": 1.9931103251900485e-05, |
|
"loss": 1.2665, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 1.375515818431912, |
|
"grad_norm": 0.700754702091217, |
|
"learning_rate": 1.9914090046742984e-05, |
|
"loss": 1.2724, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 1.4148162703871094, |
|
"grad_norm": 0.7429256439208984, |
|
"learning_rate": 1.9895210347758233e-05, |
|
"loss": 1.2818, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 1.4541167223423068, |
|
"grad_norm": 0.7554534077644348, |
|
"learning_rate": 1.9874467709366408e-05, |
|
"loss": 1.2717, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 1.4934171742975044, |
|
"grad_norm": 0.6377087831497192, |
|
"learning_rate": 1.985186603671728e-05, |
|
"loss": 1.2817, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 1.532717626252702, |
|
"grad_norm": 0.6828142404556274, |
|
"learning_rate": 1.9827409584955025e-05, |
|
"loss": 1.2666, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 1.5720180782078994, |
|
"grad_norm": 0.7497674822807312, |
|
"learning_rate": 1.9801102958417107e-05, |
|
"loss": 1.2774, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 1.6113185301630968, |
|
"grad_norm": 0.6864811778068542, |
|
"learning_rate": 1.977295110976744e-05, |
|
"loss": 1.2754, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 1.6506189821182944, |
|
"grad_norm": 0.6789698004722595, |
|
"learning_rate": 1.9742959339063977e-05, |
|
"loss": 1.2786, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 1.689919434073492, |
|
"grad_norm": 0.7490926384925842, |
|
"learning_rate": 1.971113329276087e-05, |
|
"loss": 1.2809, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.7292198860286894, |
|
"grad_norm": 0.7881296873092651, |
|
"learning_rate": 1.9677478962645422e-05, |
|
"loss": 1.2621, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.7685203379838867, |
|
"grad_norm": 0.7230281829833984, |
|
"learning_rate": 1.9642002684710065e-05, |
|
"loss": 1.2852, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.8078207899390843, |
|
"grad_norm": 0.7554972767829895, |
|
"learning_rate": 1.960471113795947e-05, |
|
"loss": 1.2721, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.8471212418942817, |
|
"grad_norm": 0.734038770198822, |
|
"learning_rate": 1.9565611343153133e-05, |
|
"loss": 1.2814, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.886421693849479, |
|
"grad_norm": 0.7878260016441345, |
|
"learning_rate": 1.9524710661483594e-05, |
|
"loss": 1.2678, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.9257221458046767, |
|
"grad_norm": 0.6968903541564941, |
|
"learning_rate": 1.9482016793190554e-05, |
|
"loss": 1.2728, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.9650225977598743, |
|
"grad_norm": 0.6642516851425171, |
|
"learning_rate": 1.9437537776111207e-05, |
|
"loss": 1.2597, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 1.2640804052352905, |
|
"eval_runtime": 870.1498, |
|
"eval_samples_per_second": 4.387, |
|
"eval_steps_per_second": 0.732, |
|
"step": 5089 |
|
}, |
|
{ |
|
"epoch": 2.004323049715072, |
|
"grad_norm": 0.7448713183403015, |
|
"learning_rate": 1.9391281984166944e-05, |
|
"loss": 1.28, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 2.043623501670269, |
|
"grad_norm": 0.7270140647888184, |
|
"learning_rate": 1.9343258125786866e-05, |
|
"loss": 1.2702, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 2.0829239536254667, |
|
"grad_norm": 0.7596098184585571, |
|
"learning_rate": 1.9293475242268224e-05, |
|
"loss": 1.2719, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 2.1222244055806643, |
|
"grad_norm": 0.64553302526474, |
|
"learning_rate": 1.92419427060743e-05, |
|
"loss": 1.2817, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 2.161524857535862, |
|
"grad_norm": 0.7140522003173828, |
|
"learning_rate": 1.9188670219069825e-05, |
|
"loss": 1.2781, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 2.200825309491059, |
|
"grad_norm": 0.7687891721725464, |
|
"learning_rate": 1.913366781069449e-05, |
|
"loss": 1.2616, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 2.2401257614462566, |
|
"grad_norm": 0.7907755374908447, |
|
"learning_rate": 1.9076945836074716e-05, |
|
"loss": 1.2526, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 2.2794262134014542, |
|
"grad_norm": 0.9397710561752319, |
|
"learning_rate": 1.901851497407411e-05, |
|
"loss": 1.256, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 2.3187266653566514, |
|
"grad_norm": 0.7789448499679565, |
|
"learning_rate": 1.8958386225283018e-05, |
|
"loss": 1.2759, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 2.358027117311849, |
|
"grad_norm": 0.8291410207748413, |
|
"learning_rate": 1.8896570909947477e-05, |
|
"loss": 1.2546, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 2.3973275692670466, |
|
"grad_norm": 0.8539474606513977, |
|
"learning_rate": 1.8833080665837956e-05, |
|
"loss": 1.2643, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 2.436628021222244, |
|
"grad_norm": 0.7579149007797241, |
|
"learning_rate": 1.876792744605839e-05, |
|
"loss": 1.2425, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 2.4759284731774414, |
|
"grad_norm": 0.7331416606903076, |
|
"learning_rate": 1.8701123516795797e-05, |
|
"loss": 1.2541, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 2.515228925132639, |
|
"grad_norm": 0.9984397888183594, |
|
"learning_rate": 1.8632681455010937e-05, |
|
"loss": 1.2564, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 2.5545293770878366, |
|
"grad_norm": 0.8411275744438171, |
|
"learning_rate": 1.856261414607053e-05, |
|
"loss": 1.2519, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 2.5938298290430337, |
|
"grad_norm": 0.8506852388381958, |
|
"learning_rate": 1.849093478132133e-05, |
|
"loss": 1.2384, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 2.6331302809982313, |
|
"grad_norm": 0.821840226650238, |
|
"learning_rate": 1.841765685560666e-05, |
|
"loss": 1.2591, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 2.672430732953429, |
|
"grad_norm": 0.8434679508209229, |
|
"learning_rate": 1.834279416472577e-05, |
|
"loss": 1.2414, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 2.7117311849086265, |
|
"grad_norm": 0.7219915986061096, |
|
"learning_rate": 1.8266360802836542e-05, |
|
"loss": 1.2632, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 2.751031636863824, |
|
"grad_norm": 0.7417508363723755, |
|
"learning_rate": 1.8188371159802046e-05, |
|
"loss": 1.2247, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 2.7903320888190213, |
|
"grad_norm": 0.9285396337509155, |
|
"learning_rate": 1.8108839918481384e-05, |
|
"loss": 1.2532, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 2.829632540774219, |
|
"grad_norm": 0.8321588039398193, |
|
"learning_rate": 1.8027782051965408e-05, |
|
"loss": 1.2465, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 2.8689329927294165, |
|
"grad_norm": 0.9930406808853149, |
|
"learning_rate": 1.794521282075778e-05, |
|
"loss": 1.2382, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 2.9082334446846136, |
|
"grad_norm": 0.8240166306495667, |
|
"learning_rate": 1.786114776990194e-05, |
|
"loss": 1.2632, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 2.9475338966398112, |
|
"grad_norm": 0.8060221672058105, |
|
"learning_rate": 1.777560272605447e-05, |
|
"loss": 1.2625, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 2.986834348595009, |
|
"grad_norm": 0.9597970843315125, |
|
"learning_rate": 1.7688593794505466e-05, |
|
"loss": 1.2472, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 2.999803497740224, |
|
"eval_loss": 1.2436127662658691, |
|
"eval_runtime": 871.1991, |
|
"eval_samples_per_second": 4.381, |
|
"eval_steps_per_second": 0.731, |
|
"step": 7633 |
|
}, |
|
{ |
|
"epoch": 3.0261348005502064, |
|
"grad_norm": 0.7766150236129761, |
|
"learning_rate": 1.760013735614646e-05, |
|
"loss": 1.252, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 3.0654352525054036, |
|
"grad_norm": 0.7692065834999084, |
|
"learning_rate": 1.751025006438643e-05, |
|
"loss": 1.2421, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 3.104735704460601, |
|
"grad_norm": 0.86983722448349, |
|
"learning_rate": 1.7418948842016515e-05, |
|
"loss": 1.2405, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 3.144036156415799, |
|
"grad_norm": 0.8825920820236206, |
|
"learning_rate": 1.732625087802402e-05, |
|
"loss": 1.2511, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 3.1833366083709964, |
|
"grad_norm": 0.7962430119514465, |
|
"learning_rate": 1.7232173624356307e-05, |
|
"loss": 1.2397, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 3.2226370603261936, |
|
"grad_norm": 0.8126371502876282, |
|
"learning_rate": 1.7136734792635154e-05, |
|
"loss": 1.2465, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 3.261937512281391, |
|
"grad_norm": 1.0476093292236328, |
|
"learning_rate": 1.7039952350822275e-05, |
|
"loss": 1.2414, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 3.3012379642365888, |
|
"grad_norm": 0.868041455745697, |
|
"learning_rate": 1.694184451983651e-05, |
|
"loss": 1.2318, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 3.3405384161917864, |
|
"grad_norm": 1.0523380041122437, |
|
"learning_rate": 1.6842429770123476e-05, |
|
"loss": 1.2439, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 3.3798388681469835, |
|
"grad_norm": 0.8371441960334778, |
|
"learning_rate": 1.6741726818178154e-05, |
|
"loss": 1.2219, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 3.419139320102181, |
|
"grad_norm": 0.9032717943191528, |
|
"learning_rate": 1.6639754623021227e-05, |
|
"loss": 1.2496, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 3.4584397720573787, |
|
"grad_norm": 0.8264920115470886, |
|
"learning_rate": 1.6536532382629696e-05, |
|
"loss": 1.2527, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 3.4977402240125763, |
|
"grad_norm": 0.87635737657547, |
|
"learning_rate": 1.643207953032258e-05, |
|
"loss": 1.2295, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 3.5370406759677735, |
|
"grad_norm": 0.9072712063789368, |
|
"learning_rate": 1.6326415731102226e-05, |
|
"loss": 1.2424, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 3.576341127922971, |
|
"grad_norm": 0.8442856073379517, |
|
"learning_rate": 1.6219560877952052e-05, |
|
"loss": 1.2437, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 3.6156415798781687, |
|
"grad_norm": 0.8766356706619263, |
|
"learning_rate": 1.6111535088091388e-05, |
|
"loss": 1.2538, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 3.6549420318333663, |
|
"grad_norm": 1.021742820739746, |
|
"learning_rate": 1.6002358699188035e-05, |
|
"loss": 1.247, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 3.6942424837885635, |
|
"grad_norm": 0.8045344352722168, |
|
"learning_rate": 1.5892052265529378e-05, |
|
"loss": 1.2459, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 3.733542935743761, |
|
"grad_norm": 0.8662866950035095, |
|
"learning_rate": 1.5780636554152682e-05, |
|
"loss": 1.2381, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 3.7728433876989587, |
|
"grad_norm": 1.0202692747116089, |
|
"learning_rate": 1.566813254093538e-05, |
|
"loss": 1.2382, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 3.812143839654156, |
|
"grad_norm": 0.8142303228378296, |
|
"learning_rate": 1.5554561406645964e-05, |
|
"loss": 1.2332, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 3.8514442916093534, |
|
"grad_norm": 0.9972567558288574, |
|
"learning_rate": 1.5439944532956398e-05, |
|
"loss": 1.2392, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 3.890744743564551, |
|
"grad_norm": 0.8277762532234192, |
|
"learning_rate": 1.5324303498416622e-05, |
|
"loss": 1.2476, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 3.9300451955197486, |
|
"grad_norm": 0.6960517168045044, |
|
"learning_rate": 1.520766007439205e-05, |
|
"loss": 1.2556, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 3.9693456474749462, |
|
"grad_norm": 0.9125708937644958, |
|
"learning_rate": 1.509003622096474e-05, |
|
"loss": 1.2438, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 1.2373554706573486, |
|
"eval_runtime": 870.4282, |
|
"eval_samples_per_second": 4.385, |
|
"eval_steps_per_second": 0.732, |
|
"step": 10178 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 25440, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.2731866625592525e+17, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|