llama_dialogue_ep4 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 10178,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.039300451955197484,
"grad_norm": 0.5243114233016968,
"learning_rate": 7.861635220125787e-07,
"loss": 1.9595,
"step": 100
},
{
"epoch": 0.07860090391039497,
"grad_norm": 0.5634641051292419,
"learning_rate": 1.5723270440251573e-06,
"loss": 1.9417,
"step": 200
},
{
"epoch": 0.11790135586559246,
"grad_norm": 0.5528621077537537,
"learning_rate": 2.358490566037736e-06,
"loss": 1.9052,
"step": 300
},
{
"epoch": 0.15720180782078993,
"grad_norm": 0.5267557501792908,
"learning_rate": 3.1446540880503146e-06,
"loss": 1.8587,
"step": 400
},
{
"epoch": 0.19650225977598743,
"grad_norm": 0.4375621974468231,
"learning_rate": 3.930817610062894e-06,
"loss": 1.802,
"step": 500
},
{
"epoch": 0.23580271173118492,
"grad_norm": 0.44920215010643005,
"learning_rate": 4.716981132075472e-06,
"loss": 1.7458,
"step": 600
},
{
"epoch": 0.2751031636863824,
"grad_norm": 0.5727487802505493,
"learning_rate": 5.503144654088051e-06,
"loss": 1.6559,
"step": 700
},
{
"epoch": 0.31440361564157987,
"grad_norm": 0.5727925300598145,
"learning_rate": 6.289308176100629e-06,
"loss": 1.5392,
"step": 800
},
{
"epoch": 0.35370406759677736,
"grad_norm": 0.5609890818595886,
"learning_rate": 7.0754716981132075e-06,
"loss": 1.3973,
"step": 900
},
{
"epoch": 0.39300451955197485,
"grad_norm": 0.5241280794143677,
"learning_rate": 7.861635220125787e-06,
"loss": 1.3599,
"step": 1000
},
{
"epoch": 0.43230497150717234,
"grad_norm": 0.4475036859512329,
"learning_rate": 8.647798742138366e-06,
"loss": 1.3509,
"step": 1100
},
{
"epoch": 0.47160542346236983,
"grad_norm": 0.46985116600990295,
"learning_rate": 9.433962264150944e-06,
"loss": 1.3268,
"step": 1200
},
{
"epoch": 0.5109058754175673,
"grad_norm": 0.49204403162002563,
"learning_rate": 1.0220125786163524e-05,
"loss": 1.3113,
"step": 1300
},
{
"epoch": 0.5502063273727648,
"grad_norm": 0.5724858045578003,
"learning_rate": 1.1006289308176102e-05,
"loss": 1.3017,
"step": 1400
},
{
"epoch": 0.5895067793279622,
"grad_norm": 0.6059098839759827,
"learning_rate": 1.179245283018868e-05,
"loss": 1.3185,
"step": 1500
},
{
"epoch": 0.6288072312831597,
"grad_norm": 0.6029092669487,
"learning_rate": 1.2578616352201259e-05,
"loss": 1.3167,
"step": 1600
},
{
"epoch": 0.6681076832383572,
"grad_norm": 0.5854910612106323,
"learning_rate": 1.3364779874213839e-05,
"loss": 1.3188,
"step": 1700
},
{
"epoch": 0.7074081351935547,
"grad_norm": 0.7484495043754578,
"learning_rate": 1.4150943396226415e-05,
"loss": 1.3073,
"step": 1800
},
{
"epoch": 0.7467085871487522,
"grad_norm": 0.5744655132293701,
"learning_rate": 1.4937106918238995e-05,
"loss": 1.3014,
"step": 1900
},
{
"epoch": 0.7860090391039497,
"grad_norm": 0.6052381992340088,
"learning_rate": 1.5723270440251575e-05,
"loss": 1.2955,
"step": 2000
},
{
"epoch": 0.8253094910591472,
"grad_norm": 0.7050319910049438,
"learning_rate": 1.650943396226415e-05,
"loss": 1.3033,
"step": 2100
},
{
"epoch": 0.8646099430143447,
"grad_norm": 0.596123456954956,
"learning_rate": 1.729559748427673e-05,
"loss": 1.3067,
"step": 2200
},
{
"epoch": 0.9039103949695422,
"grad_norm": 0.618027925491333,
"learning_rate": 1.8081761006289308e-05,
"loss": 1.2855,
"step": 2300
},
{
"epoch": 0.9432108469247397,
"grad_norm": 0.7311336994171143,
"learning_rate": 1.8867924528301888e-05,
"loss": 1.278,
"step": 2400
},
{
"epoch": 0.9825112988799372,
"grad_norm": 0.6211506128311157,
"learning_rate": 1.9654088050314464e-05,
"loss": 1.3028,
"step": 2500
},
{
"epoch": 0.999803497740224,
"eval_loss": 1.2847634553909302,
"eval_runtime": 874.8058,
"eval_samples_per_second": 4.363,
"eval_steps_per_second": 0.728,
"step": 2544
},
{
"epoch": 1.0218117508351345,
"grad_norm": 0.7327952980995178,
"learning_rate": 1.9999704794507125e-05,
"loss": 1.2766,
"step": 2600
},
{
"epoch": 1.0611122027903321,
"grad_norm": 0.6698949337005615,
"learning_rate": 1.9997709221324418e-05,
"loss": 1.2867,
"step": 2700
},
{
"epoch": 1.1004126547455295,
"grad_norm": 0.6375765204429626,
"learning_rate": 1.9993831411601573e-05,
"loss": 1.2934,
"step": 2800
},
{
"epoch": 1.1397131067007271,
"grad_norm": 0.7647634148597717,
"learning_rate": 1.998807209540135e-05,
"loss": 1.2868,
"step": 2900
},
{
"epoch": 1.1790135586559245,
"grad_norm": 0.722791314125061,
"learning_rate": 1.9980432357011672e-05,
"loss": 1.2833,
"step": 3000
},
{
"epoch": 1.218314010611122,
"grad_norm": 0.6894689202308655,
"learning_rate": 1.9970913634741498e-05,
"loss": 1.2675,
"step": 3100
},
{
"epoch": 1.2576144625663195,
"grad_norm": 0.8223153352737427,
"learning_rate": 1.995951772065004e-05,
"loss": 1.2946,
"step": 3200
},
{
"epoch": 1.2969149145215169,
"grad_norm": 0.7805577516555786,
"learning_rate": 1.994624676020936e-05,
"loss": 1.2843,
"step": 3300
},
{
"epoch": 1.3362153664767145,
"grad_norm": 0.8491411209106445,
"learning_rate": 1.9931103251900485e-05,
"loss": 1.2665,
"step": 3400
},
{
"epoch": 1.375515818431912,
"grad_norm": 0.700754702091217,
"learning_rate": 1.9914090046742984e-05,
"loss": 1.2724,
"step": 3500
},
{
"epoch": 1.4148162703871094,
"grad_norm": 0.7429256439208984,
"learning_rate": 1.9895210347758233e-05,
"loss": 1.2818,
"step": 3600
},
{
"epoch": 1.4541167223423068,
"grad_norm": 0.7554534077644348,
"learning_rate": 1.9874467709366408e-05,
"loss": 1.2717,
"step": 3700
},
{
"epoch": 1.4934171742975044,
"grad_norm": 0.6377087831497192,
"learning_rate": 1.985186603671728e-05,
"loss": 1.2817,
"step": 3800
},
{
"epoch": 1.532717626252702,
"grad_norm": 0.6828142404556274,
"learning_rate": 1.9827409584955025e-05,
"loss": 1.2666,
"step": 3900
},
{
"epoch": 1.5720180782078994,
"grad_norm": 0.7497674822807312,
"learning_rate": 1.9801102958417107e-05,
"loss": 1.2774,
"step": 4000
},
{
"epoch": 1.6113185301630968,
"grad_norm": 0.6864811778068542,
"learning_rate": 1.977295110976744e-05,
"loss": 1.2754,
"step": 4100
},
{
"epoch": 1.6506189821182944,
"grad_norm": 0.6789698004722595,
"learning_rate": 1.9742959339063977e-05,
"loss": 1.2786,
"step": 4200
},
{
"epoch": 1.689919434073492,
"grad_norm": 0.7490926384925842,
"learning_rate": 1.971113329276087e-05,
"loss": 1.2809,
"step": 4300
},
{
"epoch": 1.7292198860286894,
"grad_norm": 0.7881296873092651,
"learning_rate": 1.9677478962645422e-05,
"loss": 1.2621,
"step": 4400
},
{
"epoch": 1.7685203379838867,
"grad_norm": 0.7230281829833984,
"learning_rate": 1.9642002684710065e-05,
"loss": 1.2852,
"step": 4500
},
{
"epoch": 1.8078207899390843,
"grad_norm": 0.7554972767829895,
"learning_rate": 1.960471113795947e-05,
"loss": 1.2721,
"step": 4600
},
{
"epoch": 1.8471212418942817,
"grad_norm": 0.734038770198822,
"learning_rate": 1.9565611343153133e-05,
"loss": 1.2814,
"step": 4700
},
{
"epoch": 1.886421693849479,
"grad_norm": 0.7878260016441345,
"learning_rate": 1.9524710661483594e-05,
"loss": 1.2678,
"step": 4800
},
{
"epoch": 1.9257221458046767,
"grad_norm": 0.6968903541564941,
"learning_rate": 1.9482016793190554e-05,
"loss": 1.2728,
"step": 4900
},
{
"epoch": 1.9650225977598743,
"grad_norm": 0.6642516851425171,
"learning_rate": 1.9437537776111207e-05,
"loss": 1.2597,
"step": 5000
},
{
"epoch": 2.0,
"eval_loss": 1.2640804052352905,
"eval_runtime": 870.1498,
"eval_samples_per_second": 4.387,
"eval_steps_per_second": 0.732,
"step": 5089
},
{
"epoch": 2.004323049715072,
"grad_norm": 0.7448713183403015,
"learning_rate": 1.9391281984166944e-05,
"loss": 1.28,
"step": 5100
},
{
"epoch": 2.043623501670269,
"grad_norm": 0.7270140647888184,
"learning_rate": 1.9343258125786866e-05,
"loss": 1.2702,
"step": 5200
},
{
"epoch": 2.0829239536254667,
"grad_norm": 0.7596098184585571,
"learning_rate": 1.9293475242268224e-05,
"loss": 1.2719,
"step": 5300
},
{
"epoch": 2.1222244055806643,
"grad_norm": 0.64553302526474,
"learning_rate": 1.92419427060743e-05,
"loss": 1.2817,
"step": 5400
},
{
"epoch": 2.161524857535862,
"grad_norm": 0.7140522003173828,
"learning_rate": 1.9188670219069825e-05,
"loss": 1.2781,
"step": 5500
},
{
"epoch": 2.200825309491059,
"grad_norm": 0.7687891721725464,
"learning_rate": 1.913366781069449e-05,
"loss": 1.2616,
"step": 5600
},
{
"epoch": 2.2401257614462566,
"grad_norm": 0.7907755374908447,
"learning_rate": 1.9076945836074716e-05,
"loss": 1.2526,
"step": 5700
},
{
"epoch": 2.2794262134014542,
"grad_norm": 0.9397710561752319,
"learning_rate": 1.901851497407411e-05,
"loss": 1.256,
"step": 5800
},
{
"epoch": 2.3187266653566514,
"grad_norm": 0.7789448499679565,
"learning_rate": 1.8958386225283018e-05,
"loss": 1.2759,
"step": 5900
},
{
"epoch": 2.358027117311849,
"grad_norm": 0.8291410207748413,
"learning_rate": 1.8896570909947477e-05,
"loss": 1.2546,
"step": 6000
},
{
"epoch": 2.3973275692670466,
"grad_norm": 0.8539474606513977,
"learning_rate": 1.8833080665837956e-05,
"loss": 1.2643,
"step": 6100
},
{
"epoch": 2.436628021222244,
"grad_norm": 0.7579149007797241,
"learning_rate": 1.876792744605839e-05,
"loss": 1.2425,
"step": 6200
},
{
"epoch": 2.4759284731774414,
"grad_norm": 0.7331416606903076,
"learning_rate": 1.8701123516795797e-05,
"loss": 1.2541,
"step": 6300
},
{
"epoch": 2.515228925132639,
"grad_norm": 0.9984397888183594,
"learning_rate": 1.8632681455010937e-05,
"loss": 1.2564,
"step": 6400
},
{
"epoch": 2.5545293770878366,
"grad_norm": 0.8411275744438171,
"learning_rate": 1.856261414607053e-05,
"loss": 1.2519,
"step": 6500
},
{
"epoch": 2.5938298290430337,
"grad_norm": 0.8506852388381958,
"learning_rate": 1.849093478132133e-05,
"loss": 1.2384,
"step": 6600
},
{
"epoch": 2.6331302809982313,
"grad_norm": 0.821840226650238,
"learning_rate": 1.841765685560666e-05,
"loss": 1.2591,
"step": 6700
},
{
"epoch": 2.672430732953429,
"grad_norm": 0.8434679508209229,
"learning_rate": 1.834279416472577e-05,
"loss": 1.2414,
"step": 6800
},
{
"epoch": 2.7117311849086265,
"grad_norm": 0.7219915986061096,
"learning_rate": 1.8266360802836542e-05,
"loss": 1.2632,
"step": 6900
},
{
"epoch": 2.751031636863824,
"grad_norm": 0.7417508363723755,
"learning_rate": 1.8188371159802046e-05,
"loss": 1.2247,
"step": 7000
},
{
"epoch": 2.7903320888190213,
"grad_norm": 0.9285396337509155,
"learning_rate": 1.8108839918481384e-05,
"loss": 1.2532,
"step": 7100
},
{
"epoch": 2.829632540774219,
"grad_norm": 0.8321588039398193,
"learning_rate": 1.8027782051965408e-05,
"loss": 1.2465,
"step": 7200
},
{
"epoch": 2.8689329927294165,
"grad_norm": 0.9930406808853149,
"learning_rate": 1.794521282075778e-05,
"loss": 1.2382,
"step": 7300
},
{
"epoch": 2.9082334446846136,
"grad_norm": 0.8240166306495667,
"learning_rate": 1.786114776990194e-05,
"loss": 1.2632,
"step": 7400
},
{
"epoch": 2.9475338966398112,
"grad_norm": 0.8060221672058105,
"learning_rate": 1.777560272605447e-05,
"loss": 1.2625,
"step": 7500
},
{
"epoch": 2.986834348595009,
"grad_norm": 0.9597970843315125,
"learning_rate": 1.7688593794505466e-05,
"loss": 1.2472,
"step": 7600
},
{
"epoch": 2.999803497740224,
"eval_loss": 1.2436127662658691,
"eval_runtime": 871.1991,
"eval_samples_per_second": 4.381,
"eval_steps_per_second": 0.731,
"step": 7633
},
{
"epoch": 3.0261348005502064,
"grad_norm": 0.7766150236129761,
"learning_rate": 1.760013735614646e-05,
"loss": 1.252,
"step": 7700
},
{
"epoch": 3.0654352525054036,
"grad_norm": 0.7692065834999084,
"learning_rate": 1.751025006438643e-05,
"loss": 1.2421,
"step": 7800
},
{
"epoch": 3.104735704460601,
"grad_norm": 0.86983722448349,
"learning_rate": 1.7418948842016515e-05,
"loss": 1.2405,
"step": 7900
},
{
"epoch": 3.144036156415799,
"grad_norm": 0.8825920820236206,
"learning_rate": 1.732625087802402e-05,
"loss": 1.2511,
"step": 8000
},
{
"epoch": 3.1833366083709964,
"grad_norm": 0.7962430119514465,
"learning_rate": 1.7232173624356307e-05,
"loss": 1.2397,
"step": 8100
},
{
"epoch": 3.2226370603261936,
"grad_norm": 0.8126371502876282,
"learning_rate": 1.7136734792635154e-05,
"loss": 1.2465,
"step": 8200
},
{
"epoch": 3.261937512281391,
"grad_norm": 1.0476093292236328,
"learning_rate": 1.7039952350822275e-05,
"loss": 1.2414,
"step": 8300
},
{
"epoch": 3.3012379642365888,
"grad_norm": 0.868041455745697,
"learning_rate": 1.694184451983651e-05,
"loss": 1.2318,
"step": 8400
},
{
"epoch": 3.3405384161917864,
"grad_norm": 1.0523380041122437,
"learning_rate": 1.6842429770123476e-05,
"loss": 1.2439,
"step": 8500
},
{
"epoch": 3.3798388681469835,
"grad_norm": 0.8371441960334778,
"learning_rate": 1.6741726818178154e-05,
"loss": 1.2219,
"step": 8600
},
{
"epoch": 3.419139320102181,
"grad_norm": 0.9032717943191528,
"learning_rate": 1.6639754623021227e-05,
"loss": 1.2496,
"step": 8700
},
{
"epoch": 3.4584397720573787,
"grad_norm": 0.8264920115470886,
"learning_rate": 1.6536532382629696e-05,
"loss": 1.2527,
"step": 8800
},
{
"epoch": 3.4977402240125763,
"grad_norm": 0.87635737657547,
"learning_rate": 1.643207953032258e-05,
"loss": 1.2295,
"step": 8900
},
{
"epoch": 3.5370406759677735,
"grad_norm": 0.9072712063789368,
"learning_rate": 1.6326415731102226e-05,
"loss": 1.2424,
"step": 9000
},
{
"epoch": 3.576341127922971,
"grad_norm": 0.8442856073379517,
"learning_rate": 1.6219560877952052e-05,
"loss": 1.2437,
"step": 9100
},
{
"epoch": 3.6156415798781687,
"grad_norm": 0.8766356706619263,
"learning_rate": 1.6111535088091388e-05,
"loss": 1.2538,
"step": 9200
},
{
"epoch": 3.6549420318333663,
"grad_norm": 1.021742820739746,
"learning_rate": 1.6002358699188035e-05,
"loss": 1.247,
"step": 9300
},
{
"epoch": 3.6942424837885635,
"grad_norm": 0.8045344352722168,
"learning_rate": 1.5892052265529378e-05,
"loss": 1.2459,
"step": 9400
},
{
"epoch": 3.733542935743761,
"grad_norm": 0.8662866950035095,
"learning_rate": 1.5780636554152682e-05,
"loss": 1.2381,
"step": 9500
},
{
"epoch": 3.7728433876989587,
"grad_norm": 1.0202692747116089,
"learning_rate": 1.566813254093538e-05,
"loss": 1.2382,
"step": 9600
},
{
"epoch": 3.812143839654156,
"grad_norm": 0.8142303228378296,
"learning_rate": 1.5554561406645964e-05,
"loss": 1.2332,
"step": 9700
},
{
"epoch": 3.8514442916093534,
"grad_norm": 0.9972567558288574,
"learning_rate": 1.5439944532956398e-05,
"loss": 1.2392,
"step": 9800
},
{
"epoch": 3.890744743564551,
"grad_norm": 0.8277762532234192,
"learning_rate": 1.5324303498416622e-05,
"loss": 1.2476,
"step": 9900
},
{
"epoch": 3.9300451955197486,
"grad_norm": 0.6960517168045044,
"learning_rate": 1.520766007439205e-05,
"loss": 1.2556,
"step": 10000
},
{
"epoch": 3.9693456474749462,
"grad_norm": 0.9125708937644958,
"learning_rate": 1.509003622096474e-05,
"loss": 1.2438,
"step": 10100
},
{
"epoch": 4.0,
"eval_loss": 1.2373554706573486,
"eval_runtime": 870.4282,
"eval_samples_per_second": 4.385,
"eval_steps_per_second": 0.732,
"step": 10178
}
],
"logging_steps": 100,
"max_steps": 25440,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.2731866625592525e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}
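
The dump above follows the standard Hugging Face `Trainer` checkpoint layout: each `log_history` entry with a `loss` key is a training log emitted every `logging_steps` (here 100) optimizer steps, and each entry with an `eval_loss` key is an end-of-epoch evaluation. Below is a minimal sketch of how such a file can be inspected offline; the file path and the printed summary are illustrative assumptions, not part of the checkpoint itself.

```python
import json

# Load the Trainer state dump shown above (the path is an assumption;
# point it at the checkpoint's trainer_state.json on disk).
with open("trainer_state.json") as f:
    state = json.load(f)

# Training entries carry a "loss" key; evaluation entries carry "eval_loss".
train_log = [e for e in state["log_history"] if "loss" in e]
eval_log = [e for e in state["log_history"] if "eval_loss" in e]

print(f"logged every {state['logging_steps']} steps, "
      f"{state['global_step']}/{state['max_steps']} steps completed")

# Per-epoch evaluation losses.
for e in eval_log:
    print(f"epoch {e['epoch']:.2f}  step {e['step']:>5}  eval_loss {e['eval_loss']:.4f}")

# Last logged training loss and learning rate as a quick sanity check.
last = train_log[-1]
print(f"last train loss {last['loss']:.4f} at lr {last['learning_rate']:.3e}")
```

Run against this particular state, the four evaluation entries show `eval_loss` falling from 1.2848 at the end of epoch 1 to 1.2374 at the end of epoch 4, with training stopped at step 10178 of a scheduled 25440 (4 of 10 configured epochs).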