{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 4.998698642870422,
  "eval_steps": 500,
  "global_step": 3360,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.014872652909462726,
      "grad_norm": 1.0425002574920654,
      "learning_rate": 8.928571428571428e-06,
      "loss": 0.4346,
      "step": 10
    },
    {
      "epoch": 0.02974530581892545,
      "grad_norm": 0.5861272811889648,
      "learning_rate": 1.7857142857142855e-05,
      "loss": 0.3527,
      "step": 20
    },
    {
      "epoch": 0.044617958728388175,
      "grad_norm": 0.5629558563232422,
      "learning_rate": 2.6785714285714284e-05,
      "loss": 0.2922,
      "step": 30
    },
    {
      "epoch": 0.0594906116378509,
      "grad_norm": 0.49566933512687683,
      "learning_rate": 3.571428571428571e-05,
      "loss": 0.281,
      "step": 40
    },
    {
      "epoch": 0.07436326454731362,
      "grad_norm": 0.45387113094329834,
      "learning_rate": 4.4642857142857136e-05,
      "loss": 0.2689,
      "step": 50
    },
    {
      "epoch": 0.08923591745677635,
      "grad_norm": 0.43913352489471436,
      "learning_rate": 5.357142857142857e-05,
      "loss": 0.2506,
      "step": 60
    },
    {
      "epoch": 0.10410857036623908,
      "grad_norm": 0.7242547869682312,
      "learning_rate": 6.25e-05,
      "loss": 0.229,
      "step": 70
    },
    {
      "epoch": 0.1189812232757018,
      "grad_norm": 0.5109072923660278,
      "learning_rate": 7.142857142857142e-05,
      "loss": 0.2373,
      "step": 80
    },
    {
      "epoch": 0.13385387618516453,
      "grad_norm": 0.5291035175323486,
      "learning_rate": 8.035714285714285e-05,
      "loss": 0.2405,
      "step": 90
    },
    {
      "epoch": 0.14872652909462725,
      "grad_norm": 0.48036572337150574,
      "learning_rate": 8.928571428571427e-05,
      "loss": 0.2274,
      "step": 100
    },
    {
      "epoch": 0.16359918200409,
      "grad_norm": 0.3294093906879425,
      "learning_rate": 9.82142857142857e-05,
      "loss": 0.2038,
      "step": 110
    },
    {
      "epoch": 0.1784718349135527,
      "grad_norm": 0.49968525767326355,
      "learning_rate": 0.00010714285714285714,
      "loss": 0.2084,
      "step": 120
    },
    {
      "epoch": 0.19334448782301544,
      "grad_norm": 0.32227209210395813,
      "learning_rate": 0.00011607142857142857,
      "loss": 0.1981,
      "step": 130
    },
    {
      "epoch": 0.20821714073247816,
      "grad_norm": 0.37266677618026733,
      "learning_rate": 0.000125,
      "loss": 0.2192,
      "step": 140
    },
    {
      "epoch": 0.22308979364194087,
      "grad_norm": 0.5228686928749084,
      "learning_rate": 0.00013392857142857144,
      "loss": 0.2014,
      "step": 150
    },
    {
      "epoch": 0.2379624465514036,
      "grad_norm": 0.4202245771884918,
      "learning_rate": 0.00014285714285714284,
      "loss": 0.1912,
      "step": 160
    },
    {
      "epoch": 0.25283509946086635,
      "grad_norm": 0.45801258087158203,
      "learning_rate": 0.00015178571428571427,
      "loss": 0.212,
      "step": 170
    },
    {
      "epoch": 0.26770775237032907,
      "grad_norm": 0.4326329827308655,
      "learning_rate": 0.0001607142857142857,
      "loss": 0.1973,
      "step": 180
    },
    {
      "epoch": 0.2825804052797918,
      "grad_norm": 0.38971471786499023,
      "learning_rate": 0.0001696428571428571,
      "loss": 0.1907,
      "step": 190
    },
    {
      "epoch": 0.2974530581892545,
      "grad_norm": 0.3728097975254059,
      "learning_rate": 0.00017857142857142854,
      "loss": 0.192,
      "step": 200
    },
    {
      "epoch": 0.3123257110987172,
      "grad_norm": 0.34695690870285034,
      "learning_rate": 0.00018749999999999998,
      "loss": 0.1855,
      "step": 210
    },
    {
      "epoch": 0.32719836400818,
      "grad_norm": 0.41753408312797546,
      "learning_rate": 0.0001964285714285714,
      "loss": 0.1883,
      "step": 220
    },
    {
      "epoch": 0.3420710169176427,
      "grad_norm": 0.27681878209114075,
      "learning_rate": 0.00020535714285714284,
      "loss": 0.1809,
      "step": 230
    },
    {
      "epoch": 0.3569436698271054,
      "grad_norm": 2.382871150970459,
      "learning_rate": 0.00021428571428571427,
      "loss": 0.1735,
      "step": 240
    },
    {
      "epoch": 0.3718163227365681,
      "grad_norm": 160.2670440673828,
      "learning_rate": 0.0002232142857142857,
      "loss": 1.2159,
      "step": 250
    },
    {
      "epoch": 0.3866889756460309,
      "grad_norm": 21.60050392150879,
      "learning_rate": 0.00023214285714285714,
      "loss": 5.4026,
      "step": 260
    },
    {
      "epoch": 0.4015616285554936,
      "grad_norm": 13.928524017333984,
      "learning_rate": 0.00024107142857142857,
      "loss": 4.3573,
      "step": 270
    },
    {
      "epoch": 0.4164342814649563,
      "grad_norm": 5.3707685470581055,
      "learning_rate": 0.00025,
      "loss": 3.2782,
      "step": 280
    },
    {
      "epoch": 0.431306934374419,
      "grad_norm": 5.556903839111328,
      "learning_rate": 0.0002589285714285714,
      "loss": 2.8033,
      "step": 290
    },
    {
      "epoch": 0.44617958728388174,
      "grad_norm": 2.512521505355835,
      "learning_rate": 0.00026785714285714287,
      "loss": 2.5486,
      "step": 300
    },
    {
      "epoch": 0.4610522401933445,
      "grad_norm": 3.592169761657715,
      "learning_rate": 0.0002767857142857143,
      "loss": 2.2779,
      "step": 310
    },
    {
      "epoch": 0.4759248931028072,
      "grad_norm": 2.791459321975708,
      "learning_rate": 0.0002857142857142857,
      "loss": 2.1011,
      "step": 320
    },
    {
      "epoch": 0.49079754601226994,
      "grad_norm": 1.1407463550567627,
      "learning_rate": 0.0002946428571428571,
      "loss": 1.9929,
      "step": 330
    },
    {
      "epoch": 0.5056701989217327,
      "grad_norm": 1.795841097831726,
      "learning_rate": 0.0002999987048597728,
      "loss": 1.8818,
      "step": 340
    },
    {
      "epoch": 0.5205428518311954,
      "grad_norm": 1.4798821210861206,
      "learning_rate": 0.00029998413478906613,
      "loss": 1.772,
      "step": 350
    },
    {
      "epoch": 0.5354155047406581,
      "grad_norm": 1.5337024927139282,
      "learning_rate": 0.0002999533773001224,
      "loss": 1.6782,
      "step": 360
    },
    {
      "epoch": 0.5502881576501208,
      "grad_norm": 1.332065463066101,
      "learning_rate": 0.00029990643571252174,
      "loss": 1.6035,
      "step": 370
    },
    {
      "epoch": 0.5651608105595836,
      "grad_norm": 1.0516103506088257,
      "learning_rate": 0.00029984331509255415,
      "loss": 1.5053,
      "step": 380
    },
    {
      "epoch": 0.5800334634690463,
      "grad_norm": 1.034192442893982,
      "learning_rate": 0.00029976402225267247,
      "loss": 1.3906,
      "step": 390
    },
    {
      "epoch": 0.594906116378509,
      "grad_norm": 1.2757515907287598,
      "learning_rate": 0.0002996685657507577,
      "loss": 1.2592,
      "step": 400
    },
    {
      "epoch": 0.6097787692879717,
      "grad_norm": 0.8252782225608826,
      "learning_rate": 0.000299556955889195,
      "loss": 1.0907,
      "step": 410
    },
    {
      "epoch": 0.6246514221974344,
      "grad_norm": 1.020588994026184,
      "learning_rate": 0.0002994292047137618,
      "loss": 0.9035,
      "step": 420
    },
    {
      "epoch": 0.6395240751068972,
      "grad_norm": 0.5973761677742004,
      "learning_rate": 0.0002992853260123278,
      "loss": 0.7538,
      "step": 430
    },
    {
      "epoch": 0.65439672801636,
      "grad_norm": 0.6886543035507202,
      "learning_rate": 0.0002991253353133668,
      "loss": 0.6621,
      "step": 440
    },
    {
      "epoch": 0.6692693809258227,
      "grad_norm": 0.44221287965774536,
      "learning_rate": 0.00029894924988428087,
      "loss": 0.59,
      "step": 450
    },
    {
      "epoch": 0.6841420338352854,
      "grad_norm": 0.7888408899307251,
      "learning_rate": 0.00029875708872953677,
      "loss": 0.539,
      "step": 460
    },
    {
      "epoch": 0.6990146867447481,
      "grad_norm": 0.43110209703445435,
      "learning_rate": 0.00029854887258861447,
      "loss": 0.4903,
      "step": 470
    },
    {
      "epoch": 0.7138873396542108,
      "grad_norm": 0.41334015130996704,
      "learning_rate": 0.0002983246239337692,
      "loss": 0.4488,
      "step": 480
    },
    {
      "epoch": 0.7287599925636735,
      "grad_norm": 0.3482460379600525,
      "learning_rate": 0.0002980843669676061,
      "loss": 0.4165,
      "step": 490
    },
    {
      "epoch": 0.7436326454731362,
      "grad_norm": 0.3593901991844177,
      "learning_rate": 0.0002978281276204675,
      "loss": 0.3821,
      "step": 500
    },
    {
      "epoch": 0.7436326454731362,
      "eval_loss": 0.37597203254699707,
      "eval_runtime": 212.4955,
      "eval_samples_per_second": 22.499,
      "eval_steps_per_second": 22.499,
      "step": 500
    },
    {
      "epoch": 0.758505298382599,
      "grad_norm": 0.4221905469894409,
      "learning_rate": 0.00029755593354763516,
      "loss": 0.3627,
      "step": 510
    },
    {
      "epoch": 0.7733779512920618,
      "grad_norm": 0.31105437874794006,
      "learning_rate": 0.0002972678141263449,
      "loss": 0.3346,
      "step": 520
    },
    {
      "epoch": 0.7882506042015245,
      "grad_norm": 0.2600822150707245,
      "learning_rate": 0.000296963800452616,
      "loss": 0.3217,
      "step": 530
    },
    {
      "epoch": 0.8031232571109872,
      "grad_norm": 0.21437157690525055,
      "learning_rate": 0.0002966439253378957,
      "loss": 0.3095,
      "step": 540
    },
    {
      "epoch": 0.8179959100204499,
      "grad_norm": 0.22641418874263763,
      "learning_rate": 0.000296308223305517,
      "loss": 0.2866,
      "step": 550
    },
    {
      "epoch": 0.8328685629299126,
      "grad_norm": 0.2200980931520462,
      "learning_rate": 0.00029595673058697357,
      "loss": 0.2579,
      "step": 560
    },
    {
      "epoch": 0.8477412158393753,
      "grad_norm": 0.21351036429405212,
      "learning_rate": 0.0002955894851180086,
      "loss": 0.2727,
      "step": 570
    },
    {
      "epoch": 0.862613868748838,
      "grad_norm": 0.2137759029865265,
      "learning_rate": 0.0002952065265345211,
      "loss": 0.2621,
      "step": 580
    },
    {
      "epoch": 0.8774865216583008,
      "grad_norm": 0.18923349678516388,
      "learning_rate": 0.00029480789616828765,
      "loss": 0.2647,
      "step": 590
    },
    {
      "epoch": 0.8923591745677635,
      "grad_norm": 0.1697588562965393,
      "learning_rate": 0.00029439363704250176,
      "loss": 0.2434,
      "step": 600
    },
    {
      "epoch": 0.9072318274772263,
      "grad_norm": 0.15528830885887146,
      "learning_rate": 0.0002939637938671306,
      "loss": 0.2293,
      "step": 610
    },
    {
      "epoch": 0.922104480386689,
      "grad_norm": 0.43390974402427673,
      "learning_rate": 0.0002935184130340893,
      "loss": 0.228,
      "step": 620
    },
    {
      "epoch": 0.9369771332961517,
      "grad_norm": 0.2026420682668686,
      "learning_rate": 0.000293057542612234,
      "loss": 0.2355,
      "step": 630
    },
    {
      "epoch": 0.9518497862056144,
      "grad_norm": 0.16864228248596191,
      "learning_rate": 0.00029258123234217435,
      "loss": 0.2213,
      "step": 640
    },
    {
      "epoch": 0.9667224391150772,
      "grad_norm": 0.15947186946868896,
      "learning_rate": 0.0002920895336309044,
      "loss": 0.2079,
      "step": 650
    },
    {
      "epoch": 0.9815950920245399,
      "grad_norm": 0.21965055167675018,
      "learning_rate": 0.0002915824995462551,
      "loss": 0.2002,
      "step": 660
    },
    {
      "epoch": 0.9964677449340026,
      "grad_norm": 0.23223313689231873,
      "learning_rate": 0.00029106018481116626,
      "loss": 0.1983,
      "step": 670
    },
    {
      "epoch": 1.0117122141662018,
      "grad_norm": 0.26117920875549316,
      "learning_rate": 0.00029052264579778063,
      "loss": 0.2175,
      "step": 680
    },
    {
      "epoch": 1.0265848670756645,
      "grad_norm": 0.176736518740654,
      "learning_rate": 0.00028996994052135996,
      "loss": 0.1831,
      "step": 690
    },
    {
      "epoch": 1.0414575199851273,
      "grad_norm": 0.17873461544513702,
      "learning_rate": 0.0002894021286340233,
      "loss": 0.1784,
      "step": 700
    },
    {
      "epoch": 1.05633017289459,
      "grad_norm": 0.2646450996398926,
      "learning_rate": 0.0002888192714183092,
      "loss": 0.1784,
      "step": 710
    },
    {
      "epoch": 1.0712028258040527,
      "grad_norm": 0.16840551793575287,
      "learning_rate": 0.00028822143178056114,
      "loss": 0.1726,
      "step": 720
    },
    {
      "epoch": 1.0860754787135156,
      "grad_norm": 0.1423952877521515,
      "learning_rate": 0.0002876086742441387,
      "loss": 0.1608,
      "step": 730
    },
    {
      "epoch": 1.1009481316229783,
      "grad_norm": 0.16237640380859375,
      "learning_rate": 0.0002869810649424535,
      "loss": 0.179,
      "step": 740
    },
    {
      "epoch": 1.115820784532441,
      "grad_norm": 0.158773735165596,
      "learning_rate": 0.0002863386716118316,
      "loss": 0.1742,
      "step": 750
    },
    {
      "epoch": 1.1306934374419038,
      "grad_norm": 0.17627516388893127,
      "learning_rate": 0.0002856815635842029,
      "loss": 0.1821,
      "step": 760
    },
    {
      "epoch": 1.1455660903513665,
      "grad_norm": 0.23613831400871277,
      "learning_rate": 0.00028500981177961816,
      "loss": 0.156,
      "step": 770
    },
    {
      "epoch": 1.1604387432608292,
      "grad_norm": 0.16501256823539734,
      "learning_rate": 0.0002843234886985951,
      "loss": 0.1517,
      "step": 780
    },
    {
      "epoch": 1.175311396170292,
      "grad_norm": 0.2365158647298813,
      "learning_rate": 0.00028362266841429345,
      "loss": 0.1391,
      "step": 790
    },
    {
      "epoch": 1.1901840490797546,
      "grad_norm": 0.17508777976036072,
      "learning_rate": 0.00028290742656452014,
      "loss": 0.1434,
      "step": 800
    },
    {
      "epoch": 1.2050567019892173,
      "grad_norm": 0.145797461271286,
      "learning_rate": 0.0002821778403435663,
      "loss": 0.1607,
      "step": 810
    },
    {
      "epoch": 1.21992935489868,
      "grad_norm": 0.15968403220176697,
      "learning_rate": 0.00028143398849387577,
      "loss": 0.1536,
      "step": 820
    },
    {
      "epoch": 1.2348020078081428,
      "grad_norm": 0.1553070992231369,
      "learning_rate": 0.00028067595129754647,
      "loss": 0.1481,
      "step": 830
    },
    {
      "epoch": 1.2496746607176055,
      "grad_norm": 0.1769135743379593,
      "learning_rate": 0.0002799038105676658,
      "loss": 0.1285,
      "step": 840
    },
    {
      "epoch": 1.2645473136270682,
      "grad_norm": 0.1639111191034317,
      "learning_rate": 0.0002791176496394808,
      "loss": 0.144,
      "step": 850
    },
    {
      "epoch": 1.279419966536531,
      "grad_norm": 0.19045153260231018,
      "learning_rate": 0.00027831755336140416,
      "loss": 0.1347,
      "step": 860
    },
    {
      "epoch": 1.2942926194459936,
      "grad_norm": 0.18079642951488495,
      "learning_rate": 0.00027750360808585637,
      "loss": 0.1254,
      "step": 870
    },
    {
      "epoch": 1.3091652723554563,
      "grad_norm": 0.18368874490261078,
      "learning_rate": 0.00027667590165994613,
      "loss": 0.1289,
      "step": 880
    },
    {
      "epoch": 1.324037925264919,
      "grad_norm": 0.20005619525909424,
      "learning_rate": 0.00027583452341598935,
      "loss": 0.1246,
      "step": 890
    },
    {
      "epoch": 1.338910578174382,
      "grad_norm": 0.1317131668329239,
      "learning_rate": 0.0002749795641618673,
      "loss": 0.1238,
      "step": 900
    },
    {
      "epoch": 1.3537832310838445,
      "grad_norm": 0.15287995338439941,
      "learning_rate": 0.00027411111617122656,
      "loss": 0.1224,
      "step": 910
    },
    {
      "epoch": 1.3686558839933074,
      "grad_norm": 0.1613466739654541,
      "learning_rate": 0.0002732292731735196,
      "loss": 0.1178,
      "step": 920
    },
    {
      "epoch": 1.3835285369027701,
      "grad_norm": 0.1685304194688797,
      "learning_rate": 0.000272334130343889,
      "loss": 0.1201,
      "step": 930
    },
    {
      "epoch": 1.3984011898122328,
      "grad_norm": 0.19208119809627533,
      "learning_rate": 0.0002714257842928956,
      "loss": 0.1103,
      "step": 940
    },
    {
      "epoch": 1.4132738427216955,
      "grad_norm": 0.17899583280086517,
      "learning_rate": 0.00027050433305609125,
      "loss": 0.1128,
      "step": 950
    },
    {
      "epoch": 1.4281464956311583,
      "grad_norm": 0.19848547875881195,
      "learning_rate": 0.0002695698760834384,
      "loss": 0.1112,
      "step": 960
    },
    {
      "epoch": 1.443019148540621,
      "grad_norm": 0.1710231602191925,
      "learning_rate": 0.0002686225142285762,
      "loss": 0.1107,
      "step": 970
    },
    {
      "epoch": 1.4578918014500837,
      "grad_norm": 0.1552249938249588,
      "learning_rate": 0.0002676623497379363,
      "loss": 0.0984,
      "step": 980
    },
    {
      "epoch": 1.4727644543595464,
      "grad_norm": 0.1702568084001541,
      "learning_rate": 0.0002666894862397072,
      "loss": 0.1109,
      "step": 990
    },
    {
      "epoch": 1.487637107269009,
      "grad_norm": 0.12360525131225586,
      "learning_rate": 0.00026570402873264996,
      "loss": 0.1018,
      "step": 1000
    },
    {
      "epoch": 1.487637107269009,
      "eval_loss": 0.10193677991628647,
      "eval_runtime": 212.152,
      "eval_samples_per_second": 22.536,
      "eval_steps_per_second": 22.536,
      "step": 1000
    },
    {
      "epoch": 1.5025097601784718,
      "grad_norm": 0.14356306195259094,
      "learning_rate": 0.0002647060835747659,
      "loss": 0.101,
      "step": 1010
    },
    {
      "epoch": 1.5173824130879345,
      "grad_norm": 0.12723973393440247,
      "learning_rate": 0.00026369575847181795,
      "loss": 0.095,
      "step": 1020
    },
    {
      "epoch": 1.5322550659973972,
      "grad_norm": 0.12857410311698914,
      "learning_rate": 0.0002626731624657058,
      "loss": 0.0915,
      "step": 1030
    },
    {
      "epoch": 1.54712771890686,
      "grad_norm": 0.1593610793352127,
      "learning_rate": 0.0002616384059226977,
      "loss": 0.0993,
      "step": 1040
    },
    {
      "epoch": 1.562000371816323,
      "grad_norm": 0.11687605082988739,
      "learning_rate": 0.0002605916005215186,
      "loss": 0.0894,
      "step": 1050
    },
    {
      "epoch": 1.5768730247257854,
      "grad_norm": 0.1873299479484558,
      "learning_rate": 0.0002595328592412969,
      "loss": 0.097,
      "step": 1060
    },
    {
      "epoch": 1.5917456776352483,
      "grad_norm": 0.1516319364309311,
      "learning_rate": 0.00025846229634937136,
      "loss": 0.0931,
      "step": 1070
    },
    {
      "epoch": 1.6066183305447108,
      "grad_norm": 0.1431397646665573,
      "learning_rate": 0.0002573800273889577,
      "loss": 0.0918,
      "step": 1080
    },
    {
      "epoch": 1.6214909834541738,
      "grad_norm": 0.17975349724292755,
      "learning_rate": 0.0002562861691666793,
      "loss": 0.0892,
      "step": 1090
    },
    {
      "epoch": 1.6363636363636362,
      "grad_norm": 0.1414797306060791,
      "learning_rate": 0.0002551808397399597,
      "loss": 0.0952,
      "step": 1100
    },
    {
      "epoch": 1.6512362892730992,
      "grad_norm": 0.151850625872612,
      "learning_rate": 0.0002540641584042812,
      "loss": 0.1008,
      "step": 1110
    },
    {
      "epoch": 1.6661089421825617,
      "grad_norm": 0.1266675442457199,
      "learning_rate": 0.00025293624568031,
      "loss": 0.0782,
      "step": 1120
    },
    {
      "epoch": 1.6809815950920246,
      "grad_norm": 0.12076599150896072,
      "learning_rate": 0.0002517972233008882,
      "loss": 0.0772,
      "step": 1130
    },
    {
      "epoch": 1.6958542480014873,
      "grad_norm": 0.125094935297966,
      "learning_rate": 0.0002506472141978955,
      "loss": 0.0837,
      "step": 1140
    },
    {
      "epoch": 1.71072690091095,
      "grad_norm": 0.13272984325885773,
      "learning_rate": 0.0002494863424889819,
      "loss": 0.0736,
      "step": 1150
    },
    {
      "epoch": 1.7255995538204127,
      "grad_norm": 0.16893050074577332,
      "learning_rate": 0.00024831473346417153,
      "loss": 0.0856,
      "step": 1160
    },
    {
      "epoch": 1.7404722067298755,
      "grad_norm": 0.11702137440443039,
      "learning_rate": 0.00024713251357234053,
      "loss": 0.0799,
      "step": 1170
    },
    {
      "epoch": 1.7553448596393382,
      "grad_norm": 0.13682794570922852,
      "learning_rate": 0.00024593981040756997,
      "loss": 0.089,
      "step": 1180
    },
    {
      "epoch": 1.7702175125488009,
      "grad_norm": 0.13676613569259644,
      "learning_rate": 0.0002447367526953746,
      "loss": 0.0797,
      "step": 1190
    },
    {
      "epoch": 1.7850901654582636,
      "grad_norm": 0.13324877619743347,
      "learning_rate": 0.00024352347027881003,
      "loss": 0.0792,
      "step": 1200
    },
    {
      "epoch": 1.7999628183677263,
      "grad_norm": 0.11255478858947754,
      "learning_rate": 0.00024230009410445893,
      "loss": 0.0763,
      "step": 1210
    },
    {
      "epoch": 1.814835471277189,
      "grad_norm": 0.10950371623039246,
      "learning_rate": 0.0002410667562082985,
      "loss": 0.0663,
      "step": 1220
    },
    {
      "epoch": 1.8297081241866517,
      "grad_norm": 0.11777317523956299,
      "learning_rate": 0.00023982358970145004,
      "loss": 0.0694,
      "step": 1230
    },
    {
      "epoch": 1.8445807770961147,
      "grad_norm": 0.1194106712937355,
      "learning_rate": 0.00023857072875581244,
      "loss": 0.0703,
      "step": 1240
    },
    {
      "epoch": 1.8594534300055772,
      "grad_norm": 0.11233114451169968,
      "learning_rate": 0.00023730830858958177,
      "loss": 0.0655,
      "step": 1250
    },
    {
      "epoch": 1.87432608291504,
      "grad_norm": 0.11641702055931091,
      "learning_rate": 0.00023603646545265687,
      "loss": 0.0645,
      "step": 1260
    },
    {
      "epoch": 1.8891987358245026,
      "grad_norm": 0.1414889097213745,
      "learning_rate": 0.00023475533661193495,
      "loss": 0.068,
      "step": 1270
    },
    {
      "epoch": 1.9040713887339655,
      "grad_norm": 0.10632241517305374,
      "learning_rate": 0.00023346506033649614,
      "loss": 0.064,
      "step": 1280
    },
    {
      "epoch": 1.918944041643428,
      "grad_norm": 0.10176625102758408,
      "learning_rate": 0.0002321657758826807,
      "loss": 0.062,
      "step": 1290
    },
    {
      "epoch": 1.933816694552891,
      "grad_norm": 0.09434150904417038,
      "learning_rate": 0.00023085762347905943,
      "loss": 0.0684,
      "step": 1300
    },
    {
      "epoch": 1.9486893474623534,
      "grad_norm": 0.12967799603939056,
      "learning_rate": 0.00022954074431129915,
      "loss": 0.0605,
      "step": 1310
    },
    {
      "epoch": 1.9635620003718164,
      "grad_norm": 0.1181391179561615,
      "learning_rate": 0.0002282152805069247,
      "loss": 0.0654,
      "step": 1320
    },
    {
      "epoch": 1.978434653281279,
      "grad_norm": 0.10801093280315399,
      "learning_rate": 0.00022688137511997977,
      "loss": 0.07,
      "step": 1330
    },
    {
      "epoch": 1.9933073061907418,
      "grad_norm": 0.11437591165304184,
      "learning_rate": 0.00022553917211558713,
      "loss": 0.0578,
      "step": 1340
    },
    {
      "epoch": 2.008551775422941,
      "grad_norm": 0.11018254607915878,
      "learning_rate": 0.0002241888163544111,
      "loss": 0.0565,
      "step": 1350
    },
    {
      "epoch": 2.0234244283324037,
      "grad_norm": 0.08331198990345001,
      "learning_rate": 0.0002228304535770228,
      "loss": 0.0399,
      "step": 1360
    },
    {
      "epoch": 2.0382970812418666,
      "grad_norm": 0.09547814726829529,
      "learning_rate": 0.00022146423038817102,
      "loss": 0.0438,
      "step": 1370
    },
    {
      "epoch": 2.053169734151329,
      "grad_norm": 0.10641171038150787,
      "learning_rate": 0.00022009029424095928,
      "loss": 0.0384,
      "step": 1380
    },
    {
      "epoch": 2.068042387060792,
      "grad_norm": 0.10844069719314575,
      "learning_rate": 0.0002187087934209318,
      "loss": 0.044,
      "step": 1390
    },
    {
      "epoch": 2.0829150399702545,
      "grad_norm": 0.10333788394927979,
      "learning_rate": 0.00021731987703006933,
      "loss": 0.041,
      "step": 1400
    },
    {
      "epoch": 2.0977876928797174,
      "grad_norm": 0.10635129362344742,
      "learning_rate": 0.0002159236949706967,
      "loss": 0.04,
      "step": 1410
    },
    {
      "epoch": 2.11266034578918,
      "grad_norm": 0.09010270237922668,
      "learning_rate": 0.00021452039792930474,
      "loss": 0.0402,
      "step": 1420
    },
    {
      "epoch": 2.127532998698643,
      "grad_norm": 0.09274252504110336,
      "learning_rate": 0.00021311013736028658,
      "loss": 0.0384,
      "step": 1430
    },
    {
      "epoch": 2.1424056516081054,
      "grad_norm": 0.08550871163606644,
      "learning_rate": 0.00021169306546959174,
      "loss": 0.0428,
      "step": 1440
    },
    {
      "epoch": 2.1572783045175683,
      "grad_norm": 0.10152186453342438,
      "learning_rate": 0.00021026933519829896,
      "loss": 0.0442,
      "step": 1450
    },
    {
      "epoch": 2.1721509574270312,
      "grad_norm": 0.08528181910514832,
      "learning_rate": 0.00020883910020610957,
      "loss": 0.0375,
      "step": 1460
    },
    {
      "epoch": 2.1870236103364937,
      "grad_norm": 0.09736708551645279,
      "learning_rate": 0.00020740251485476345,
      "loss": 0.0387,
      "step": 1470
    },
    {
      "epoch": 2.2018962632459567,
      "grad_norm": 0.09133671224117279,
      "learning_rate": 0.00020595973419137908,
      "loss": 0.0373,
      "step": 1480
    },
    {
      "epoch": 2.216768916155419,
      "grad_norm": 0.08406363427639008,
      "learning_rate": 0.00020451091393171964,
      "loss": 0.0381,
      "step": 1490
    },
    {
      "epoch": 2.231641569064882,
      "grad_norm": 0.08503925055265427,
      "learning_rate": 0.00020305621044338718,
      "loss": 0.0376,
      "step": 1500
    },
    {
      "epoch": 2.231641569064882,
      "eval_loss": 0.051042910665273666,
      "eval_runtime": 212.5441,
      "eval_samples_per_second": 22.494,
      "eval_steps_per_second": 22.494,
      "step": 1500
    },
    {
      "epoch": 2.2465142219743446,
      "grad_norm": 0.09201103448867798,
      "learning_rate": 0.00020159578072894606,
      "loss": 0.0393,
      "step": 1510
    },
    {
      "epoch": 2.2613868748838075,
      "grad_norm": 0.09499834477901459,
      "learning_rate": 0.00020012978240897814,
      "loss": 0.0346,
      "step": 1520
    },
    {
      "epoch": 2.27625952779327,
      "grad_norm": 0.09396501630544662,
      "learning_rate": 0.00019865837370507106,
      "loss": 0.039,
      "step": 1530
    },
    {
      "epoch": 2.291132180702733,
      "grad_norm": 0.08983522653579712,
      "learning_rate": 0.00019718171342274205,
      "loss": 0.0387,
      "step": 1540
    },
    {
      "epoch": 2.3060048336121954,
      "grad_norm": 0.1118871420621872,
      "learning_rate": 0.00019569996093429814,
      "loss": 0.0379,
      "step": 1550
    },
    {
      "epoch": 2.3208774865216584,
      "grad_norm": 0.08434595167636871,
      "learning_rate": 0.00019421327616163563,
      "loss": 0.0372,
      "step": 1560
    },
    {
      "epoch": 2.335750139431121,
      "grad_norm": 0.0915694460272789,
      "learning_rate": 0.00019272181955898017,
      "loss": 0.036,
      "step": 1570
    },
    {
      "epoch": 2.350622792340584,
      "grad_norm": 0.08459066599607468,
      "learning_rate": 0.0001912257520955692,
      "loss": 0.0363,
      "step": 1580
    },
    {
      "epoch": 2.3654954452500463,
      "grad_norm": 0.09195558726787567,
      "learning_rate": 0.00018972523523827907,
      "loss": 0.0389,
      "step": 1590
    },
    {
      "epoch": 2.3803680981595092,
      "grad_norm": 0.09830203652381897,
      "learning_rate": 0.0001882204309341982,
      "loss": 0.0373,
      "step": 1600
    },
    {
      "epoch": 2.3952407510689717,
      "grad_norm": 0.08541320264339447,
      "learning_rate": 0.00018671150159314855,
      "loss": 0.0342,
      "step": 1610
    },
    {
      "epoch": 2.4101134039784347,
      "grad_norm": 0.08817029744386673,
      "learning_rate": 0.00018519861007015729,
      "loss": 0.0371,
      "step": 1620
    },
    {
      "epoch": 2.4249860568878976,
      "grad_norm": 0.08839129656553268,
      "learning_rate": 0.00018368191964788,
      "loss": 0.0355,
      "step": 1630
    },
    {
      "epoch": 2.43985870979736,
      "grad_norm": 0.08589951694011688,
      "learning_rate": 0.00018216159401897812,
      "loss": 0.0339,
      "step": 1640
    },
    {
      "epoch": 2.454731362706823,
      "grad_norm": 0.09998754411935806,
      "learning_rate": 0.00018063779726845203,
      "loss": 0.0339,
      "step": 1650
    },
    {
      "epoch": 2.4696040156162855,
      "grad_norm": 0.08363664150238037,
      "learning_rate": 0.0001791106938559317,
      "loss": 0.0357,
      "step": 1660
    },
    {
      "epoch": 2.4844766685257484,
      "grad_norm": 0.08930620551109314,
      "learning_rate": 0.00017758044859792705,
      "loss": 0.0347,
      "step": 1670
    },
    {
      "epoch": 2.499349321435211,
      "grad_norm": 0.08270251750946045,
      "learning_rate": 0.00017604722665003956,
      "loss": 0.0332,
      "step": 1680
    },
    {
      "epoch": 2.514221974344674,
      "grad_norm": 0.09085123986005783,
      "learning_rate": 0.00017451119348913744,
      "loss": 0.0357,
      "step": 1690
    },
    {
      "epoch": 2.5290946272541364,
      "grad_norm": 0.0897296592593193,
      "learning_rate": 0.00017297251489549638,
      "loss": 0.0368,
      "step": 1700
    },
    {
      "epoch": 2.5439672801635993,
      "grad_norm": 0.07172433286905289,
      "learning_rate": 0.000171431356934907,
      "loss": 0.0371,
      "step": 1710
    },
    {
      "epoch": 2.558839933073062,
      "grad_norm": 0.0848449245095253,
      "learning_rate": 0.0001698878859407519,
      "loss": 0.032,
      "step": 1720
    },
    {
      "epoch": 2.5737125859825247,
      "grad_norm": 0.08270355314016342,
      "learning_rate": 0.00016834226849605371,
      "loss": 0.0333,
      "step": 1730
    },
    {
      "epoch": 2.588585238891987,
      "grad_norm": 0.07130729407072067,
      "learning_rate": 0.00016679467141549617,
      "loss": 0.0324,
      "step": 1740
    },
    {
      "epoch": 2.60345789180145,
      "grad_norm": 0.07863139361143112,
      "learning_rate": 0.00016524526172742026,
      "loss": 0.0295,
      "step": 1750
    },
    {
      "epoch": 2.6183305447109126,
      "grad_norm": 0.08600688725709915,
      "learning_rate": 0.00016369420665579725,
      "loss": 0.0342,
      "step": 1760
    },
    {
      "epoch": 2.6332031976203756,
      "grad_norm": 0.10146727412939072,
      "learning_rate": 0.0001621416736021805,
      "loss": 0.032,
      "step": 1770
    },
    {
      "epoch": 2.648075850529838,
      "grad_norm": 0.0812121257185936,
      "learning_rate": 0.00016058783012763844,
      "loss": 0.0341,
      "step": 1780
    },
    {
      "epoch": 2.662948503439301,
      "grad_norm": 0.0973149985074997,
      "learning_rate": 0.00015903284393466987,
      "loss": 0.0313,
      "step": 1790
    },
    {
      "epoch": 2.677821156348764,
      "grad_norm": 0.0835902988910675,
      "learning_rate": 0.00015747688284910457,
      "loss": 0.0298,
      "step": 1800
    },
    {
      "epoch": 2.6926938092582264,
      "grad_norm": 0.07972200214862823,
      "learning_rate": 0.00015592011480198992,
      "loss": 0.0346,
      "step": 1810
    },
    {
      "epoch": 2.707566462167689,
      "grad_norm": 0.07594762742519379,
      "learning_rate": 0.0001543627078114667,
      "loss": 0.0338,
      "step": 1820
    },
    {
      "epoch": 2.722439115077152,
      "grad_norm": 0.07757771015167236,
      "learning_rate": 0.00015280482996463533,
      "loss": 0.0315,
      "step": 1830
    },
    {
      "epoch": 2.737311767986615,
      "grad_norm": 0.06432707607746124,
      "learning_rate": 0.00015124664939941457,
      "loss": 0.0319,
      "step": 1840
    },
    {
      "epoch": 2.7521844208960773,
      "grad_norm": 0.07696104794740677,
      "learning_rate": 0.00014968833428639474,
      "loss": 0.0301,
      "step": 1850
    },
    {
      "epoch": 2.7670570738055402,
      "grad_norm": 0.07426641881465912,
      "learning_rate": 0.00014813005281068774,
      "loss": 0.0285,
      "step": 1860
    },
    {
      "epoch": 2.7819297267150027,
      "grad_norm": 0.0765393078327179,
      "learning_rate": 0.00014657197315377495,
      "loss": 0.0313,
      "step": 1870
    },
    {
      "epoch": 2.7968023796244657,
      "grad_norm": 0.07151610404253006,
      "learning_rate": 0.00014501426347535598,
      "loss": 0.03,
      "step": 1880
    },
    {
      "epoch": 2.811675032533928,
      "grad_norm": 0.07834175229072571,
      "learning_rate": 0.0001434570918951996,
      "loss": 0.0286,
      "step": 1890
    },
    {
      "epoch": 2.826547685443391,
      "grad_norm": 0.09932053834199905,
      "learning_rate": 0.00014190062647499892,
      "loss": 0.0307,
      "step": 1900
    },
    {
      "epoch": 2.8414203383528536,
      "grad_norm": 0.08595503121614456,
      "learning_rate": 0.00014034503520023297,
      "loss": 0.0306,
      "step": 1910
    },
    {
      "epoch": 2.8562929912623165,
      "grad_norm": 0.08349858224391937,
      "learning_rate": 0.00013879048596203636,
      "loss": 0.0306,
      "step": 1920
    },
    {
      "epoch": 2.871165644171779,
      "grad_norm": 0.07905739545822144,
      "learning_rate": 0.0001372371465390794,
      "loss": 0.0305,
      "step": 1930
    },
    {
      "epoch": 2.886038297081242,
      "grad_norm": 0.06820567697286606,
      "learning_rate": 0.0001356851845794598,
      "loss": 0.0276,
      "step": 1940
    },
    {
      "epoch": 2.9009109499907044,
      "grad_norm": 0.07227708399295807,
      "learning_rate": 0.00013413476758260934,
      "loss": 0.0267,
      "step": 1950
    },
    {
      "epoch": 2.9157836029001674,
      "grad_norm": 0.09035148471593857,
      "learning_rate": 0.00013258606288121542,
      "loss": 0.0287,
      "step": 1960
    },
    {
      "epoch": 2.9306562558096303,
      "grad_norm": 0.08626757562160492,
      "learning_rate": 0.00013103923762316198,
      "loss": 0.0298,
      "step": 1970
    },
    {
      "epoch": 2.945528908719093,
      "grad_norm": 0.0765102431178093,
      "learning_rate": 0.00012949445875348902,
      "loss": 0.0274,
      "step": 1980
    },
    {
      "epoch": 2.9604015616285553,
      "grad_norm": 0.08610813319683075,
      "learning_rate": 0.00012795189299637483,
      "loss": 0.0283,
      "step": 1990
    },
    {
      "epoch": 2.975274214538018,
      "grad_norm": 0.08020433783531189,
      "learning_rate": 0.00012641170683714222,
      "loss": 0.0267,
      "step": 2000
    },
    {
      "epoch": 2.975274214538018,
      "eval_loss": 0.0338360071182251,
      "eval_runtime": 212.7237,
      "eval_samples_per_second": 22.475,
      "eval_steps_per_second": 22.475,
      "step": 2000
    },
    {
      "epoch": 2.990146867447481,
      "grad_norm": 0.06885667890310287,
      "learning_rate": 0.00012487406650428954,
      "loss": 0.0277,
      "step": 2010
    },
    {
      "epoch": 3.00539133667968,
      "grad_norm": 0.07658534497022629,
      "learning_rate": 0.00012333913795155053,
      "loss": 0.0251,
      "step": 2020
    },
    {
      "epoch": 3.020263989589143,
      "grad_norm": 0.06449634581804276,
      "learning_rate": 0.00012180708683998321,
      "loss": 0.0147,
      "step": 2030
    },
    {
      "epoch": 3.0351366424986055,
      "grad_norm": 0.06312290579080582,
      "learning_rate": 0.00012027807852009038,
      "loss": 0.0157,
      "step": 2040
    },
    {
      "epoch": 3.0500092954080684,
      "grad_norm": 0.07343071699142456,
      "learning_rate": 0.00011875227801397381,
      "loss": 0.0149,
      "step": 2050
    },
    {
      "epoch": 3.064881948317531,
      "grad_norm": 0.06489036977291107,
      "learning_rate": 0.00011722984999752392,
      "loss": 0.0155,
      "step": 2060
    },
    {
      "epoch": 3.079754601226994,
      "grad_norm": 0.06041651591658592,
      "learning_rate": 0.00011571095878264658,
      "loss": 0.0139,
      "step": 2070
    },
    {
      "epoch": 3.094627254136457,
      "grad_norm": 0.07048339396715164,
      "learning_rate": 0.00011419576829952933,
      "loss": 0.014,
      "step": 2080
    },
    {
      "epoch": 3.1094999070459193,
      "grad_norm": 0.05680292099714279,
      "learning_rate": 0.00011268444207894902,
      "loss": 0.0133,
      "step": 2090
    },
    {
      "epoch": 3.124372559955382,
      "grad_norm": 0.0727318823337555,
      "learning_rate": 0.00011117714323462186,
      "loss": 0.0147,
      "step": 2100
    },
    {
      "epoch": 3.1392452128648447,
      "grad_norm": 0.054686855524778366,
      "learning_rate": 0.00010967403444559963,
      "loss": 0.0143,
      "step": 2110
    },
    {
      "epoch": 3.1541178657743076,
      "grad_norm": 0.05729954317212105,
      "learning_rate": 0.00010817527793871143,
      "loss": 0.0134,
      "step": 2120
    },
    {
      "epoch": 3.16899051868377,
      "grad_norm": 0.08314567804336548,
      "learning_rate": 0.00010668103547105553,
      "loss": 0.0148,
      "step": 2130
    },
    {
      "epoch": 3.183863171593233,
      "grad_norm": 0.05523039028048515,
      "learning_rate": 0.00010519146831254088,
      "loss": 0.0129,
      "step": 2140
    },
    {
      "epoch": 3.1987358245026956,
      "grad_norm": 0.05546917766332626,
      "learning_rate": 0.00010370673722848183,
      "loss": 0.0139,
      "step": 2150
    },
    {
      "epoch": 3.2136084774121585,
      "grad_norm": 0.05486704409122467,
      "learning_rate": 0.00010222700246224735,
      "loss": 0.0135,
      "step": 2160
    },
    {
      "epoch": 3.228481130321621,
      "grad_norm": 0.05656208097934723,
      "learning_rate": 0.00010075242371796585,
      "loss": 0.0125,
      "step": 2170
    },
    {
      "epoch": 3.243353783231084,
      "grad_norm": 0.053801827132701874,
      "learning_rate": 9.928316014328916e-05,
      "loss": 0.0141,
      "step": 2180
    },
    {
      "epoch": 3.2582264361405464,
      "grad_norm": 0.061040911823511124,
      "learning_rate": 9.781937031221589e-05,
      "loss": 0.0136,
      "step": 2190
    },
    {
      "epoch": 3.2730990890500093,
      "grad_norm": 0.05558522418141365,
      "learning_rate": 9.636121220797707e-05,
      "loss": 0.0138,
      "step": 2200
    },
    {
      "epoch": 3.287971741959472,
      "grad_norm": 0.055547308176755905,
      "learning_rate": 9.490884320598516e-05,
      "loss": 0.0136,
      "step": 2210
    },
    {
      "epoch": 3.3028443948689348,
      "grad_norm": 0.061592597514390945,
      "learning_rate": 9.34624200568492e-05,
      "loss": 0.014,
      "step": 2220
    },
    {
      "epoch": 3.3177170477783973,
      "grad_norm": 0.05287894979119301,
      "learning_rate": 9.202209886945698e-05,
      "loss": 0.0125,
      "step": 2230
    },
    {
      "epoch": 3.33258970068786,
      "grad_norm": 0.06365808844566345,
      "learning_rate": 9.058803509412646e-05,
      "loss": 0.0139,
      "step": 2240
    },
    {
      "epoch": 3.347462353597323,
      "grad_norm": 0.05474059656262398,
      "learning_rate": 8.916038350582876e-05,
      "loss": 0.0141,
      "step": 2250
    },
    {
      "epoch": 3.3623350065067856,
      "grad_norm": 0.054872963577508926,
      "learning_rate": 8.773929818748315e-05,
      "loss": 0.0135,
      "step": 2260
    },
    {
      "epoch": 3.3772076594162486,
      "grad_norm": 0.05935963988304138,
      "learning_rate": 8.632493251332793e-05,
      "loss": 0.0128,
      "step": 2270
    },
    {
      "epoch": 3.392080312325711,
      "grad_norm": 0.06830602139234543,
      "learning_rate": 8.491743913236628e-05,
      "loss": 0.0133,
      "step": 2280
    },
    {
      "epoch": 3.406952965235174,
      "grad_norm": 0.057178862392902374,
      "learning_rate": 8.351696995189218e-05,
      "loss": 0.0121,
      "step": 2290
    },
    {
      "epoch": 3.4218256181446365,
      "grad_norm": 0.06827449798583984,
      "learning_rate": 8.212367612109464e-05,
      "loss": 0.0127,
      "step": 2300
    },
    {
      "epoch": 3.4366982710540994,
      "grad_norm": 0.04981634393334389,
      "learning_rate": 8.073770801474495e-05,
      "loss": 0.0132,
      "step": 2310
    },
    {
      "epoch": 3.451570923963562,
      "grad_norm": 0.052124422043561935,
      "learning_rate": 7.935921521696702e-05,
      "loss": 0.0129,
      "step": 2320
    },
    {
      "epoch": 3.466443576873025,
      "grad_norm": 0.05991722270846367,
      "learning_rate": 7.798834650509306e-05,
      "loss": 0.0128,
      "step": 2330
    },
    {
      "epoch": 3.4813162297824873,
      "grad_norm": 0.05946414917707443,
      "learning_rate": 7.662524983360665e-05,
      "loss": 0.0127,
      "step": 2340
    },
    {
      "epoch": 3.4961888826919503,
      "grad_norm": 0.05650801584124565,
      "learning_rate": 7.527007231817389e-05,
      "loss": 0.0127,
      "step": 2350
    },
    {
      "epoch": 3.5110615356014128,
      "grad_norm": 0.04841410368680954,
      "learning_rate": 7.392296021976614e-05,
      "loss": 0.0122,
      "step": 2360
    },
    {
      "epoch": 3.5259341885108757,
      "grad_norm": 0.05933946743607521,
      "learning_rate": 7.258405892887398e-05,
      "loss": 0.0121,
      "step": 2370
    },
    {
      "epoch": 3.540806841420338,
      "grad_norm": 0.05451497435569763,
      "learning_rate": 7.125351294981598e-05,
      "loss": 0.0127,
      "step": 2380
    },
    {
      "epoch": 3.555679494329801,
      "grad_norm": 0.05574881285429001,
      "learning_rate": 6.993146588514225e-05,
      "loss": 0.0124,
      "step": 2390
    },
    {
      "epoch": 3.5705521472392636,
      "grad_norm": 0.057919006794691086,
      "learning_rate": 6.86180604201361e-05,
      "loss": 0.0119,
      "step": 2400
    },
    {
      "epoch": 3.5854248001487266,
      "grad_norm": 0.051368821412324905,
      "learning_rate": 6.731343830741433e-05,
      "loss": 0.0126,
      "step": 2410
    },
    {
      "epoch": 3.6002974530581895,
      "grad_norm": 0.06351654976606369,
      "learning_rate": 6.6017740351628e-05,
      "loss": 0.0135,
      "step": 2420
    },
    {
      "epoch": 3.615170105967652,
      "grad_norm": 0.053709421306848526,
      "learning_rate": 6.473110639426616e-05,
      "loss": 0.0122,
      "step": 2430
    },
    {
      "epoch": 3.6300427588771145,
      "grad_norm": 0.061445701867341995,
      "learning_rate": 6.345367529856254e-05,
      "loss": 0.0132,
      "step": 2440
    },
    {
      "epoch": 3.6449154117865774,
      "grad_norm": 0.0678747370839119,
      "learning_rate": 6.218558493450893e-05,
      "loss": 0.0125,
      "step": 2450
    },
    {
      "epoch": 3.6597880646960403,
      "grad_norm": 0.05095114931464195,
      "learning_rate": 6.0926972163974775e-05,
      "loss": 0.012,
      "step": 2460
    },
    {
      "epoch": 3.674660717605503,
      "grad_norm": 0.05740583688020706,
      "learning_rate": 5.9677972825936254e-05,
      "loss": 0.0125,
      "step": 2470
    },
    {
      "epoch": 3.6895333705149658,
      "grad_norm": 0.05399662256240845,
      "learning_rate": 5.8438721721815536e-05,
      "loss": 0.0134,
      "step": 2480
    },
    {
      "epoch": 3.7044060234244283,
      "grad_norm": 0.056056030094623566,
      "learning_rate": 5.720935260093177e-05,
      "loss": 0.0125,
      "step": 2490
    },
    {
      "epoch": 3.719278676333891,
      "grad_norm": 0.046866290271282196,
      "learning_rate": 5.598999814606618e-05,
      "loss": 0.0118,
      "step": 2500
    },
    {
      "epoch": 3.719278676333891,
      "eval_loss": 0.031009526923298836,
      "eval_runtime": 212.3923,
      "eval_samples_per_second": 22.51,
      "eval_steps_per_second": 22.51,
      "step": 2500
    },
    {
      "epoch": 3.7341513292433537,
      "grad_norm": 0.046400755643844604,
      "learning_rate": 5.4780789959141524e-05,
      "loss": 0.0122,
      "step": 2510
    },
    {
      "epoch": 3.7490239821528166,
      "grad_norm": 0.05211547762155533,
      "learning_rate": 5.358185854701909e-05,
      "loss": 0.0122,
      "step": 2520
    },
    {
      "epoch": 3.763896635062279,
      "grad_norm": 0.0429752878844738,
      "learning_rate": 5.239333330741298e-05,
      "loss": 0.0124,
      "step": 2530
    },
    {
      "epoch": 3.778769287971742,
      "grad_norm": 0.05008607730269432,
      "learning_rate": 5.121534251492486e-05,
      "loss": 0.0125,
      "step": 2540
    },
    {
      "epoch": 3.7936419408812045,
      "grad_norm": 0.046397943049669266,
      "learning_rate": 5.004801330719941e-05,
      "loss": 0.0111,
      "step": 2550
    },
    {
      "epoch": 3.8085145937906675,
      "grad_norm": 0.05960022658109665,
      "learning_rate": 4.8891471671202675e-05,
      "loss": 0.0117,
      "step": 2560
    },
    {
      "epoch": 3.82338724670013,
      "grad_norm": 0.04353282228112221,
      "learning_rate": 4.7745842429624795e-05,
      "loss": 0.0119,
      "step": 2570
    },
    {
      "epoch": 3.838259899609593,
      "grad_norm": 0.05057670921087265,
      "learning_rate": 4.661124922740794e-05,
      "loss": 0.0116,
      "step": 2580
    },
    {
      "epoch": 3.853132552519056,
      "grad_norm": 0.04886782541871071,
      "learning_rate": 4.548781451840179e-05,
      "loss": 0.0113,
      "step": 2590
    },
    {
      "epoch": 3.8680052054285183,
      "grad_norm": 0.055182382464408875,
      "learning_rate": 4.437565955214723e-05,
      "loss": 0.0116,
      "step": 2600
    },
    {
      "epoch": 3.882877858337981,
      "grad_norm": 0.048834457993507385,
      "learning_rate": 4.3274904360790505e-05,
      "loss": 0.0121,
      "step": 2610
    },
    {
      "epoch": 3.8977505112474438,
      "grad_norm": 0.05025951564311981,
      "learning_rate": 4.218566774612802e-05,
      "loss": 0.0112,
      "step": 2620
    },
    {
      "epoch": 3.9126231641569067,
      "grad_norm": 0.05054251477122307,
      "learning_rate": 4.1108067266784746e-05,
      "loss": 0.0112,
      "step": 2630
    },
    {
      "epoch": 3.927495817066369,
      "grad_norm": 0.05326022952795029,
      "learning_rate": 4.004221922552608e-05,
      "loss": 0.0119,
      "step": 2640
    },
    {
      "epoch": 3.9423684699758317,
      "grad_norm": 0.05668502673506737,
      "learning_rate": 3.898823865670579e-05,
      "loss": 0.0114,
      "step": 2650
    },
    {
      "epoch": 3.9572411228852946,
      "grad_norm": 0.054235439747571945,
      "learning_rate": 3.794623931385062e-05,
      "loss": 0.0119,
      "step": 2660
    },
    {
      "epoch": 3.9721137757947576,
      "grad_norm": 0.05231969431042671,
      "learning_rate": 3.6916333657383024e-05,
      "loss": 0.0108,
      "step": 2670
    },
    {
      "epoch": 3.98698642870422,
      "grad_norm": 0.057500049471855164,
      "learning_rate": 3.5898632842483746e-05,
      "loss": 0.011,
      "step": 2680
    },
    {
      "epoch": 4.002230897936419,
      "grad_norm": 0.04203633964061737,
      "learning_rate": 3.489324670709494e-05,
      "loss": 0.0113,
      "step": 2690
    },
    {
      "epoch": 4.017103550845882,
      "grad_norm": 0.029648838564753532,
      "learning_rate": 3.390028376006589e-05,
      "loss": 0.0059,
      "step": 2700
    },
    {
      "epoch": 4.031976203755345,
      "grad_norm": 0.03779765963554382,
      "learning_rate": 3.2919851169441625e-05,
      "loss": 0.006,
      "step": 2710
    },
    {
      "epoch": 4.046848856664807,
      "grad_norm": 0.040116600692272186,
      "learning_rate": 3.195205475089667e-05,
      "loss": 0.0058,
      "step": 2720
    },
    {
      "epoch": 4.06172150957427,
      "grad_norm": 0.030058899894356728,
      "learning_rate": 3.099699895631474e-05,
      "loss": 0.0056,
      "step": 2730
    },
    {
      "epoch": 4.076594162483733,
      "grad_norm": 0.03675166517496109,
      "learning_rate": 3.0054786862515257e-05,
      "loss": 0.0058,
      "step": 2740
    },
    {
      "epoch": 4.091466815393196,
      "grad_norm": 0.03470413759350777,
      "learning_rate": 2.912552016012879e-05,
      "loss": 0.0057,
      "step": 2750
    },
    {
      "epoch": 4.106339468302658,
      "grad_norm": 0.03222460299730301,
      "learning_rate": 2.8209299142621522e-05,
      "loss": 0.0057,
      "step": 2760
    },
    {
      "epoch": 4.121212121212121,
      "grad_norm": 0.036458127200603485,
      "learning_rate": 2.7306222695471173e-05,
      "loss": 0.0056,
      "step": 2770
    },
    {
      "epoch": 4.136084774121584,
      "grad_norm": 0.035760316997766495,
      "learning_rate": 2.641638828549425e-05,
      "loss": 0.0055,
      "step": 2780
    },
    {
      "epoch": 4.150957427031047,
      "grad_norm": 0.04281270503997803,
      "learning_rate": 2.5539891950326875e-05,
      "loss": 0.0056,
      "step": 2790
    },
    {
      "epoch": 4.165830079940509,
      "grad_norm": 0.030339548364281654,
      "learning_rate": 2.4676828288059558e-05,
      "loss": 0.0057,
      "step": 2800
    },
    {
      "epoch": 4.180702732849972,
      "grad_norm": 0.03753247857093811,
      "learning_rate": 2.382729044702748e-05,
      "loss": 0.0058,
      "step": 2810
    },
    {
      "epoch": 4.195575385759435,
      "grad_norm": 0.035988811403512955,
      "learning_rate": 2.299137011575738e-05,
      "loss": 0.0055,
      "step": 2820
    },
    {
      "epoch": 4.210448038668898,
      "grad_norm": 0.0344134196639061,
      "learning_rate": 2.2169157513071566e-05,
      "loss": 0.0057,
      "step": 2830
    },
    {
      "epoch": 4.22532069157836,
      "grad_norm": 0.03696177527308464,
      "learning_rate": 2.136074137835107e-05,
      "loss": 0.0056,
      "step": 2840
    },
    {
      "epoch": 4.240193344487823,
      "grad_norm": 0.03733756020665169,
      "learning_rate": 2.056620896195804e-05,
      "loss": 0.0057,
      "step": 2850
    },
    {
      "epoch": 4.255065997397286,
      "grad_norm": 0.03630942478775978,
      "learning_rate": 1.978564601581919e-05,
      "loss": 0.0056,
      "step": 2860
    },
    {
      "epoch": 4.269938650306749,
      "grad_norm": 0.03577449545264244,
      "learning_rate": 1.9019136784170635e-05,
      "loss": 0.0055,
      "step": 2870
    },
    {
      "epoch": 4.284811303216211,
      "grad_norm": 0.03209745138883591,
      "learning_rate": 1.82667639944657e-05,
      "loss": 0.0054,
      "step": 2880
    },
    {
      "epoch": 4.299683956125674,
      "grad_norm": 0.03668665885925293,
      "learning_rate": 1.752860884844646e-05,
      "loss": 0.0055,
      "step": 2890
    },
    {
      "epoch": 4.314556609035137,
      "grad_norm": 0.03498975560069084,
      "learning_rate": 1.680475101337959e-05,
      "loss": 0.0055,
      "step": 2900
    },
    {
      "epoch": 4.3294292619445995,
      "grad_norm": 0.04088146984577179,
      "learning_rate": 1.60952686134583e-05,
      "loss": 0.0055,
      "step": 2910
    },
    {
      "epoch": 4.3443019148540625,
      "grad_norm": 0.035557616502046585,
      "learning_rate": 1.5400238221370413e-05,
      "loss": 0.0056,
      "step": 2920
    },
    {
      "epoch": 4.3591745677635245,
      "grad_norm": 0.03443196415901184,
      "learning_rate": 1.4719734850034277e-05,
      "loss": 0.0056,
      "step": 2930
    },
    {
      "epoch": 4.3740472206729875,
      "grad_norm": 0.03481742739677429,
      "learning_rate": 1.4053831944502508e-05,
      "loss": 0.0057,
      "step": 2940
    },
    {
      "epoch": 4.38891987358245,
      "grad_norm": 0.03648516163229942,
      "learning_rate": 1.340260137403557e-05,
      "loss": 0.0053,
      "step": 2950
    },
    {
      "epoch": 4.403792526491913,
      "grad_norm": 0.03400832787156105,
      "learning_rate": 1.2766113424344814e-05,
      "loss": 0.0055,
      "step": 2960
    },
    {
      "epoch": 4.418665179401375,
      "grad_norm": 0.03558880090713501,
      "learning_rate": 1.21444367900069e-05,
      "loss": 0.0055,
      "step": 2970
    },
    {
      "epoch": 4.433537832310838,
      "grad_norm": 0.035319775342941284,
      "learning_rate": 1.1537638567049729e-05,
      "loss": 0.0055,
      "step": 2980
    },
    {
      "epoch": 4.448410485220301,
      "grad_norm": 0.03432595729827881,
      "learning_rate": 1.0945784245710848e-05,
      "loss": 0.0054,
      "step": 2990
    },
    {
      "epoch": 4.463283138129764,
      "grad_norm": 0.03571225702762604,
      "learning_rate": 1.036893770336938e-05,
      "loss": 0.0055,
      "step": 3000
    },
    {
      "epoch": 4.463283138129764,
      "eval_loss": 0.03200867399573326,
      "eval_runtime": 212.5457,
      "eval_samples_per_second": 22.494,
      "eval_steps_per_second": 22.494,
      "step": 3000
    },
    {
      "epoch": 4.478155791039226,
      "grad_norm": 0.040391724556684494,
      "learning_rate": 9.807161197651742e-06,
      "loss": 0.0056,
      "step": 3010
    },
    {
      "epoch": 4.493028443948689,
      "grad_norm": 0.03410281240940094,
      "learning_rate": 9.260515359712517e-06,
      "loss": 0.0055,
      "step": 3020
    },
    {
      "epoch": 4.507901096858152,
      "grad_norm": 0.03447275608778,
      "learning_rate": 8.729059187690479e-06,
      "loss": 0.0054,
      "step": 3030
    },
    {
      "epoch": 4.522773749767615,
      "grad_norm": 0.032652657479047775,
      "learning_rate": 8.212850040341273e-06,
      "loss": 0.0055,
      "step": 3040
    },
    {
      "epoch": 4.537646402677078,
      "grad_norm": 0.035828616470098495,
      "learning_rate": 7.711943630846684e-06,
      "loss": 0.0053,
      "step": 3050
    },
    {
      "epoch": 4.55251905558654,
      "grad_norm": 0.03351854532957077,
      "learning_rate": 7.226394020801645e-06,
      "loss": 0.0054,
      "step": 3060
    },
    {
      "epoch": 4.567391708496003,
      "grad_norm": 0.03872072696685791,
      "learning_rate": 6.7562536143796254e-06,
      "loss": 0.0056,
      "step": 3070
    },
    {
      "epoch": 4.582264361405466,
      "grad_norm": 0.03518550843000412,
      "learning_rate": 6.301573152676664e-06,
      "loss": 0.0054,
      "step": 3080
    },
    {
      "epoch": 4.597137014314928,
      "grad_norm": 0.0351685993373394,
      "learning_rate": 5.862401708235076e-06,
      "loss": 0.0052,
      "step": 3090
    },
    {
      "epoch": 4.612009667224391,
      "grad_norm": 0.0348668210208416,
      "learning_rate": 5.438786679747081e-06,
      "loss": 0.0055,
      "step": 3100
    },
    {
      "epoch": 4.626882320133854,
      "grad_norm": 0.03660331293940544,
      "learning_rate": 5.030773786939319e-06,
      "loss": 0.0055,
      "step": 3110
    },
    {
      "epoch": 4.641754973043317,
      "grad_norm": 0.04046601429581642,
      "learning_rate": 4.638407065638322e-06,
      "loss": 0.0054,
      "step": 3120
    },
    {
      "epoch": 4.65662762595278,
      "grad_norm": 0.03230154886841774,
      "learning_rate": 4.261728863017827e-06,
      "loss": 0.0054,
      "step": 3130
    },
    {
      "epoch": 4.671500278862242,
      "grad_norm": 0.034297142177820206,
      "learning_rate": 3.900779833028472e-06,
      "loss": 0.0054,
      "step": 3140
    },
    {
      "epoch": 4.686372931771705,
      "grad_norm": 0.03240946680307388,
      "learning_rate": 3.5555989320099952e-06,
      "loss": 0.0053,
      "step": 3150
    },
    {
      "epoch": 4.701245584681168,
      "grad_norm": 0.04137023165822029,
      "learning_rate": 3.2262234144868116e-06,
      "loss": 0.0054,
      "step": 3160
    },
    {
      "epoch": 4.7161182375906305,
      "grad_norm": 0.030783316120505333,
      "learning_rate": 2.912688829147214e-06,
      "loss": 0.0052,
      "step": 3170
    },
    {
      "epoch": 4.730990890500093,
      "grad_norm": 0.03588159382343292,
      "learning_rate": 2.6150290150067588e-06,
      "loss": 0.0055,
      "step": 3180
    },
    {
      "epoch": 4.7458635434095555,
      "grad_norm": 0.03300805762410164,
      "learning_rate": 2.3332760977559873e-06,
      "loss": 0.0053,
      "step": 3190
    },
    {
      "epoch": 4.7607361963190185,
      "grad_norm": 0.03986676409840584,
      "learning_rate": 2.0674604862932654e-06,
      "loss": 0.0055,
      "step": 3200
    },
    {
      "epoch": 4.775608849228481,
      "grad_norm": 0.03252493590116501,
      "learning_rate": 1.8176108694427927e-06,
      "loss": 0.0052,
      "step": 3210
    },
    {
      "epoch": 4.790481502137943,
      "grad_norm": 0.03938417136669159,
      "learning_rate": 1.583754212858329e-06,
      "loss": 0.0054,
      "step": 3220
    },
    {
      "epoch": 4.805354155047406,
      "grad_norm": 0.03552339971065521,
      "learning_rate": 1.3659157561127732e-06,
      "loss": 0.0057,
      "step": 3230
    },
    {
      "epoch": 4.820226807956869,
      "grad_norm": 0.03480495885014534,
      "learning_rate": 1.1641190099741904e-06,
      "loss": 0.0053,
      "step": 3240
    },
    {
      "epoch": 4.835099460866332,
      "grad_norm": 0.03451026231050491,
      "learning_rate": 9.783857538683603e-07,
      "loss": 0.0053,
      "step": 3250
    },
    {
      "epoch": 4.849972113775795,
      "grad_norm": 0.033308371901512146,
      "learning_rate": 8.087360335281235e-07,
      "loss": 0.0055,
      "step": 3260
    },
    {
      "epoch": 4.864844766685257,
      "grad_norm": 0.035610370337963104,
      "learning_rate": 6.551881588299279e-07,
      "loss": 0.0054,
      "step": 3270
    },
    {
      "epoch": 4.87971741959472,
      "grad_norm": 0.030910024419426918,
      "learning_rate": 5.177587018176777e-07,
      "loss": 0.0054,
      "step": 3280
    },
    {
      "epoch": 4.894590072504183,
      "grad_norm": 0.034942276775836945,
      "learning_rate": 3.964624949141626e-07,
      "loss": 0.0054,
      "step": 3290
    },
    {
      "epoch": 4.909462725413646,
      "grad_norm": 0.03491232544183731,
      "learning_rate": 2.913126293202228e-07,
      "loss": 0.0053,
      "step": 3300
    },
    {
      "epoch": 4.924335378323108,
      "grad_norm": 0.0331818163394928,
      "learning_rate": 2.0232045360184523e-07,
      "loss": 0.0051,
      "step": 3310
    },
    {
      "epoch": 4.939208031232571,
      "grad_norm": 0.034393060952425,
      "learning_rate": 1.2949557246537678e-07,
      "loss": 0.0053,
      "step": 3320
    },
    {
      "epoch": 4.954080684142034,
      "grad_norm": 0.03940508887171745,
      "learning_rate": 7.284584572085361e-08,
      "loss": 0.0052,
      "step": 3330
    },
    {
      "epoch": 4.968953337051497,
      "grad_norm": 0.03125544637441635,
      "learning_rate": 3.237738743372964e-08,
      "loss": 0.0052,
      "step": 3340
    },
    {
      "epoch": 4.983825989960959,
      "grad_norm": 0.03558258339762688,
      "learning_rate": 8.094565265054365e-09,
      "loss": 0.0054,
      "step": 3350
    },
    {
      "epoch": 4.998698642870422,
      "grad_norm": 0.03360743075609207,
      "learning_rate": 0.0,
      "loss": 0.0054,
      "step": 3360
    },
    {
      "epoch": 4.998698642870422,
      "step": 3360,
      "total_flos": 5.14290499398402e+18,
      "train_loss": 0.19145491501161208,
      "train_runtime": 31931.328,
      "train_samples_per_second": 6.737,
      "train_steps_per_second": 0.105
    }
  ],
  "logging_steps": 10,
  "max_steps": 3360,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 5,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 5.14290499398402e+18,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}