mcmaster-llama3-1b-full-pt / trainer_state.json
keatone's picture
End of training
ede525c verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.998698642870422,
"eval_steps": 500,
"global_step": 3360,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.014872652909462726,
"grad_norm": 1.0425002574920654,
"learning_rate": 8.928571428571428e-06,
"loss": 0.4346,
"step": 10
},
{
"epoch": 0.02974530581892545,
"grad_norm": 0.5861272811889648,
"learning_rate": 1.7857142857142855e-05,
"loss": 0.3527,
"step": 20
},
{
"epoch": 0.044617958728388175,
"grad_norm": 0.5629558563232422,
"learning_rate": 2.6785714285714284e-05,
"loss": 0.2922,
"step": 30
},
{
"epoch": 0.0594906116378509,
"grad_norm": 0.49566933512687683,
"learning_rate": 3.571428571428571e-05,
"loss": 0.281,
"step": 40
},
{
"epoch": 0.07436326454731362,
"grad_norm": 0.45387113094329834,
"learning_rate": 4.4642857142857136e-05,
"loss": 0.2689,
"step": 50
},
{
"epoch": 0.08923591745677635,
"grad_norm": 0.43913352489471436,
"learning_rate": 5.357142857142857e-05,
"loss": 0.2506,
"step": 60
},
{
"epoch": 0.10410857036623908,
"grad_norm": 0.7242547869682312,
"learning_rate": 6.25e-05,
"loss": 0.229,
"step": 70
},
{
"epoch": 0.1189812232757018,
"grad_norm": 0.5109072923660278,
"learning_rate": 7.142857142857142e-05,
"loss": 0.2373,
"step": 80
},
{
"epoch": 0.13385387618516453,
"grad_norm": 0.5291035175323486,
"learning_rate": 8.035714285714285e-05,
"loss": 0.2405,
"step": 90
},
{
"epoch": 0.14872652909462725,
"grad_norm": 0.48036572337150574,
"learning_rate": 8.928571428571427e-05,
"loss": 0.2274,
"step": 100
},
{
"epoch": 0.16359918200409,
"grad_norm": 0.3294093906879425,
"learning_rate": 9.82142857142857e-05,
"loss": 0.2038,
"step": 110
},
{
"epoch": 0.1784718349135527,
"grad_norm": 0.49968525767326355,
"learning_rate": 0.00010714285714285714,
"loss": 0.2084,
"step": 120
},
{
"epoch": 0.19334448782301544,
"grad_norm": 0.32227209210395813,
"learning_rate": 0.00011607142857142857,
"loss": 0.1981,
"step": 130
},
{
"epoch": 0.20821714073247816,
"grad_norm": 0.37266677618026733,
"learning_rate": 0.000125,
"loss": 0.2192,
"step": 140
},
{
"epoch": 0.22308979364194087,
"grad_norm": 0.5228686928749084,
"learning_rate": 0.00013392857142857144,
"loss": 0.2014,
"step": 150
},
{
"epoch": 0.2379624465514036,
"grad_norm": 0.4202245771884918,
"learning_rate": 0.00014285714285714284,
"loss": 0.1912,
"step": 160
},
{
"epoch": 0.25283509946086635,
"grad_norm": 0.45801258087158203,
"learning_rate": 0.00015178571428571427,
"loss": 0.212,
"step": 170
},
{
"epoch": 0.26770775237032907,
"grad_norm": 0.4326329827308655,
"learning_rate": 0.0001607142857142857,
"loss": 0.1973,
"step": 180
},
{
"epoch": 0.2825804052797918,
"grad_norm": 0.38971471786499023,
"learning_rate": 0.0001696428571428571,
"loss": 0.1907,
"step": 190
},
{
"epoch": 0.2974530581892545,
"grad_norm": 0.3728097975254059,
"learning_rate": 0.00017857142857142854,
"loss": 0.192,
"step": 200
},
{
"epoch": 0.3123257110987172,
"grad_norm": 0.34695690870285034,
"learning_rate": 0.00018749999999999998,
"loss": 0.1855,
"step": 210
},
{
"epoch": 0.32719836400818,
"grad_norm": 0.41753408312797546,
"learning_rate": 0.0001964285714285714,
"loss": 0.1883,
"step": 220
},
{
"epoch": 0.3420710169176427,
"grad_norm": 0.27681878209114075,
"learning_rate": 0.00020535714285714284,
"loss": 0.1809,
"step": 230
},
{
"epoch": 0.3569436698271054,
"grad_norm": 2.382871150970459,
"learning_rate": 0.00021428571428571427,
"loss": 0.1735,
"step": 240
},
{
"epoch": 0.3718163227365681,
"grad_norm": 160.2670440673828,
"learning_rate": 0.0002232142857142857,
"loss": 1.2159,
"step": 250
},
{
"epoch": 0.3866889756460309,
"grad_norm": 21.60050392150879,
"learning_rate": 0.00023214285714285714,
"loss": 5.4026,
"step": 260
},
{
"epoch": 0.4015616285554936,
"grad_norm": 13.928524017333984,
"learning_rate": 0.00024107142857142857,
"loss": 4.3573,
"step": 270
},
{
"epoch": 0.4164342814649563,
"grad_norm": 5.3707685470581055,
"learning_rate": 0.00025,
"loss": 3.2782,
"step": 280
},
{
"epoch": 0.431306934374419,
"grad_norm": 5.556903839111328,
"learning_rate": 0.0002589285714285714,
"loss": 2.8033,
"step": 290
},
{
"epoch": 0.44617958728388174,
"grad_norm": 2.512521505355835,
"learning_rate": 0.00026785714285714287,
"loss": 2.5486,
"step": 300
},
{
"epoch": 0.4610522401933445,
"grad_norm": 3.592169761657715,
"learning_rate": 0.0002767857142857143,
"loss": 2.2779,
"step": 310
},
{
"epoch": 0.4759248931028072,
"grad_norm": 2.791459321975708,
"learning_rate": 0.0002857142857142857,
"loss": 2.1011,
"step": 320
},
{
"epoch": 0.49079754601226994,
"grad_norm": 1.1407463550567627,
"learning_rate": 0.0002946428571428571,
"loss": 1.9929,
"step": 330
},
{
"epoch": 0.5056701989217327,
"grad_norm": 1.795841097831726,
"learning_rate": 0.0002999987048597728,
"loss": 1.8818,
"step": 340
},
{
"epoch": 0.5205428518311954,
"grad_norm": 1.4798821210861206,
"learning_rate": 0.00029998413478906613,
"loss": 1.772,
"step": 350
},
{
"epoch": 0.5354155047406581,
"grad_norm": 1.5337024927139282,
"learning_rate": 0.0002999533773001224,
"loss": 1.6782,
"step": 360
},
{
"epoch": 0.5502881576501208,
"grad_norm": 1.332065463066101,
"learning_rate": 0.00029990643571252174,
"loss": 1.6035,
"step": 370
},
{
"epoch": 0.5651608105595836,
"grad_norm": 1.0516103506088257,
"learning_rate": 0.00029984331509255415,
"loss": 1.5053,
"step": 380
},
{
"epoch": 0.5800334634690463,
"grad_norm": 1.034192442893982,
"learning_rate": 0.00029976402225267247,
"loss": 1.3906,
"step": 390
},
{
"epoch": 0.594906116378509,
"grad_norm": 1.2757515907287598,
"learning_rate": 0.0002996685657507577,
"loss": 1.2592,
"step": 400
},
{
"epoch": 0.6097787692879717,
"grad_norm": 0.8252782225608826,
"learning_rate": 0.000299556955889195,
"loss": 1.0907,
"step": 410
},
{
"epoch": 0.6246514221974344,
"grad_norm": 1.020588994026184,
"learning_rate": 0.0002994292047137618,
"loss": 0.9035,
"step": 420
},
{
"epoch": 0.6395240751068972,
"grad_norm": 0.5973761677742004,
"learning_rate": 0.0002992853260123278,
"loss": 0.7538,
"step": 430
},
{
"epoch": 0.65439672801636,
"grad_norm": 0.6886543035507202,
"learning_rate": 0.0002991253353133668,
"loss": 0.6621,
"step": 440
},
{
"epoch": 0.6692693809258227,
"grad_norm": 0.44221287965774536,
"learning_rate": 0.00029894924988428087,
"loss": 0.59,
"step": 450
},
{
"epoch": 0.6841420338352854,
"grad_norm": 0.7888408899307251,
"learning_rate": 0.00029875708872953677,
"loss": 0.539,
"step": 460
},
{
"epoch": 0.6990146867447481,
"grad_norm": 0.43110209703445435,
"learning_rate": 0.00029854887258861447,
"loss": 0.4903,
"step": 470
},
{
"epoch": 0.7138873396542108,
"grad_norm": 0.41334015130996704,
"learning_rate": 0.0002983246239337692,
"loss": 0.4488,
"step": 480
},
{
"epoch": 0.7287599925636735,
"grad_norm": 0.3482460379600525,
"learning_rate": 0.0002980843669676061,
"loss": 0.4165,
"step": 490
},
{
"epoch": 0.7436326454731362,
"grad_norm": 0.3593901991844177,
"learning_rate": 0.0002978281276204675,
"loss": 0.3821,
"step": 500
},
{
"epoch": 0.7436326454731362,
"eval_loss": 0.37597203254699707,
"eval_runtime": 212.4955,
"eval_samples_per_second": 22.499,
"eval_steps_per_second": 22.499,
"step": 500
},
{
"epoch": 0.758505298382599,
"grad_norm": 0.4221905469894409,
"learning_rate": 0.00029755593354763516,
"loss": 0.3627,
"step": 510
},
{
"epoch": 0.7733779512920618,
"grad_norm": 0.31105437874794006,
"learning_rate": 0.0002972678141263449,
"loss": 0.3346,
"step": 520
},
{
"epoch": 0.7882506042015245,
"grad_norm": 0.2600822150707245,
"learning_rate": 0.000296963800452616,
"loss": 0.3217,
"step": 530
},
{
"epoch": 0.8031232571109872,
"grad_norm": 0.21437157690525055,
"learning_rate": 0.0002966439253378957,
"loss": 0.3095,
"step": 540
},
{
"epoch": 0.8179959100204499,
"grad_norm": 0.22641418874263763,
"learning_rate": 0.000296308223305517,
"loss": 0.2866,
"step": 550
},
{
"epoch": 0.8328685629299126,
"grad_norm": 0.2200980931520462,
"learning_rate": 0.00029595673058697357,
"loss": 0.2579,
"step": 560
},
{
"epoch": 0.8477412158393753,
"grad_norm": 0.21351036429405212,
"learning_rate": 0.0002955894851180086,
"loss": 0.2727,
"step": 570
},
{
"epoch": 0.862613868748838,
"grad_norm": 0.2137759029865265,
"learning_rate": 0.0002952065265345211,
"loss": 0.2621,
"step": 580
},
{
"epoch": 0.8774865216583008,
"grad_norm": 0.18923349678516388,
"learning_rate": 0.00029480789616828765,
"loss": 0.2647,
"step": 590
},
{
"epoch": 0.8923591745677635,
"grad_norm": 0.1697588562965393,
"learning_rate": 0.00029439363704250176,
"loss": 0.2434,
"step": 600
},
{
"epoch": 0.9072318274772263,
"grad_norm": 0.15528830885887146,
"learning_rate": 0.0002939637938671306,
"loss": 0.2293,
"step": 610
},
{
"epoch": 0.922104480386689,
"grad_norm": 0.43390974402427673,
"learning_rate": 0.0002935184130340893,
"loss": 0.228,
"step": 620
},
{
"epoch": 0.9369771332961517,
"grad_norm": 0.2026420682668686,
"learning_rate": 0.000293057542612234,
"loss": 0.2355,
"step": 630
},
{
"epoch": 0.9518497862056144,
"grad_norm": 0.16864228248596191,
"learning_rate": 0.00029258123234217435,
"loss": 0.2213,
"step": 640
},
{
"epoch": 0.9667224391150772,
"grad_norm": 0.15947186946868896,
"learning_rate": 0.0002920895336309044,
"loss": 0.2079,
"step": 650
},
{
"epoch": 0.9815950920245399,
"grad_norm": 0.21965055167675018,
"learning_rate": 0.0002915824995462551,
"loss": 0.2002,
"step": 660
},
{
"epoch": 0.9964677449340026,
"grad_norm": 0.23223313689231873,
"learning_rate": 0.00029106018481116626,
"loss": 0.1983,
"step": 670
},
{
"epoch": 1.0117122141662018,
"grad_norm": 0.26117920875549316,
"learning_rate": 0.00029052264579778063,
"loss": 0.2175,
"step": 680
},
{
"epoch": 1.0265848670756645,
"grad_norm": 0.176736518740654,
"learning_rate": 0.00028996994052135996,
"loss": 0.1831,
"step": 690
},
{
"epoch": 1.0414575199851273,
"grad_norm": 0.17873461544513702,
"learning_rate": 0.0002894021286340233,
"loss": 0.1784,
"step": 700
},
{
"epoch": 1.05633017289459,
"grad_norm": 0.2646450996398926,
"learning_rate": 0.0002888192714183092,
"loss": 0.1784,
"step": 710
},
{
"epoch": 1.0712028258040527,
"grad_norm": 0.16840551793575287,
"learning_rate": 0.00028822143178056114,
"loss": 0.1726,
"step": 720
},
{
"epoch": 1.0860754787135156,
"grad_norm": 0.1423952877521515,
"learning_rate": 0.0002876086742441387,
"loss": 0.1608,
"step": 730
},
{
"epoch": 1.1009481316229783,
"grad_norm": 0.16237640380859375,
"learning_rate": 0.0002869810649424535,
"loss": 0.179,
"step": 740
},
{
"epoch": 1.115820784532441,
"grad_norm": 0.158773735165596,
"learning_rate": 0.0002863386716118316,
"loss": 0.1742,
"step": 750
},
{
"epoch": 1.1306934374419038,
"grad_norm": 0.17627516388893127,
"learning_rate": 0.0002856815635842029,
"loss": 0.1821,
"step": 760
},
{
"epoch": 1.1455660903513665,
"grad_norm": 0.23613831400871277,
"learning_rate": 0.00028500981177961816,
"loss": 0.156,
"step": 770
},
{
"epoch": 1.1604387432608292,
"grad_norm": 0.16501256823539734,
"learning_rate": 0.0002843234886985951,
"loss": 0.1517,
"step": 780
},
{
"epoch": 1.175311396170292,
"grad_norm": 0.2365158647298813,
"learning_rate": 0.00028362266841429345,
"loss": 0.1391,
"step": 790
},
{
"epoch": 1.1901840490797546,
"grad_norm": 0.17508777976036072,
"learning_rate": 0.00028290742656452014,
"loss": 0.1434,
"step": 800
},
{
"epoch": 1.2050567019892173,
"grad_norm": 0.145797461271286,
"learning_rate": 0.0002821778403435663,
"loss": 0.1607,
"step": 810
},
{
"epoch": 1.21992935489868,
"grad_norm": 0.15968403220176697,
"learning_rate": 0.00028143398849387577,
"loss": 0.1536,
"step": 820
},
{
"epoch": 1.2348020078081428,
"grad_norm": 0.1553070992231369,
"learning_rate": 0.00028067595129754647,
"loss": 0.1481,
"step": 830
},
{
"epoch": 1.2496746607176055,
"grad_norm": 0.1769135743379593,
"learning_rate": 0.0002799038105676658,
"loss": 0.1285,
"step": 840
},
{
"epoch": 1.2645473136270682,
"grad_norm": 0.1639111191034317,
"learning_rate": 0.0002791176496394808,
"loss": 0.144,
"step": 850
},
{
"epoch": 1.279419966536531,
"grad_norm": 0.19045153260231018,
"learning_rate": 0.00027831755336140416,
"loss": 0.1347,
"step": 860
},
{
"epoch": 1.2942926194459936,
"grad_norm": 0.18079642951488495,
"learning_rate": 0.00027750360808585637,
"loss": 0.1254,
"step": 870
},
{
"epoch": 1.3091652723554563,
"grad_norm": 0.18368874490261078,
"learning_rate": 0.00027667590165994613,
"loss": 0.1289,
"step": 880
},
{
"epoch": 1.324037925264919,
"grad_norm": 0.20005619525909424,
"learning_rate": 0.00027583452341598935,
"loss": 0.1246,
"step": 890
},
{
"epoch": 1.338910578174382,
"grad_norm": 0.1317131668329239,
"learning_rate": 0.0002749795641618673,
"loss": 0.1238,
"step": 900
},
{
"epoch": 1.3537832310838445,
"grad_norm": 0.15287995338439941,
"learning_rate": 0.00027411111617122656,
"loss": 0.1224,
"step": 910
},
{
"epoch": 1.3686558839933074,
"grad_norm": 0.1613466739654541,
"learning_rate": 0.0002732292731735196,
"loss": 0.1178,
"step": 920
},
{
"epoch": 1.3835285369027701,
"grad_norm": 0.1685304194688797,
"learning_rate": 0.000272334130343889,
"loss": 0.1201,
"step": 930
},
{
"epoch": 1.3984011898122328,
"grad_norm": 0.19208119809627533,
"learning_rate": 0.0002714257842928956,
"loss": 0.1103,
"step": 940
},
{
"epoch": 1.4132738427216955,
"grad_norm": 0.17899583280086517,
"learning_rate": 0.00027050433305609125,
"loss": 0.1128,
"step": 950
},
{
"epoch": 1.4281464956311583,
"grad_norm": 0.19848547875881195,
"learning_rate": 0.0002695698760834384,
"loss": 0.1112,
"step": 960
},
{
"epoch": 1.443019148540621,
"grad_norm": 0.1710231602191925,
"learning_rate": 0.0002686225142285762,
"loss": 0.1107,
"step": 970
},
{
"epoch": 1.4578918014500837,
"grad_norm": 0.1552249938249588,
"learning_rate": 0.0002676623497379363,
"loss": 0.0984,
"step": 980
},
{
"epoch": 1.4727644543595464,
"grad_norm": 0.1702568084001541,
"learning_rate": 0.0002666894862397072,
"loss": 0.1109,
"step": 990
},
{
"epoch": 1.487637107269009,
"grad_norm": 0.12360525131225586,
"learning_rate": 0.00026570402873264996,
"loss": 0.1018,
"step": 1000
},
{
"epoch": 1.487637107269009,
"eval_loss": 0.10193677991628647,
"eval_runtime": 212.152,
"eval_samples_per_second": 22.536,
"eval_steps_per_second": 22.536,
"step": 1000
},
{
"epoch": 1.5025097601784718,
"grad_norm": 0.14356306195259094,
"learning_rate": 0.0002647060835747659,
"loss": 0.101,
"step": 1010
},
{
"epoch": 1.5173824130879345,
"grad_norm": 0.12723973393440247,
"learning_rate": 0.00026369575847181795,
"loss": 0.095,
"step": 1020
},
{
"epoch": 1.5322550659973972,
"grad_norm": 0.12857410311698914,
"learning_rate": 0.0002626731624657058,
"loss": 0.0915,
"step": 1030
},
{
"epoch": 1.54712771890686,
"grad_norm": 0.1593610793352127,
"learning_rate": 0.0002616384059226977,
"loss": 0.0993,
"step": 1040
},
{
"epoch": 1.562000371816323,
"grad_norm": 0.11687605082988739,
"learning_rate": 0.0002605916005215186,
"loss": 0.0894,
"step": 1050
},
{
"epoch": 1.5768730247257854,
"grad_norm": 0.1873299479484558,
"learning_rate": 0.0002595328592412969,
"loss": 0.097,
"step": 1060
},
{
"epoch": 1.5917456776352483,
"grad_norm": 0.1516319364309311,
"learning_rate": 0.00025846229634937136,
"loss": 0.0931,
"step": 1070
},
{
"epoch": 1.6066183305447108,
"grad_norm": 0.1431397646665573,
"learning_rate": 0.0002573800273889577,
"loss": 0.0918,
"step": 1080
},
{
"epoch": 1.6214909834541738,
"grad_norm": 0.17975349724292755,
"learning_rate": 0.0002562861691666793,
"loss": 0.0892,
"step": 1090
},
{
"epoch": 1.6363636363636362,
"grad_norm": 0.1414797306060791,
"learning_rate": 0.0002551808397399597,
"loss": 0.0952,
"step": 1100
},
{
"epoch": 1.6512362892730992,
"grad_norm": 0.151850625872612,
"learning_rate": 0.0002540641584042812,
"loss": 0.1008,
"step": 1110
},
{
"epoch": 1.6661089421825617,
"grad_norm": 0.1266675442457199,
"learning_rate": 0.00025293624568031,
"loss": 0.0782,
"step": 1120
},
{
"epoch": 1.6809815950920246,
"grad_norm": 0.12076599150896072,
"learning_rate": 0.0002517972233008882,
"loss": 0.0772,
"step": 1130
},
{
"epoch": 1.6958542480014873,
"grad_norm": 0.125094935297966,
"learning_rate": 0.0002506472141978955,
"loss": 0.0837,
"step": 1140
},
{
"epoch": 1.71072690091095,
"grad_norm": 0.13272984325885773,
"learning_rate": 0.0002494863424889819,
"loss": 0.0736,
"step": 1150
},
{
"epoch": 1.7255995538204127,
"grad_norm": 0.16893050074577332,
"learning_rate": 0.00024831473346417153,
"loss": 0.0856,
"step": 1160
},
{
"epoch": 1.7404722067298755,
"grad_norm": 0.11702137440443039,
"learning_rate": 0.00024713251357234053,
"loss": 0.0799,
"step": 1170
},
{
"epoch": 1.7553448596393382,
"grad_norm": 0.13682794570922852,
"learning_rate": 0.00024593981040756997,
"loss": 0.089,
"step": 1180
},
{
"epoch": 1.7702175125488009,
"grad_norm": 0.13676613569259644,
"learning_rate": 0.0002447367526953746,
"loss": 0.0797,
"step": 1190
},
{
"epoch": 1.7850901654582636,
"grad_norm": 0.13324877619743347,
"learning_rate": 0.00024352347027881003,
"loss": 0.0792,
"step": 1200
},
{
"epoch": 1.7999628183677263,
"grad_norm": 0.11255478858947754,
"learning_rate": 0.00024230009410445893,
"loss": 0.0763,
"step": 1210
},
{
"epoch": 1.814835471277189,
"grad_norm": 0.10950371623039246,
"learning_rate": 0.0002410667562082985,
"loss": 0.0663,
"step": 1220
},
{
"epoch": 1.8297081241866517,
"grad_norm": 0.11777317523956299,
"learning_rate": 0.00023982358970145004,
"loss": 0.0694,
"step": 1230
},
{
"epoch": 1.8445807770961147,
"grad_norm": 0.1194106712937355,
"learning_rate": 0.00023857072875581244,
"loss": 0.0703,
"step": 1240
},
{
"epoch": 1.8594534300055772,
"grad_norm": 0.11233114451169968,
"learning_rate": 0.00023730830858958177,
"loss": 0.0655,
"step": 1250
},
{
"epoch": 1.87432608291504,
"grad_norm": 0.11641702055931091,
"learning_rate": 0.00023603646545265687,
"loss": 0.0645,
"step": 1260
},
{
"epoch": 1.8891987358245026,
"grad_norm": 0.1414889097213745,
"learning_rate": 0.00023475533661193495,
"loss": 0.068,
"step": 1270
},
{
"epoch": 1.9040713887339655,
"grad_norm": 0.10632241517305374,
"learning_rate": 0.00023346506033649614,
"loss": 0.064,
"step": 1280
},
{
"epoch": 1.918944041643428,
"grad_norm": 0.10176625102758408,
"learning_rate": 0.0002321657758826807,
"loss": 0.062,
"step": 1290
},
{
"epoch": 1.933816694552891,
"grad_norm": 0.09434150904417038,
"learning_rate": 0.00023085762347905943,
"loss": 0.0684,
"step": 1300
},
{
"epoch": 1.9486893474623534,
"grad_norm": 0.12967799603939056,
"learning_rate": 0.00022954074431129915,
"loss": 0.0605,
"step": 1310
},
{
"epoch": 1.9635620003718164,
"grad_norm": 0.1181391179561615,
"learning_rate": 0.0002282152805069247,
"loss": 0.0654,
"step": 1320
},
{
"epoch": 1.978434653281279,
"grad_norm": 0.10801093280315399,
"learning_rate": 0.00022688137511997977,
"loss": 0.07,
"step": 1330
},
{
"epoch": 1.9933073061907418,
"grad_norm": 0.11437591165304184,
"learning_rate": 0.00022553917211558713,
"loss": 0.0578,
"step": 1340
},
{
"epoch": 2.008551775422941,
"grad_norm": 0.11018254607915878,
"learning_rate": 0.0002241888163544111,
"loss": 0.0565,
"step": 1350
},
{
"epoch": 2.0234244283324037,
"grad_norm": 0.08331198990345001,
"learning_rate": 0.0002228304535770228,
"loss": 0.0399,
"step": 1360
},
{
"epoch": 2.0382970812418666,
"grad_norm": 0.09547814726829529,
"learning_rate": 0.00022146423038817102,
"loss": 0.0438,
"step": 1370
},
{
"epoch": 2.053169734151329,
"grad_norm": 0.10641171038150787,
"learning_rate": 0.00022009029424095928,
"loss": 0.0384,
"step": 1380
},
{
"epoch": 2.068042387060792,
"grad_norm": 0.10844069719314575,
"learning_rate": 0.0002187087934209318,
"loss": 0.044,
"step": 1390
},
{
"epoch": 2.0829150399702545,
"grad_norm": 0.10333788394927979,
"learning_rate": 0.00021731987703006933,
"loss": 0.041,
"step": 1400
},
{
"epoch": 2.0977876928797174,
"grad_norm": 0.10635129362344742,
"learning_rate": 0.0002159236949706967,
"loss": 0.04,
"step": 1410
},
{
"epoch": 2.11266034578918,
"grad_norm": 0.09010270237922668,
"learning_rate": 0.00021452039792930474,
"loss": 0.0402,
"step": 1420
},
{
"epoch": 2.127532998698643,
"grad_norm": 0.09274252504110336,
"learning_rate": 0.00021311013736028658,
"loss": 0.0384,
"step": 1430
},
{
"epoch": 2.1424056516081054,
"grad_norm": 0.08550871163606644,
"learning_rate": 0.00021169306546959174,
"loss": 0.0428,
"step": 1440
},
{
"epoch": 2.1572783045175683,
"grad_norm": 0.10152186453342438,
"learning_rate": 0.00021026933519829896,
"loss": 0.0442,
"step": 1450
},
{
"epoch": 2.1721509574270312,
"grad_norm": 0.08528181910514832,
"learning_rate": 0.00020883910020610957,
"loss": 0.0375,
"step": 1460
},
{
"epoch": 2.1870236103364937,
"grad_norm": 0.09736708551645279,
"learning_rate": 0.00020740251485476345,
"loss": 0.0387,
"step": 1470
},
{
"epoch": 2.2018962632459567,
"grad_norm": 0.09133671224117279,
"learning_rate": 0.00020595973419137908,
"loss": 0.0373,
"step": 1480
},
{
"epoch": 2.216768916155419,
"grad_norm": 0.08406363427639008,
"learning_rate": 0.00020451091393171964,
"loss": 0.0381,
"step": 1490
},
{
"epoch": 2.231641569064882,
"grad_norm": 0.08503925055265427,
"learning_rate": 0.00020305621044338718,
"loss": 0.0376,
"step": 1500
},
{
"epoch": 2.231641569064882,
"eval_loss": 0.051042910665273666,
"eval_runtime": 212.5441,
"eval_samples_per_second": 22.494,
"eval_steps_per_second": 22.494,
"step": 1500
},
{
"epoch": 2.2465142219743446,
"grad_norm": 0.09201103448867798,
"learning_rate": 0.00020159578072894606,
"loss": 0.0393,
"step": 1510
},
{
"epoch": 2.2613868748838075,
"grad_norm": 0.09499834477901459,
"learning_rate": 0.00020012978240897814,
"loss": 0.0346,
"step": 1520
},
{
"epoch": 2.27625952779327,
"grad_norm": 0.09396501630544662,
"learning_rate": 0.00019865837370507106,
"loss": 0.039,
"step": 1530
},
{
"epoch": 2.291132180702733,
"grad_norm": 0.08983522653579712,
"learning_rate": 0.00019718171342274205,
"loss": 0.0387,
"step": 1540
},
{
"epoch": 2.3060048336121954,
"grad_norm": 0.1118871420621872,
"learning_rate": 0.00019569996093429814,
"loss": 0.0379,
"step": 1550
},
{
"epoch": 2.3208774865216584,
"grad_norm": 0.08434595167636871,
"learning_rate": 0.00019421327616163563,
"loss": 0.0372,
"step": 1560
},
{
"epoch": 2.335750139431121,
"grad_norm": 0.0915694460272789,
"learning_rate": 0.00019272181955898017,
"loss": 0.036,
"step": 1570
},
{
"epoch": 2.350622792340584,
"grad_norm": 0.08459066599607468,
"learning_rate": 0.0001912257520955692,
"loss": 0.0363,
"step": 1580
},
{
"epoch": 2.3654954452500463,
"grad_norm": 0.09195558726787567,
"learning_rate": 0.00018972523523827907,
"loss": 0.0389,
"step": 1590
},
{
"epoch": 2.3803680981595092,
"grad_norm": 0.09830203652381897,
"learning_rate": 0.0001882204309341982,
"loss": 0.0373,
"step": 1600
},
{
"epoch": 2.3952407510689717,
"grad_norm": 0.08541320264339447,
"learning_rate": 0.00018671150159314855,
"loss": 0.0342,
"step": 1610
},
{
"epoch": 2.4101134039784347,
"grad_norm": 0.08817029744386673,
"learning_rate": 0.00018519861007015729,
"loss": 0.0371,
"step": 1620
},
{
"epoch": 2.4249860568878976,
"grad_norm": 0.08839129656553268,
"learning_rate": 0.00018368191964788,
"loss": 0.0355,
"step": 1630
},
{
"epoch": 2.43985870979736,
"grad_norm": 0.08589951694011688,
"learning_rate": 0.00018216159401897812,
"loss": 0.0339,
"step": 1640
},
{
"epoch": 2.454731362706823,
"grad_norm": 0.09998754411935806,
"learning_rate": 0.00018063779726845203,
"loss": 0.0339,
"step": 1650
},
{
"epoch": 2.4696040156162855,
"grad_norm": 0.08363664150238037,
"learning_rate": 0.0001791106938559317,
"loss": 0.0357,
"step": 1660
},
{
"epoch": 2.4844766685257484,
"grad_norm": 0.08930620551109314,
"learning_rate": 0.00017758044859792705,
"loss": 0.0347,
"step": 1670
},
{
"epoch": 2.499349321435211,
"grad_norm": 0.08270251750946045,
"learning_rate": 0.00017604722665003956,
"loss": 0.0332,
"step": 1680
},
{
"epoch": 2.514221974344674,
"grad_norm": 0.09085123986005783,
"learning_rate": 0.00017451119348913744,
"loss": 0.0357,
"step": 1690
},
{
"epoch": 2.5290946272541364,
"grad_norm": 0.0897296592593193,
"learning_rate": 0.00017297251489549638,
"loss": 0.0368,
"step": 1700
},
{
"epoch": 2.5439672801635993,
"grad_norm": 0.07172433286905289,
"learning_rate": 0.000171431356934907,
"loss": 0.0371,
"step": 1710
},
{
"epoch": 2.558839933073062,
"grad_norm": 0.0848449245095253,
"learning_rate": 0.0001698878859407519,
"loss": 0.032,
"step": 1720
},
{
"epoch": 2.5737125859825247,
"grad_norm": 0.08270355314016342,
"learning_rate": 0.00016834226849605371,
"loss": 0.0333,
"step": 1730
},
{
"epoch": 2.588585238891987,
"grad_norm": 0.07130729407072067,
"learning_rate": 0.00016679467141549617,
"loss": 0.0324,
"step": 1740
},
{
"epoch": 2.60345789180145,
"grad_norm": 0.07863139361143112,
"learning_rate": 0.00016524526172742026,
"loss": 0.0295,
"step": 1750
},
{
"epoch": 2.6183305447109126,
"grad_norm": 0.08600688725709915,
"learning_rate": 0.00016369420665579725,
"loss": 0.0342,
"step": 1760
},
{
"epoch": 2.6332031976203756,
"grad_norm": 0.10146727412939072,
"learning_rate": 0.0001621416736021805,
"loss": 0.032,
"step": 1770
},
{
"epoch": 2.648075850529838,
"grad_norm": 0.0812121257185936,
"learning_rate": 0.00016058783012763844,
"loss": 0.0341,
"step": 1780
},
{
"epoch": 2.662948503439301,
"grad_norm": 0.0973149985074997,
"learning_rate": 0.00015903284393466987,
"loss": 0.0313,
"step": 1790
},
{
"epoch": 2.677821156348764,
"grad_norm": 0.0835902988910675,
"learning_rate": 0.00015747688284910457,
"loss": 0.0298,
"step": 1800
},
{
"epoch": 2.6926938092582264,
"grad_norm": 0.07972200214862823,
"learning_rate": 0.00015592011480198992,
"loss": 0.0346,
"step": 1810
},
{
"epoch": 2.707566462167689,
"grad_norm": 0.07594762742519379,
"learning_rate": 0.0001543627078114667,
"loss": 0.0338,
"step": 1820
},
{
"epoch": 2.722439115077152,
"grad_norm": 0.07757771015167236,
"learning_rate": 0.00015280482996463533,
"loss": 0.0315,
"step": 1830
},
{
"epoch": 2.737311767986615,
"grad_norm": 0.06432707607746124,
"learning_rate": 0.00015124664939941457,
"loss": 0.0319,
"step": 1840
},
{
"epoch": 2.7521844208960773,
"grad_norm": 0.07696104794740677,
"learning_rate": 0.00014968833428639474,
"loss": 0.0301,
"step": 1850
},
{
"epoch": 2.7670570738055402,
"grad_norm": 0.07426641881465912,
"learning_rate": 0.00014813005281068774,
"loss": 0.0285,
"step": 1860
},
{
"epoch": 2.7819297267150027,
"grad_norm": 0.0765393078327179,
"learning_rate": 0.00014657197315377495,
"loss": 0.0313,
"step": 1870
},
{
"epoch": 2.7968023796244657,
"grad_norm": 0.07151610404253006,
"learning_rate": 0.00014501426347535598,
"loss": 0.03,
"step": 1880
},
{
"epoch": 2.811675032533928,
"grad_norm": 0.07834175229072571,
"learning_rate": 0.0001434570918951996,
"loss": 0.0286,
"step": 1890
},
{
"epoch": 2.826547685443391,
"grad_norm": 0.09932053834199905,
"learning_rate": 0.00014190062647499892,
"loss": 0.0307,
"step": 1900
},
{
"epoch": 2.8414203383528536,
"grad_norm": 0.08595503121614456,
"learning_rate": 0.00014034503520023297,
"loss": 0.0306,
"step": 1910
},
{
"epoch": 2.8562929912623165,
"grad_norm": 0.08349858224391937,
"learning_rate": 0.00013879048596203636,
"loss": 0.0306,
"step": 1920
},
{
"epoch": 2.871165644171779,
"grad_norm": 0.07905739545822144,
"learning_rate": 0.0001372371465390794,
"loss": 0.0305,
"step": 1930
},
{
"epoch": 2.886038297081242,
"grad_norm": 0.06820567697286606,
"learning_rate": 0.0001356851845794598,
"loss": 0.0276,
"step": 1940
},
{
"epoch": 2.9009109499907044,
"grad_norm": 0.07227708399295807,
"learning_rate": 0.00013413476758260934,
"loss": 0.0267,
"step": 1950
},
{
"epoch": 2.9157836029001674,
"grad_norm": 0.09035148471593857,
"learning_rate": 0.00013258606288121542,
"loss": 0.0287,
"step": 1960
},
{
"epoch": 2.9306562558096303,
"grad_norm": 0.08626757562160492,
"learning_rate": 0.00013103923762316198,
"loss": 0.0298,
"step": 1970
},
{
"epoch": 2.945528908719093,
"grad_norm": 0.0765102431178093,
"learning_rate": 0.00012949445875348902,
"loss": 0.0274,
"step": 1980
},
{
"epoch": 2.9604015616285553,
"grad_norm": 0.08610813319683075,
"learning_rate": 0.00012795189299637483,
"loss": 0.0283,
"step": 1990
},
{
"epoch": 2.975274214538018,
"grad_norm": 0.08020433783531189,
"learning_rate": 0.00012641170683714222,
"loss": 0.0267,
"step": 2000
},
{
"epoch": 2.975274214538018,
"eval_loss": 0.0338360071182251,
"eval_runtime": 212.7237,
"eval_samples_per_second": 22.475,
"eval_steps_per_second": 22.475,
"step": 2000
},
{
"epoch": 2.990146867447481,
"grad_norm": 0.06885667890310287,
"learning_rate": 0.00012487406650428954,
"loss": 0.0277,
"step": 2010
},
{
"epoch": 3.00539133667968,
"grad_norm": 0.07658534497022629,
"learning_rate": 0.00012333913795155053,
"loss": 0.0251,
"step": 2020
},
{
"epoch": 3.020263989589143,
"grad_norm": 0.06449634581804276,
"learning_rate": 0.00012180708683998321,
"loss": 0.0147,
"step": 2030
},
{
"epoch": 3.0351366424986055,
"grad_norm": 0.06312290579080582,
"learning_rate": 0.00012027807852009038,
"loss": 0.0157,
"step": 2040
},
{
"epoch": 3.0500092954080684,
"grad_norm": 0.07343071699142456,
"learning_rate": 0.00011875227801397381,
"loss": 0.0149,
"step": 2050
},
{
"epoch": 3.064881948317531,
"grad_norm": 0.06489036977291107,
"learning_rate": 0.00011722984999752392,
"loss": 0.0155,
"step": 2060
},
{
"epoch": 3.079754601226994,
"grad_norm": 0.06041651591658592,
"learning_rate": 0.00011571095878264658,
"loss": 0.0139,
"step": 2070
},
{
"epoch": 3.094627254136457,
"grad_norm": 0.07048339396715164,
"learning_rate": 0.00011419576829952933,
"loss": 0.014,
"step": 2080
},
{
"epoch": 3.1094999070459193,
"grad_norm": 0.05680292099714279,
"learning_rate": 0.00011268444207894902,
"loss": 0.0133,
"step": 2090
},
{
"epoch": 3.124372559955382,
"grad_norm": 0.0727318823337555,
"learning_rate": 0.00011117714323462186,
"loss": 0.0147,
"step": 2100
},
{
"epoch": 3.1392452128648447,
"grad_norm": 0.054686855524778366,
"learning_rate": 0.00010967403444559963,
"loss": 0.0143,
"step": 2110
},
{
"epoch": 3.1541178657743076,
"grad_norm": 0.05729954317212105,
"learning_rate": 0.00010817527793871143,
"loss": 0.0134,
"step": 2120
},
{
"epoch": 3.16899051868377,
"grad_norm": 0.08314567804336548,
"learning_rate": 0.00010668103547105553,
"loss": 0.0148,
"step": 2130
},
{
"epoch": 3.183863171593233,
"grad_norm": 0.05523039028048515,
"learning_rate": 0.00010519146831254088,
"loss": 0.0129,
"step": 2140
},
{
"epoch": 3.1987358245026956,
"grad_norm": 0.05546917766332626,
"learning_rate": 0.00010370673722848183,
"loss": 0.0139,
"step": 2150
},
{
"epoch": 3.2136084774121585,
"grad_norm": 0.05486704409122467,
"learning_rate": 0.00010222700246224735,
"loss": 0.0135,
"step": 2160
},
{
"epoch": 3.228481130321621,
"grad_norm": 0.05656208097934723,
"learning_rate": 0.00010075242371796585,
"loss": 0.0125,
"step": 2170
},
{
"epoch": 3.243353783231084,
"grad_norm": 0.053801827132701874,
"learning_rate": 9.928316014328916e-05,
"loss": 0.0141,
"step": 2180
},
{
"epoch": 3.2582264361405464,
"grad_norm": 0.061040911823511124,
"learning_rate": 9.781937031221589e-05,
"loss": 0.0136,
"step": 2190
},
{
"epoch": 3.2730990890500093,
"grad_norm": 0.05558522418141365,
"learning_rate": 9.636121220797707e-05,
"loss": 0.0138,
"step": 2200
},
{
"epoch": 3.287971741959472,
"grad_norm": 0.055547308176755905,
"learning_rate": 9.490884320598516e-05,
"loss": 0.0136,
"step": 2210
},
{
"epoch": 3.3028443948689348,
"grad_norm": 0.061592597514390945,
"learning_rate": 9.34624200568492e-05,
"loss": 0.014,
"step": 2220
},
{
"epoch": 3.3177170477783973,
"grad_norm": 0.05287894979119301,
"learning_rate": 9.202209886945698e-05,
"loss": 0.0125,
"step": 2230
},
{
"epoch": 3.33258970068786,
"grad_norm": 0.06365808844566345,
"learning_rate": 9.058803509412646e-05,
"loss": 0.0139,
"step": 2240
},
{
"epoch": 3.347462353597323,
"grad_norm": 0.05474059656262398,
"learning_rate": 8.916038350582876e-05,
"loss": 0.0141,
"step": 2250
},
{
"epoch": 3.3623350065067856,
"grad_norm": 0.054872963577508926,
"learning_rate": 8.773929818748315e-05,
"loss": 0.0135,
"step": 2260
},
{
"epoch": 3.3772076594162486,
"grad_norm": 0.05935963988304138,
"learning_rate": 8.632493251332793e-05,
"loss": 0.0128,
"step": 2270
},
{
"epoch": 3.392080312325711,
"grad_norm": 0.06830602139234543,
"learning_rate": 8.491743913236628e-05,
"loss": 0.0133,
"step": 2280
},
{
"epoch": 3.406952965235174,
"grad_norm": 0.057178862392902374,
"learning_rate": 8.351696995189218e-05,
"loss": 0.0121,
"step": 2290
},
{
"epoch": 3.4218256181446365,
"grad_norm": 0.06827449798583984,
"learning_rate": 8.212367612109464e-05,
"loss": 0.0127,
"step": 2300
},
{
"epoch": 3.4366982710540994,
"grad_norm": 0.04981634393334389,
"learning_rate": 8.073770801474495e-05,
"loss": 0.0132,
"step": 2310
},
{
"epoch": 3.451570923963562,
"grad_norm": 0.052124422043561935,
"learning_rate": 7.935921521696702e-05,
"loss": 0.0129,
"step": 2320
},
{
"epoch": 3.466443576873025,
"grad_norm": 0.05991722270846367,
"learning_rate": 7.798834650509306e-05,
"loss": 0.0128,
"step": 2330
},
{
"epoch": 3.4813162297824873,
"grad_norm": 0.05946414917707443,
"learning_rate": 7.662524983360665e-05,
"loss": 0.0127,
"step": 2340
},
{
"epoch": 3.4961888826919503,
"grad_norm": 0.05650801584124565,
"learning_rate": 7.527007231817389e-05,
"loss": 0.0127,
"step": 2350
},
{
"epoch": 3.5110615356014128,
"grad_norm": 0.04841410368680954,
"learning_rate": 7.392296021976614e-05,
"loss": 0.0122,
"step": 2360
},
{
"epoch": 3.5259341885108757,
"grad_norm": 0.05933946743607521,
"learning_rate": 7.258405892887398e-05,
"loss": 0.0121,
"step": 2370
},
{
"epoch": 3.540806841420338,
"grad_norm": 0.05451497435569763,
"learning_rate": 7.125351294981598e-05,
"loss": 0.0127,
"step": 2380
},
{
"epoch": 3.555679494329801,
"grad_norm": 0.05574881285429001,
"learning_rate": 6.993146588514225e-05,
"loss": 0.0124,
"step": 2390
},
{
"epoch": 3.5705521472392636,
"grad_norm": 0.057919006794691086,
"learning_rate": 6.86180604201361e-05,
"loss": 0.0119,
"step": 2400
},
{
"epoch": 3.5854248001487266,
"grad_norm": 0.051368821412324905,
"learning_rate": 6.731343830741433e-05,
"loss": 0.0126,
"step": 2410
},
{
"epoch": 3.6002974530581895,
"grad_norm": 0.06351654976606369,
"learning_rate": 6.6017740351628e-05,
"loss": 0.0135,
"step": 2420
},
{
"epoch": 3.615170105967652,
"grad_norm": 0.053709421306848526,
"learning_rate": 6.473110639426616e-05,
"loss": 0.0122,
"step": 2430
},
{
"epoch": 3.6300427588771145,
"grad_norm": 0.061445701867341995,
"learning_rate": 6.345367529856254e-05,
"loss": 0.0132,
"step": 2440
},
{
"epoch": 3.6449154117865774,
"grad_norm": 0.0678747370839119,
"learning_rate": 6.218558493450893e-05,
"loss": 0.0125,
"step": 2450
},
{
"epoch": 3.6597880646960403,
"grad_norm": 0.05095114931464195,
"learning_rate": 6.0926972163974775e-05,
"loss": 0.012,
"step": 2460
},
{
"epoch": 3.674660717605503,
"grad_norm": 0.05740583688020706,
"learning_rate": 5.9677972825936254e-05,
"loss": 0.0125,
"step": 2470
},
{
"epoch": 3.6895333705149658,
"grad_norm": 0.05399662256240845,
"learning_rate": 5.8438721721815536e-05,
"loss": 0.0134,
"step": 2480
},
{
"epoch": 3.7044060234244283,
"grad_norm": 0.056056030094623566,
"learning_rate": 5.720935260093177e-05,
"loss": 0.0125,
"step": 2490
},
{
"epoch": 3.719278676333891,
"grad_norm": 0.046866290271282196,
"learning_rate": 5.598999814606618e-05,
"loss": 0.0118,
"step": 2500
},
{
"epoch": 3.719278676333891,
"eval_loss": 0.031009526923298836,
"eval_runtime": 212.3923,
"eval_samples_per_second": 22.51,
"eval_steps_per_second": 22.51,
"step": 2500
},
{
"epoch": 3.7341513292433537,
"grad_norm": 0.046400755643844604,
"learning_rate": 5.4780789959141524e-05,
"loss": 0.0122,
"step": 2510
},
{
"epoch": 3.7490239821528166,
"grad_norm": 0.05211547762155533,
"learning_rate": 5.358185854701909e-05,
"loss": 0.0122,
"step": 2520
},
{
"epoch": 3.763896635062279,
"grad_norm": 0.0429752878844738,
"learning_rate": 5.239333330741298e-05,
"loss": 0.0124,
"step": 2530
},
{
"epoch": 3.778769287971742,
"grad_norm": 0.05008607730269432,
"learning_rate": 5.121534251492486e-05,
"loss": 0.0125,
"step": 2540
},
{
"epoch": 3.7936419408812045,
"grad_norm": 0.046397943049669266,
"learning_rate": 5.004801330719941e-05,
"loss": 0.0111,
"step": 2550
},
{
"epoch": 3.8085145937906675,
"grad_norm": 0.05960022658109665,
"learning_rate": 4.8891471671202675e-05,
"loss": 0.0117,
"step": 2560
},
{
"epoch": 3.82338724670013,
"grad_norm": 0.04353282228112221,
"learning_rate": 4.7745842429624795e-05,
"loss": 0.0119,
"step": 2570
},
{
"epoch": 3.838259899609593,
"grad_norm": 0.05057670921087265,
"learning_rate": 4.661124922740794e-05,
"loss": 0.0116,
"step": 2580
},
{
"epoch": 3.853132552519056,
"grad_norm": 0.04886782541871071,
"learning_rate": 4.548781451840179e-05,
"loss": 0.0113,
"step": 2590
},
{
"epoch": 3.8680052054285183,
"grad_norm": 0.055182382464408875,
"learning_rate": 4.437565955214723e-05,
"loss": 0.0116,
"step": 2600
},
{
"epoch": 3.882877858337981,
"grad_norm": 0.048834457993507385,
"learning_rate": 4.3274904360790505e-05,
"loss": 0.0121,
"step": 2610
},
{
"epoch": 3.8977505112474438,
"grad_norm": 0.05025951564311981,
"learning_rate": 4.218566774612802e-05,
"loss": 0.0112,
"step": 2620
},
{
"epoch": 3.9126231641569067,
"grad_norm": 0.05054251477122307,
"learning_rate": 4.1108067266784746e-05,
"loss": 0.0112,
"step": 2630
},
{
"epoch": 3.927495817066369,
"grad_norm": 0.05326022952795029,
"learning_rate": 4.004221922552608e-05,
"loss": 0.0119,
"step": 2640
},
{
"epoch": 3.9423684699758317,
"grad_norm": 0.05668502673506737,
"learning_rate": 3.898823865670579e-05,
"loss": 0.0114,
"step": 2650
},
{
"epoch": 3.9572411228852946,
"grad_norm": 0.054235439747571945,
"learning_rate": 3.794623931385062e-05,
"loss": 0.0119,
"step": 2660
},
{
"epoch": 3.9721137757947576,
"grad_norm": 0.05231969431042671,
"learning_rate": 3.6916333657383024e-05,
"loss": 0.0108,
"step": 2670
},
{
"epoch": 3.98698642870422,
"grad_norm": 0.057500049471855164,
"learning_rate": 3.5898632842483746e-05,
"loss": 0.011,
"step": 2680
},
{
"epoch": 4.002230897936419,
"grad_norm": 0.04203633964061737,
"learning_rate": 3.489324670709494e-05,
"loss": 0.0113,
"step": 2690
},
{
"epoch": 4.017103550845882,
"grad_norm": 0.029648838564753532,
"learning_rate": 3.390028376006589e-05,
"loss": 0.0059,
"step": 2700
},
{
"epoch": 4.031976203755345,
"grad_norm": 0.03779765963554382,
"learning_rate": 3.2919851169441625e-05,
"loss": 0.006,
"step": 2710
},
{
"epoch": 4.046848856664807,
"grad_norm": 0.040116600692272186,
"learning_rate": 3.195205475089667e-05,
"loss": 0.0058,
"step": 2720
},
{
"epoch": 4.06172150957427,
"grad_norm": 0.030058899894356728,
"learning_rate": 3.099699895631474e-05,
"loss": 0.0056,
"step": 2730
},
{
"epoch": 4.076594162483733,
"grad_norm": 0.03675166517496109,
"learning_rate": 3.0054786862515257e-05,
"loss": 0.0058,
"step": 2740
},
{
"epoch": 4.091466815393196,
"grad_norm": 0.03470413759350777,
"learning_rate": 2.912552016012879e-05,
"loss": 0.0057,
"step": 2750
},
{
"epoch": 4.106339468302658,
"grad_norm": 0.03222460299730301,
"learning_rate": 2.8209299142621522e-05,
"loss": 0.0057,
"step": 2760
},
{
"epoch": 4.121212121212121,
"grad_norm": 0.036458127200603485,
"learning_rate": 2.7306222695471173e-05,
"loss": 0.0056,
"step": 2770
},
{
"epoch": 4.136084774121584,
"grad_norm": 0.035760316997766495,
"learning_rate": 2.641638828549425e-05,
"loss": 0.0055,
"step": 2780
},
{
"epoch": 4.150957427031047,
"grad_norm": 0.04281270503997803,
"learning_rate": 2.5539891950326875e-05,
"loss": 0.0056,
"step": 2790
},
{
"epoch": 4.165830079940509,
"grad_norm": 0.030339548364281654,
"learning_rate": 2.4676828288059558e-05,
"loss": 0.0057,
"step": 2800
},
{
"epoch": 4.180702732849972,
"grad_norm": 0.03753247857093811,
"learning_rate": 2.382729044702748e-05,
"loss": 0.0058,
"step": 2810
},
{
"epoch": 4.195575385759435,
"grad_norm": 0.035988811403512955,
"learning_rate": 2.299137011575738e-05,
"loss": 0.0055,
"step": 2820
},
{
"epoch": 4.210448038668898,
"grad_norm": 0.0344134196639061,
"learning_rate": 2.2169157513071566e-05,
"loss": 0.0057,
"step": 2830
},
{
"epoch": 4.22532069157836,
"grad_norm": 0.03696177527308464,
"learning_rate": 2.136074137835107e-05,
"loss": 0.0056,
"step": 2840
},
{
"epoch": 4.240193344487823,
"grad_norm": 0.03733756020665169,
"learning_rate": 2.056620896195804e-05,
"loss": 0.0057,
"step": 2850
},
{
"epoch": 4.255065997397286,
"grad_norm": 0.03630942478775978,
"learning_rate": 1.978564601581919e-05,
"loss": 0.0056,
"step": 2860
},
{
"epoch": 4.269938650306749,
"grad_norm": 0.03577449545264244,
"learning_rate": 1.9019136784170635e-05,
"loss": 0.0055,
"step": 2870
},
{
"epoch": 4.284811303216211,
"grad_norm": 0.03209745138883591,
"learning_rate": 1.82667639944657e-05,
"loss": 0.0054,
"step": 2880
},
{
"epoch": 4.299683956125674,
"grad_norm": 0.03668665885925293,
"learning_rate": 1.752860884844646e-05,
"loss": 0.0055,
"step": 2890
},
{
"epoch": 4.314556609035137,
"grad_norm": 0.03498975560069084,
"learning_rate": 1.680475101337959e-05,
"loss": 0.0055,
"step": 2900
},
{
"epoch": 4.3294292619445995,
"grad_norm": 0.04088146984577179,
"learning_rate": 1.60952686134583e-05,
"loss": 0.0055,
"step": 2910
},
{
"epoch": 4.3443019148540625,
"grad_norm": 0.035557616502046585,
"learning_rate": 1.5400238221370413e-05,
"loss": 0.0056,
"step": 2920
},
{
"epoch": 4.3591745677635245,
"grad_norm": 0.03443196415901184,
"learning_rate": 1.4719734850034277e-05,
"loss": 0.0056,
"step": 2930
},
{
"epoch": 4.3740472206729875,
"grad_norm": 0.03481742739677429,
"learning_rate": 1.4053831944502508e-05,
"loss": 0.0057,
"step": 2940
},
{
"epoch": 4.38891987358245,
"grad_norm": 0.03648516163229942,
"learning_rate": 1.340260137403557e-05,
"loss": 0.0053,
"step": 2950
},
{
"epoch": 4.403792526491913,
"grad_norm": 0.03400832787156105,
"learning_rate": 1.2766113424344814e-05,
"loss": 0.0055,
"step": 2960
},
{
"epoch": 4.418665179401375,
"grad_norm": 0.03558880090713501,
"learning_rate": 1.21444367900069e-05,
"loss": 0.0055,
"step": 2970
},
{
"epoch": 4.433537832310838,
"grad_norm": 0.035319775342941284,
"learning_rate": 1.1537638567049729e-05,
"loss": 0.0055,
"step": 2980
},
{
"epoch": 4.448410485220301,
"grad_norm": 0.03432595729827881,
"learning_rate": 1.0945784245710848e-05,
"loss": 0.0054,
"step": 2990
},
{
"epoch": 4.463283138129764,
"grad_norm": 0.03571225702762604,
"learning_rate": 1.036893770336938e-05,
"loss": 0.0055,
"step": 3000
},
{
"epoch": 4.463283138129764,
"eval_loss": 0.03200867399573326,
"eval_runtime": 212.5457,
"eval_samples_per_second": 22.494,
"eval_steps_per_second": 22.494,
"step": 3000
},
{
"epoch": 4.478155791039226,
"grad_norm": 0.040391724556684494,
"learning_rate": 9.807161197651742e-06,
"loss": 0.0056,
"step": 3010
},
{
"epoch": 4.493028443948689,
"grad_norm": 0.03410281240940094,
"learning_rate": 9.260515359712517e-06,
"loss": 0.0055,
"step": 3020
},
{
"epoch": 4.507901096858152,
"grad_norm": 0.03447275608778,
"learning_rate": 8.729059187690479e-06,
"loss": 0.0054,
"step": 3030
},
{
"epoch": 4.522773749767615,
"grad_norm": 0.032652657479047775,
"learning_rate": 8.212850040341273e-06,
"loss": 0.0055,
"step": 3040
},
{
"epoch": 4.537646402677078,
"grad_norm": 0.035828616470098495,
"learning_rate": 7.711943630846684e-06,
"loss": 0.0053,
"step": 3050
},
{
"epoch": 4.55251905558654,
"grad_norm": 0.03351854532957077,
"learning_rate": 7.226394020801645e-06,
"loss": 0.0054,
"step": 3060
},
{
"epoch": 4.567391708496003,
"grad_norm": 0.03872072696685791,
"learning_rate": 6.7562536143796254e-06,
"loss": 0.0056,
"step": 3070
},
{
"epoch": 4.582264361405466,
"grad_norm": 0.03518550843000412,
"learning_rate": 6.301573152676664e-06,
"loss": 0.0054,
"step": 3080
},
{
"epoch": 4.597137014314928,
"grad_norm": 0.0351685993373394,
"learning_rate": 5.862401708235076e-06,
"loss": 0.0052,
"step": 3090
},
{
"epoch": 4.612009667224391,
"grad_norm": 0.0348668210208416,
"learning_rate": 5.438786679747081e-06,
"loss": 0.0055,
"step": 3100
},
{
"epoch": 4.626882320133854,
"grad_norm": 0.03660331293940544,
"learning_rate": 5.030773786939319e-06,
"loss": 0.0055,
"step": 3110
},
{
"epoch": 4.641754973043317,
"grad_norm": 0.04046601429581642,
"learning_rate": 4.638407065638322e-06,
"loss": 0.0054,
"step": 3120
},
{
"epoch": 4.65662762595278,
"grad_norm": 0.03230154886841774,
"learning_rate": 4.261728863017827e-06,
"loss": 0.0054,
"step": 3130
},
{
"epoch": 4.671500278862242,
"grad_norm": 0.034297142177820206,
"learning_rate": 3.900779833028472e-06,
"loss": 0.0054,
"step": 3140
},
{
"epoch": 4.686372931771705,
"grad_norm": 0.03240946680307388,
"learning_rate": 3.5555989320099952e-06,
"loss": 0.0053,
"step": 3150
},
{
"epoch": 4.701245584681168,
"grad_norm": 0.04137023165822029,
"learning_rate": 3.2262234144868116e-06,
"loss": 0.0054,
"step": 3160
},
{
"epoch": 4.7161182375906305,
"grad_norm": 0.030783316120505333,
"learning_rate": 2.912688829147214e-06,
"loss": 0.0052,
"step": 3170
},
{
"epoch": 4.730990890500093,
"grad_norm": 0.03588159382343292,
"learning_rate": 2.6150290150067588e-06,
"loss": 0.0055,
"step": 3180
},
{
"epoch": 4.7458635434095555,
"grad_norm": 0.03300805762410164,
"learning_rate": 2.3332760977559873e-06,
"loss": 0.0053,
"step": 3190
},
{
"epoch": 4.7607361963190185,
"grad_norm": 0.03986676409840584,
"learning_rate": 2.0674604862932654e-06,
"loss": 0.0055,
"step": 3200
},
{
"epoch": 4.775608849228481,
"grad_norm": 0.03252493590116501,
"learning_rate": 1.8176108694427927e-06,
"loss": 0.0052,
"step": 3210
},
{
"epoch": 4.790481502137943,
"grad_norm": 0.03938417136669159,
"learning_rate": 1.583754212858329e-06,
"loss": 0.0054,
"step": 3220
},
{
"epoch": 4.805354155047406,
"grad_norm": 0.03552339971065521,
"learning_rate": 1.3659157561127732e-06,
"loss": 0.0057,
"step": 3230
},
{
"epoch": 4.820226807956869,
"grad_norm": 0.03480495885014534,
"learning_rate": 1.1641190099741904e-06,
"loss": 0.0053,
"step": 3240
},
{
"epoch": 4.835099460866332,
"grad_norm": 0.03451026231050491,
"learning_rate": 9.783857538683603e-07,
"loss": 0.0053,
"step": 3250
},
{
"epoch": 4.849972113775795,
"grad_norm": 0.033308371901512146,
"learning_rate": 8.087360335281235e-07,
"loss": 0.0055,
"step": 3260
},
{
"epoch": 4.864844766685257,
"grad_norm": 0.035610370337963104,
"learning_rate": 6.551881588299279e-07,
"loss": 0.0054,
"step": 3270
},
{
"epoch": 4.87971741959472,
"grad_norm": 0.030910024419426918,
"learning_rate": 5.177587018176777e-07,
"loss": 0.0054,
"step": 3280
},
{
"epoch": 4.894590072504183,
"grad_norm": 0.034942276775836945,
"learning_rate": 3.964624949141626e-07,
"loss": 0.0054,
"step": 3290
},
{
"epoch": 4.909462725413646,
"grad_norm": 0.03491232544183731,
"learning_rate": 2.913126293202228e-07,
"loss": 0.0053,
"step": 3300
},
{
"epoch": 4.924335378323108,
"grad_norm": 0.0331818163394928,
"learning_rate": 2.0232045360184523e-07,
"loss": 0.0051,
"step": 3310
},
{
"epoch": 4.939208031232571,
"grad_norm": 0.034393060952425,
"learning_rate": 1.2949557246537678e-07,
"loss": 0.0053,
"step": 3320
},
{
"epoch": 4.954080684142034,
"grad_norm": 0.03940508887171745,
"learning_rate": 7.284584572085361e-08,
"loss": 0.0052,
"step": 3330
},
{
"epoch": 4.968953337051497,
"grad_norm": 0.03125544637441635,
"learning_rate": 3.237738743372964e-08,
"loss": 0.0052,
"step": 3340
},
{
"epoch": 4.983825989960959,
"grad_norm": 0.03558258339762688,
"learning_rate": 8.094565265054365e-09,
"loss": 0.0054,
"step": 3350
},
{
"epoch": 4.998698642870422,
"grad_norm": 0.03360743075609207,
"learning_rate": 0.0,
"loss": 0.0054,
"step": 3360
},
{
"epoch": 4.998698642870422,
"step": 3360,
"total_flos": 5.14290499398402e+18,
"train_loss": 0.19145491501161208,
"train_runtime": 31931.328,
"train_samples_per_second": 6.737,
"train_steps_per_second": 0.105
}
],
"logging_steps": 10,
"max_steps": 3360,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.14290499398402e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}