{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 76.92307692307692,
"eval_steps": 500,
"global_step": 2500,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.3076923076923077,
"grad_norm": 5.159167766571045,
"learning_rate": 7.8125e-08,
"loss": 4.2745,
"step": 10
},
{
"epoch": 0.6153846153846154,
"grad_norm": 5.482397079467773,
"learning_rate": 1.5625e-07,
"loss": 4.2677,
"step": 20
},
{
"epoch": 0.9230769230769231,
"grad_norm": 5.341490268707275,
"learning_rate": 2.3437500000000003e-07,
"loss": 4.2277,
"step": 30
},
{
"epoch": 1.2307692307692308,
"grad_norm": 5.534327983856201,
"learning_rate": 3.125e-07,
"loss": 4.2715,
"step": 40
},
{
"epoch": 1.5384615384615383,
"grad_norm": 5.407191753387451,
"learning_rate": 3.90625e-07,
"loss": 4.2371,
"step": 50
},
{
"epoch": 1.8461538461538463,
"grad_norm": 5.150612831115723,
"learning_rate": 4.6875000000000006e-07,
"loss": 4.1953,
"step": 60
},
{
"epoch": 2.1538461538461537,
"grad_norm": 5.620100021362305,
"learning_rate": 5.468750000000001e-07,
"loss": 4.2908,
"step": 70
},
{
"epoch": 2.4615384615384617,
"grad_norm": 5.80696964263916,
"learning_rate": 6.25e-07,
"loss": 4.2603,
"step": 80
},
{
"epoch": 2.769230769230769,
"grad_norm": 5.940330982208252,
"learning_rate": 7.03125e-07,
"loss": 4.2049,
"step": 90
},
{
"epoch": 3.076923076923077,
"grad_norm": 5.8099141120910645,
"learning_rate": 7.8125e-07,
"loss": 4.1638,
"step": 100
},
{
"epoch": 3.3846153846153846,
"grad_norm": 5.388895511627197,
"learning_rate": 8.59375e-07,
"loss": 4.1409,
"step": 110
},
{
"epoch": 3.6923076923076925,
"grad_norm": 5.919471263885498,
"learning_rate": 9.375000000000001e-07,
"loss": 4.1658,
"step": 120
},
{
"epoch": 4.0,
"grad_norm": 13097.25390625,
"learning_rate": 1.0156250000000001e-06,
"loss": 4.1936,
"step": 130
},
{
"epoch": 4.3076923076923075,
"grad_norm": 4.802829265594482,
"learning_rate": 1.0937500000000001e-06,
"loss": 4.021,
"step": 140
},
{
"epoch": 4.615384615384615,
"grad_norm": 5.628367900848389,
"learning_rate": 1.1718750000000001e-06,
"loss": 4.019,
"step": 150
},
{
"epoch": 4.923076923076923,
"grad_norm": 5.601304531097412,
"learning_rate": 1.25e-06,
"loss": 3.9075,
"step": 160
},
{
"epoch": 5.230769230769231,
"grad_norm": 5.029603004455566,
"learning_rate": 1.328125e-06,
"loss": 3.8952,
"step": 170
},
{
"epoch": 5.538461538461538,
"grad_norm": 5.625918388366699,
"learning_rate": 1.40625e-06,
"loss": 3.7809,
"step": 180
},
{
"epoch": 5.846153846153846,
"grad_norm": 4.302311420440674,
"learning_rate": 1.484375e-06,
"loss": 3.5823,
"step": 190
},
{
"epoch": 6.153846153846154,
"grad_norm": 4.250982284545898,
"learning_rate": 1.5625e-06,
"loss": 3.5112,
"step": 200
},
{
"epoch": 6.461538461538462,
"grad_norm": 3.137059211730957,
"learning_rate": 1.640625e-06,
"loss": 3.3867,
"step": 210
},
{
"epoch": 6.769230769230769,
"grad_norm": 3.2033724784851074,
"learning_rate": 1.71875e-06,
"loss": 3.2788,
"step": 220
},
{
"epoch": 7.076923076923077,
"grad_norm": 2.8167309761047363,
"learning_rate": 1.796875e-06,
"loss": 3.1566,
"step": 230
},
{
"epoch": 7.384615384615385,
"grad_norm": 2.167381525039673,
"learning_rate": 1.8750000000000003e-06,
"loss": 2.9642,
"step": 240
},
{
"epoch": 7.6923076923076925,
"grad_norm": 2.2277944087982178,
"learning_rate": 1.953125e-06,
"loss": 2.8886,
"step": 250
},
{
"epoch": 8.0,
"grad_norm": 60692.35546875,
"learning_rate": 2.0312500000000002e-06,
"loss": 2.7726,
"step": 260
},
{
"epoch": 8.307692307692308,
"grad_norm": 1.6853564977645874,
"learning_rate": 2.109375e-06,
"loss": 2.7062,
"step": 270
},
{
"epoch": 8.615384615384615,
"grad_norm": 1.5454535484313965,
"learning_rate": 2.1875000000000002e-06,
"loss": 2.5508,
"step": 280
},
{
"epoch": 8.923076923076923,
"grad_norm": 1.2037118673324585,
"learning_rate": 2.265625e-06,
"loss": 2.4639,
"step": 290
},
{
"epoch": 9.23076923076923,
"grad_norm": 1.0261240005493164,
"learning_rate": 2.3437500000000002e-06,
"loss": 2.4103,
"step": 300
},
{
"epoch": 9.538461538461538,
"grad_norm": 0.9358808994293213,
"learning_rate": 2.421875e-06,
"loss": 2.3032,
"step": 310
},
{
"epoch": 9.846153846153847,
"grad_norm": 0.7383924722671509,
"learning_rate": 2.5e-06,
"loss": 2.3002,
"step": 320
},
{
"epoch": 10.153846153846153,
"grad_norm": 0.685702383518219,
"learning_rate": 2.5781250000000004e-06,
"loss": 2.2148,
"step": 330
},
{
"epoch": 10.461538461538462,
"grad_norm": 0.6645168662071228,
"learning_rate": 2.65625e-06,
"loss": 2.1989,
"step": 340
},
{
"epoch": 10.76923076923077,
"grad_norm": 0.7011102437973022,
"learning_rate": 2.7343750000000004e-06,
"loss": 2.1496,
"step": 350
},
{
"epoch": 11.076923076923077,
"grad_norm": 0.5761039853096008,
"learning_rate": 2.8125e-06,
"loss": 2.1465,
"step": 360
},
{
"epoch": 11.384615384615385,
"grad_norm": 0.562958836555481,
"learning_rate": 2.8906250000000004e-06,
"loss": 2.0871,
"step": 370
},
{
"epoch": 11.692307692307692,
"grad_norm": 0.5713663101196289,
"learning_rate": 2.96875e-06,
"loss": 2.0732,
"step": 380
},
{
"epoch": 12.0,
"grad_norm": 16827.7421875,
"learning_rate": 3.0468750000000004e-06,
"loss": 2.0684,
"step": 390
},
{
"epoch": 12.307692307692308,
"grad_norm": 0.5909234285354614,
"learning_rate": 3.125e-06,
"loss": 2.0554,
"step": 400
},
{
"epoch": 12.615384615384615,
"grad_norm": 0.5658320784568787,
"learning_rate": 3.2031250000000004e-06,
"loss": 1.9929,
"step": 410
},
{
"epoch": 12.923076923076923,
"grad_norm": 0.5382928848266602,
"learning_rate": 3.28125e-06,
"loss": 1.9913,
"step": 420
},
{
"epoch": 13.23076923076923,
"grad_norm": 0.7106872200965881,
"learning_rate": 3.3593750000000003e-06,
"loss": 1.9562,
"step": 430
},
{
"epoch": 13.538461538461538,
"grad_norm": 0.5338084697723389,
"learning_rate": 3.4375e-06,
"loss": 1.9503,
"step": 440
},
{
"epoch": 13.846153846153847,
"grad_norm": 0.524355411529541,
"learning_rate": 3.5156250000000003e-06,
"loss": 1.9499,
"step": 450
},
{
"epoch": 14.153846153846153,
"grad_norm": 0.5286893248558044,
"learning_rate": 3.59375e-06,
"loss": 1.9225,
"step": 460
},
{
"epoch": 14.461538461538462,
"grad_norm": 0.5003280639648438,
"learning_rate": 3.6718750000000003e-06,
"loss": 1.8688,
"step": 470
},
{
"epoch": 14.76923076923077,
"grad_norm": 0.5744629502296448,
"learning_rate": 3.7500000000000005e-06,
"loss": 1.8728,
"step": 480
},
{
"epoch": 15.076923076923077,
"grad_norm": 0.47028881311416626,
"learning_rate": 3.828125000000001e-06,
"loss": 1.818,
"step": 490
},
{
"epoch": 15.384615384615385,
"grad_norm": 0.5076237320899963,
"learning_rate": 3.90625e-06,
"loss": 1.8375,
"step": 500
},
{
"epoch": 15.692307692307692,
"grad_norm": 0.5628578066825867,
"learning_rate": 3.984375e-06,
"loss": 1.823,
"step": 510
},
{
"epoch": 16.0,
"grad_norm": 35255.69921875,
"learning_rate": 4.0625000000000005e-06,
"loss": 1.8249,
"step": 520
},
{
"epoch": 16.307692307692307,
"grad_norm": 0.5536410212516785,
"learning_rate": 4.140625000000001e-06,
"loss": 1.7998,
"step": 530
},
{
"epoch": 16.615384615384617,
"grad_norm": 0.6619865298271179,
"learning_rate": 4.21875e-06,
"loss": 1.775,
"step": 540
},
{
"epoch": 16.923076923076923,
"grad_norm": 0.6591458320617676,
"learning_rate": 4.296875e-06,
"loss": 1.7605,
"step": 550
},
{
"epoch": 17.23076923076923,
"grad_norm": 0.8084316253662109,
"learning_rate": 4.3750000000000005e-06,
"loss": 1.7278,
"step": 560
},
{
"epoch": 17.53846153846154,
"grad_norm": 0.6782126426696777,
"learning_rate": 4.453125000000001e-06,
"loss": 1.7261,
"step": 570
},
{
"epoch": 17.846153846153847,
"grad_norm": 0.6113712191581726,
"learning_rate": 4.53125e-06,
"loss": 1.7178,
"step": 580
},
{
"epoch": 18.153846153846153,
"grad_norm": 0.6165570020675659,
"learning_rate": 4.609375e-06,
"loss": 1.6865,
"step": 590
},
{
"epoch": 18.46153846153846,
"grad_norm": 0.7881684303283691,
"learning_rate": 4.6875000000000004e-06,
"loss": 1.6793,
"step": 600
},
{
"epoch": 18.76923076923077,
"grad_norm": 0.672874927520752,
"learning_rate": 4.765625000000001e-06,
"loss": 1.6661,
"step": 610
},
{
"epoch": 19.076923076923077,
"grad_norm": 0.730848491191864,
"learning_rate": 4.84375e-06,
"loss": 1.6431,
"step": 620
},
{
"epoch": 19.384615384615383,
"grad_norm": 0.6730669736862183,
"learning_rate": 4.921875e-06,
"loss": 1.6155,
"step": 630
},
{
"epoch": 19.692307692307693,
"grad_norm": 0.6560551524162292,
"learning_rate": 5e-06,
"loss": 1.6395,
"step": 640
},
{
"epoch": 20.0,
"grad_norm": 4129.79052734375,
"learning_rate": 4.999811754597862e-06,
"loss": 1.5897,
"step": 650
},
{
"epoch": 20.307692307692307,
"grad_norm": 0.6594120860099792,
"learning_rate": 4.999247046740511e-06,
"loss": 1.5829,
"step": 660
},
{
"epoch": 20.615384615384617,
"grad_norm": 0.893703818321228,
"learning_rate": 4.998305961470874e-06,
"loss": 1.558,
"step": 670
},
{
"epoch": 20.923076923076923,
"grad_norm": 0.7144062519073486,
"learning_rate": 4.996988640512931e-06,
"loss": 1.5373,
"step": 680
},
{
"epoch": 21.23076923076923,
"grad_norm": 0.8453310132026672,
"learning_rate": 4.995295282250373e-06,
"loss": 1.4909,
"step": 690
},
{
"epoch": 21.53846153846154,
"grad_norm": 0.8293094635009766,
"learning_rate": 4.993226141696726e-06,
"loss": 1.4967,
"step": 700
},
{
"epoch": 21.846153846153847,
"grad_norm": 0.7181118726730347,
"learning_rate": 4.990781530456945e-06,
"loss": 1.4857,
"step": 710
},
{
"epoch": 22.153846153846153,
"grad_norm": 0.844008207321167,
"learning_rate": 4.987961816680493e-06,
"loss": 1.495,
"step": 720
},
{
"epoch": 22.46153846153846,
"grad_norm": 0.8761960864067078,
"learning_rate": 4.984767425005891e-06,
"loss": 1.4224,
"step": 730
},
{
"epoch": 22.76923076923077,
"grad_norm": 0.8939017057418823,
"learning_rate": 4.981198836496776e-06,
"loss": 1.4063,
"step": 740
},
{
"epoch": 23.076923076923077,
"grad_norm": 0.828834593296051,
"learning_rate": 4.97725658856945e-06,
"loss": 1.429,
"step": 750
},
{
"epoch": 23.384615384615383,
"grad_norm": 0.945023775100708,
"learning_rate": 4.972941274911953e-06,
"loss": 1.3588,
"step": 760
},
{
"epoch": 23.692307692307693,
"grad_norm": 1.122968316078186,
"learning_rate": 4.968253545394647e-06,
"loss": 1.3309,
"step": 770
},
{
"epoch": 24.0,
"grad_norm": 93171.5546875,
"learning_rate": 4.9631941059723535e-06,
"loss": 1.335,
"step": 780
},
{
"epoch": 24.307692307692307,
"grad_norm": 1.0523442029953003,
"learning_rate": 4.957763718578042e-06,
"loss": 1.3347,
"step": 790
},
{
"epoch": 24.615384615384617,
"grad_norm": 1.179585576057434,
"learning_rate": 4.9519632010080765e-06,
"loss": 1.2618,
"step": 800
},
{
"epoch": 24.923076923076923,
"grad_norm": 0.854586124420166,
"learning_rate": 4.9457934267990695e-06,
"loss": 1.2321,
"step": 810
},
{
"epoch": 25.23076923076923,
"grad_norm": 1.0027631521224976,
"learning_rate": 4.939255325096322e-06,
"loss": 1.2041,
"step": 820
},
{
"epoch": 25.53846153846154,
"grad_norm": 1.045911192893982,
"learning_rate": 4.932349880513901e-06,
"loss": 1.1746,
"step": 830
},
{
"epoch": 25.846153846153847,
"grad_norm": 0.9622510075569153,
"learning_rate": 4.925078132986361e-06,
"loss": 1.1846,
"step": 840
},
{
"epoch": 26.153846153846153,
"grad_norm": 1.2767952680587769,
"learning_rate": 4.917441177612131e-06,
"loss": 1.1605,
"step": 850
},
{
"epoch": 26.46153846153846,
"grad_norm": 0.9456568360328674,
"learning_rate": 4.9094401644886e-06,
"loss": 1.1373,
"step": 860
},
{
"epoch": 26.76923076923077,
"grad_norm": 1.3368542194366455,
"learning_rate": 4.901076298538915e-06,
"loss": 1.0879,
"step": 870
},
{
"epoch": 27.076923076923077,
"grad_norm": 1.4178813695907593,
"learning_rate": 4.8923508393305224e-06,
"loss": 1.0359,
"step": 880
},
{
"epoch": 27.384615384615383,
"grad_norm": 1.4570515155792236,
"learning_rate": 4.883265100885484e-06,
"loss": 1.0313,
"step": 890
},
{
"epoch": 27.692307692307693,
"grad_norm": 1.415483832359314,
"learning_rate": 4.873820451482592e-06,
"loss": 1.034,
"step": 900
},
{
"epoch": 28.0,
"grad_norm": 1146730.75,
"learning_rate": 4.864018313451304e-06,
"loss": 0.8984,
"step": 910
},
{
"epoch": 28.307692307692307,
"grad_norm": 1.3334933519363403,
"learning_rate": 4.8538601629575525e-06,
"loss": 0.9298,
"step": 920
},
{
"epoch": 28.615384615384617,
"grad_norm": 1.1457735300064087,
"learning_rate": 4.843347529781438e-06,
"loss": 0.94,
"step": 930
},
{
"epoch": 28.923076923076923,
"grad_norm": 1.3328795433044434,
"learning_rate": 4.832481997086848e-06,
"loss": 0.8961,
"step": 940
},
{
"epoch": 29.23076923076923,
"grad_norm": 1.455394983291626,
"learning_rate": 4.82126520118304e-06,
"loss": 0.9099,
"step": 950
},
{
"epoch": 29.53846153846154,
"grad_norm": 1.7109308242797852,
"learning_rate": 4.809698831278217e-06,
"loss": 0.8214,
"step": 960
},
{
"epoch": 29.846153846153847,
"grad_norm": 1.392449140548706,
"learning_rate": 4.797784629225145e-06,
"loss": 0.813,
"step": 970
},
{
"epoch": 30.153846153846153,
"grad_norm": 1.8153551816940308,
"learning_rate": 4.7855243892588275e-06,
"loss": 0.7938,
"step": 980
},
{
"epoch": 30.46153846153846,
"grad_norm": 1.7134582996368408,
"learning_rate": 4.772919957726306e-06,
"loss": 0.8109,
"step": 990
},
{
"epoch": 30.76923076923077,
"grad_norm": 1.7986242771148682,
"learning_rate": 4.759973232808609e-06,
"loss": 0.6657,
"step": 1000
},
{
"epoch": 31.076923076923077,
"grad_norm": 1.5258599519729614,
"learning_rate": 4.746686164234885e-06,
"loss": 0.7157,
"step": 1010
},
{
"epoch": 31.384615384615383,
"grad_norm": 1.645753264427185,
"learning_rate": 4.7330607529887885e-06,
"loss": 0.6876,
"step": 1020
},
{
"epoch": 31.692307692307693,
"grad_norm": 1.4284254312515259,
"learning_rate": 4.719099051007136e-06,
"loss": 0.6913,
"step": 1030
},
{
"epoch": 32.0,
"grad_norm": 322668.5625,
"learning_rate": 4.704803160870888e-06,
"loss": 0.6297,
"step": 1040
},
{
"epoch": 32.30769230769231,
"grad_norm": 1.519518256187439,
"learning_rate": 4.6901752354885166e-06,
"loss": 0.617,
"step": 1050
},
{
"epoch": 32.61538461538461,
"grad_norm": 1.3290555477142334,
"learning_rate": 4.675217477771779e-06,
"loss": 0.6476,
"step": 1060
},
{
"epoch": 32.92307692307692,
"grad_norm": 1.5469777584075928,
"learning_rate": 4.659932140303967e-06,
"loss": 0.5633,
"step": 1070
},
{
"epoch": 33.23076923076923,
"grad_norm": 1.5148197412490845,
"learning_rate": 4.644321525000681e-06,
"loss": 0.5595,
"step": 1080
},
{
"epoch": 33.53846153846154,
"grad_norm": 1.7467495203018188,
"learning_rate": 4.628387982763163e-06,
"loss": 0.5691,
"step": 1090
},
{
"epoch": 33.84615384615385,
"grad_norm": 1.9587457180023193,
"learning_rate": 4.612133913124268e-06,
"loss": 0.4897,
"step": 1100
},
{
"epoch": 34.15384615384615,
"grad_norm": 1.3276329040527344,
"learning_rate": 4.595561763887095e-06,
"loss": 0.5177,
"step": 1110
},
{
"epoch": 34.46153846153846,
"grad_norm": 1.5996724367141724,
"learning_rate": 4.578674030756364e-06,
"loss": 0.4505,
"step": 1120
},
{
"epoch": 34.76923076923077,
"grad_norm": 1.8829643726348877,
"learning_rate": 4.561473256962564e-06,
"loss": 0.5024,
"step": 1130
},
{
"epoch": 35.07692307692308,
"grad_norm": 2.2122015953063965,
"learning_rate": 4.54396203287896e-06,
"loss": 0.466,
"step": 1140
},
{
"epoch": 35.38461538461539,
"grad_norm": 1.784812092781067,
"learning_rate": 4.526142995631488e-06,
"loss": 0.4057,
"step": 1150
},
{
"epoch": 35.69230769230769,
"grad_norm": 1.3397154808044434,
"learning_rate": 4.508018828701613e-06,
"loss": 0.4513,
"step": 1160
},
{
"epoch": 36.0,
"grad_norm": 368474.65625,
"learning_rate": 4.489592261522209e-06,
"loss": 0.3743,
"step": 1170
},
{
"epoch": 36.30769230769231,
"grad_norm": 1.8128660917282104,
"learning_rate": 4.470866069066516e-06,
"loss": 0.3919,
"step": 1180
},
{
"epoch": 36.61538461538461,
"grad_norm": 1.7692885398864746,
"learning_rate": 4.451843071430236e-06,
"loss": 0.3589,
"step": 1190
},
{
"epoch": 36.92307692307692,
"grad_norm": 1.45668363571167,
"learning_rate": 4.432526133406843e-06,
"loss": 0.3578,
"step": 1200
},
{
"epoch": 37.23076923076923,
"grad_norm": 1.255777359008789,
"learning_rate": 4.412918164056148e-06,
"loss": 0.3436,
"step": 1210
},
{
"epoch": 37.53846153846154,
"grad_norm": 1.4039700031280518,
"learning_rate": 4.393022116266212e-06,
"loss": 0.3054,
"step": 1220
},
{
"epoch": 37.84615384615385,
"grad_norm": 2.489480495452881,
"learning_rate": 4.372840986308649e-06,
"loss": 0.3145,
"step": 1230
},
{
"epoch": 38.15384615384615,
"grad_norm": 2.235471725463867,
"learning_rate": 4.352377813387398e-06,
"loss": 0.3915,
"step": 1240
},
{
"epoch": 38.46153846153846,
"grad_norm": 1.5027036666870117,
"learning_rate": 4.331635679181032e-06,
"loss": 0.2823,
"step": 1250
},
{
"epoch": 38.76923076923077,
"grad_norm": 1.8041187524795532,
"learning_rate": 4.3106177073786684e-06,
"loss": 0.2837,
"step": 1260
},
{
"epoch": 39.07692307692308,
"grad_norm": 1.359180212020874,
"learning_rate": 4.289327063209548e-06,
"loss": 0.277,
"step": 1270
},
{
"epoch": 39.38461538461539,
"grad_norm": 1.5031533241271973,
"learning_rate": 4.267766952966369e-06,
"loss": 0.2959,
"step": 1280
},
{
"epoch": 39.69230769230769,
"grad_norm": 1.5086129903793335,
"learning_rate": 4.245940623522433e-06,
"loss": 0.236,
"step": 1290
},
{
"epoch": 40.0,
"grad_norm": 8115298.5,
"learning_rate": 4.223851361842668e-06,
"loss": 0.2243,
"step": 1300
},
{
"epoch": 40.30769230769231,
"grad_norm": 1.59078049659729,
"learning_rate": 4.201502494488633e-06,
"loss": 0.2423,
"step": 1310
},
{
"epoch": 40.61538461538461,
"grad_norm": 1.6530065536499023,
"learning_rate": 4.178897387117547e-06,
"loss": 0.2217,
"step": 1320
},
{
"epoch": 40.92307692307692,
"grad_norm": 1.6133264303207397,
"learning_rate": 4.15603944397543e-06,
"loss": 0.2185,
"step": 1330
},
{
"epoch": 41.23076923076923,
"grad_norm": 1.7328965663909912,
"learning_rate": 4.132932107384442e-06,
"loss": 0.2183,
"step": 1340
},
{
"epoch": 41.53846153846154,
"grad_norm": 1.5189462900161743,
"learning_rate": 4.109578857224478e-06,
"loss": 0.2167,
"step": 1350
},
{
"epoch": 41.84615384615385,
"grad_norm": 1.807603359222412,
"learning_rate": 4.085983210409114e-06,
"loss": 0.193,
"step": 1360
},
{
"epoch": 42.15384615384615,
"grad_norm": 1.4981666803359985,
"learning_rate": 4.062148720355967e-06,
"loss": 0.1724,
"step": 1370
},
{
"epoch": 42.46153846153846,
"grad_norm": 1.40492844581604,
"learning_rate": 4.038078976451567e-06,
"loss": 0.1658,
"step": 1380
},
{
"epoch": 42.76923076923077,
"grad_norm": 1.4978539943695068,
"learning_rate": 4.013777603510815e-06,
"loss": 0.1555,
"step": 1390
},
{
"epoch": 43.07692307692308,
"grad_norm": 1.2119354009628296,
"learning_rate": 3.989248261231084e-06,
"loss": 0.1833,
"step": 1400
},
{
"epoch": 43.38461538461539,
"grad_norm": 1.857128381729126,
"learning_rate": 3.964494643641097e-06,
"loss": 0.1622,
"step": 1410
},
{
"epoch": 43.69230769230769,
"grad_norm": 1.624523639678955,
"learning_rate": 3.939520478544614e-06,
"loss": 0.1499,
"step": 1420
},
{
"epoch": 44.0,
"grad_norm": 53768.84375,
"learning_rate": 3.914329526959033e-06,
"loss": 0.1712,
"step": 1430
},
{
"epoch": 44.30769230769231,
"grad_norm": 1.3771990537643433,
"learning_rate": 3.888925582549006e-06,
"loss": 0.1334,
"step": 1440
},
{
"epoch": 44.61538461538461,
"grad_norm": 2.549403190612793,
"learning_rate": 3.863312471055116e-06,
"loss": 0.1833,
"step": 1450
},
{
"epoch": 44.92307692307692,
"grad_norm": 1.4592410326004028,
"learning_rate": 3.8374940497177435e-06,
"loss": 0.1174,
"step": 1460
},
{
"epoch": 45.23076923076923,
"grad_norm": 1.5769482851028442,
"learning_rate": 3.8114742066961722e-06,
"loss": 0.1207,
"step": 1470
},
{
"epoch": 45.53846153846154,
"grad_norm": 1.1684468984603882,
"learning_rate": 3.785256860483054e-06,
"loss": 0.1275,
"step": 1480
},
{
"epoch": 45.84615384615385,
"grad_norm": 1.1915509700775146,
"learning_rate": 3.7588459593142944e-06,
"loss": 0.1253,
"step": 1490
},
{
"epoch": 46.15384615384615,
"grad_norm": 1.3312920331954956,
"learning_rate": 3.7322454805744605e-06,
"loss": 0.1159,
"step": 1500
},
{
"epoch": 46.46153846153846,
"grad_norm": 1.1740134954452515,
"learning_rate": 3.7054594301978075e-06,
"loss": 0.1312,
"step": 1510
},
{
"epoch": 46.76923076923077,
"grad_norm": 0.9904786348342896,
"learning_rate": 3.6784918420649952e-06,
"loss": 0.1171,
"step": 1520
},
{
"epoch": 47.07692307692308,
"grad_norm": 1.7853026390075684,
"learning_rate": 3.6513467773956002e-06,
"loss": 0.1229,
"step": 1530
},
{
"epoch": 47.38461538461539,
"grad_norm": 0.924367368221283,
"learning_rate": 3.624028324136517e-06,
"loss": 0.1087,
"step": 1540
},
{
"epoch": 47.69230769230769,
"grad_norm": 1.3565295934677124,
"learning_rate": 3.5965405963463197e-06,
"loss": 0.113,
"step": 1550
},
{
"epoch": 48.0,
"grad_norm": 27509738.0,
"learning_rate": 3.5688877335757055e-06,
"loss": 0.0949,
"step": 1560
},
{
"epoch": 48.30769230769231,
"grad_norm": 1.4091086387634277,
"learning_rate": 3.5410739002440938e-06,
"loss": 0.095,
"step": 1570
},
{
"epoch": 48.61538461538461,
"grad_norm": 1.1597000360488892,
"learning_rate": 3.5131032850124745e-06,
"loss": 0.0987,
"step": 1580
},
{
"epoch": 48.92307692307692,
"grad_norm": 1.2181200981140137,
"learning_rate": 3.484980100152621e-06,
"loss": 0.0894,
"step": 1590
},
{
"epoch": 49.23076923076923,
"grad_norm": 1.0059666633605957,
"learning_rate": 3.4567085809127247e-06,
"loss": 0.0962,
"step": 1600
},
{
"epoch": 49.53846153846154,
"grad_norm": 1.6103034019470215,
"learning_rate": 3.4282929848795944e-06,
"loss": 0.0842,
"step": 1610
},
{
"epoch": 49.84615384615385,
"grad_norm": 1.6124812364578247,
"learning_rate": 3.399737591337471e-06,
"loss": 0.0981,
"step": 1620
},
{
"epoch": 50.15384615384615,
"grad_norm": 1.14884614944458,
"learning_rate": 3.3710467006235865e-06,
"loss": 0.0996,
"step": 1630
},
{
"epoch": 50.46153846153846,
"grad_norm": 1.4607173204421997,
"learning_rate": 3.3422246334805504e-06,
"loss": 0.1034,
"step": 1640
},
{
"epoch": 50.76923076923077,
"grad_norm": 1.092631459236145,
"learning_rate": 3.313275730405658e-06,
"loss": 0.0801,
"step": 1650
},
{
"epoch": 51.07692307692308,
"grad_norm": 1.0797463655471802,
"learning_rate": 3.2842043509972294e-06,
"loss": 0.0795,
"step": 1660
},
{
"epoch": 51.38461538461539,
"grad_norm": 1.503728985786438,
"learning_rate": 3.2550148732980707e-06,
"loss": 0.0744,
"step": 1670
},
{
"epoch": 51.69230769230769,
"grad_norm": 1.0997803211212158,
"learning_rate": 3.225711693136156e-06,
"loss": 0.0741,
"step": 1680
},
{
"epoch": 52.0,
"grad_norm": 10298226.0,
"learning_rate": 3.196299223462633e-06,
"loss": 0.0699,
"step": 1690
},
{
"epoch": 52.30769230769231,
"grad_norm": 1.0617835521697998,
"learning_rate": 3.1667818936872463e-06,
"loss": 0.074,
"step": 1700
},
{
"epoch": 52.61538461538461,
"grad_norm": 0.8478608131408691,
"learning_rate": 3.137164149011287e-06,
"loss": 0.0577,
"step": 1710
},
{
"epoch": 52.92307692307692,
"grad_norm": 1.827232003211975,
"learning_rate": 3.10745044975816e-06,
"loss": 0.0871,
"step": 1720
},
{
"epoch": 53.23076923076923,
"grad_norm": 1.0407119989395142,
"learning_rate": 3.0776452707016784e-06,
"loss": 0.0682,
"step": 1730
},
{
"epoch": 53.53846153846154,
"grad_norm": 1.375914454460144,
"learning_rate": 3.0477531003921745e-06,
"loss": 0.0735,
"step": 1740
},
{
"epoch": 53.84615384615385,
"grad_norm": 1.3044782876968384,
"learning_rate": 3.0177784404805466e-06,
"loss": 0.0617,
"step": 1750
},
{
"epoch": 54.15384615384615,
"grad_norm": 0.6784080862998962,
"learning_rate": 2.9877258050403214e-06,
"loss": 0.0494,
"step": 1760
},
{
"epoch": 54.46153846153846,
"grad_norm": 1.0913127660751343,
"learning_rate": 2.957599719887853e-06,
"loss": 0.0685,
"step": 1770
},
{
"epoch": 54.76923076923077,
"grad_norm": 1.2988967895507812,
"learning_rate": 2.9274047219007533e-06,
"loss": 0.069,
"step": 1780
},
{
"epoch": 55.07692307692308,
"grad_norm": 1.0163013935089111,
"learning_rate": 2.8971453583346536e-06,
"loss": 0.0545,
"step": 1790
},
{
"epoch": 55.38461538461539,
"grad_norm": 0.7982223033905029,
"learning_rate": 2.8668261861384045e-06,
"loss": 0.0509,
"step": 1800
},
{
"epoch": 55.69230769230769,
"grad_norm": 1.5185413360595703,
"learning_rate": 2.8364517712678157e-06,
"loss": 0.0708,
"step": 1810
},
{
"epoch": 56.0,
"grad_norm": 54588.5390625,
"learning_rate": 2.806026687998041e-06,
"loss": 0.0531,
"step": 1820
},
{
"epoch": 56.30769230769231,
"grad_norm": 0.9050544500350952,
"learning_rate": 2.775555518234708e-06,
"loss": 0.0529,
"step": 1830
},
{
"epoch": 56.61538461538461,
"grad_norm": 1.0293651819229126,
"learning_rate": 2.7450428508239024e-06,
"loss": 0.0572,
"step": 1840
},
{
"epoch": 56.92307692307692,
"grad_norm": 1.1838538646697998,
"learning_rate": 2.7144932808611002e-06,
"loss": 0.0616,
"step": 1850
},
{
"epoch": 57.23076923076923,
"grad_norm": 1.2836631536483765,
"learning_rate": 2.683911408999169e-06,
"loss": 0.0536,
"step": 1860
},
{
"epoch": 57.53846153846154,
"grad_norm": 1.0823341608047485,
"learning_rate": 2.6533018407555216e-06,
"loss": 0.0476,
"step": 1870
},
{
"epoch": 57.84615384615385,
"grad_norm": 1.0420117378234863,
"learning_rate": 2.6226691858185454e-06,
"loss": 0.0527,
"step": 1880
},
{
"epoch": 58.15384615384615,
"grad_norm": 1.0533287525177002,
"learning_rate": 2.5920180573533975e-06,
"loss": 0.0623,
"step": 1890
},
{
"epoch": 58.46153846153846,
"grad_norm": 0.9571561217308044,
"learning_rate": 2.561353071307281e-06,
"loss": 0.0498,
"step": 1900
},
{
"epoch": 58.76923076923077,
"grad_norm": 1.224555253982544,
"learning_rate": 2.5306788457143e-06,
"loss": 0.0447,
"step": 1910
},
{
"epoch": 59.07692307692308,
"grad_norm": 1.900484561920166,
"learning_rate": 2.5e-06,
"loss": 0.0578,
"step": 1920
},
{
"epoch": 59.38461538461539,
"grad_norm": 0.9753430485725403,
"learning_rate": 2.4693211542857005e-06,
"loss": 0.052,
"step": 1930
},
{
"epoch": 59.69230769230769,
"grad_norm": 1.0865504741668701,
"learning_rate": 2.43864692869272e-06,
"loss": 0.0447,
"step": 1940
},
{
"epoch": 60.0,
"grad_norm": 39835.109375,
"learning_rate": 2.407981942646603e-06,
"loss": 0.0477,
"step": 1950
},
{
"epoch": 60.30769230769231,
"grad_norm": 1.1388822793960571,
"learning_rate": 2.377330814181455e-06,
"loss": 0.0516,
"step": 1960
},
{
"epoch": 60.61538461538461,
"grad_norm": 1.2782968282699585,
"learning_rate": 2.346698159244479e-06,
"loss": 0.0501,
"step": 1970
},
{
"epoch": 60.92307692307692,
"grad_norm": 1.6132394075393677,
"learning_rate": 2.3160885910008317e-06,
"loss": 0.0408,
"step": 1980
},
{
"epoch": 61.23076923076923,
"grad_norm": 0.8268479108810425,
"learning_rate": 2.2855067191389006e-06,
"loss": 0.0407,
"step": 1990
},
{
"epoch": 61.53846153846154,
"grad_norm": 0.9518911838531494,
"learning_rate": 2.2549571491760985e-06,
"loss": 0.0455,
"step": 2000
},
{
"epoch": 61.84615384615385,
"grad_norm": 1.2631924152374268,
"learning_rate": 2.2244444817652923e-06,
"loss": 0.0462,
"step": 2010
},
{
"epoch": 62.15384615384615,
"grad_norm": 1.0491702556610107,
"learning_rate": 2.19397331200196e-06,
"loss": 0.0384,
"step": 2020
},
{
"epoch": 62.46153846153846,
"grad_norm": 1.37276291847229,
"learning_rate": 2.1635482287321848e-06,
"loss": 0.0384,
"step": 2030
},
{
"epoch": 62.76923076923077,
"grad_norm": 0.911716103553772,
"learning_rate": 2.133173813861596e-06,
"loss": 0.0489,
"step": 2040
},
{
"epoch": 63.07692307692308,
"grad_norm": 0.8453769683837891,
"learning_rate": 2.102854641665347e-06,
"loss": 0.0519,
"step": 2050
},
{
"epoch": 63.38461538461539,
"grad_norm": 1.1368571519851685,
"learning_rate": 2.072595278099247e-06,
"loss": 0.0383,
"step": 2060
},
{
"epoch": 63.69230769230769,
"grad_norm": 1.6040756702423096,
"learning_rate": 2.042400280112148e-06,
"loss": 0.0421,
"step": 2070
},
{
"epoch": 64.0,
"grad_norm": 664656.875,
"learning_rate": 2.01227419495968e-06,
"loss": 0.0473,
"step": 2080
},
{
"epoch": 64.3076923076923,
"grad_norm": 0.9847516417503357,
"learning_rate": 1.982221559519454e-06,
"loss": 0.042,
"step": 2090
},
{
"epoch": 64.61538461538461,
"grad_norm": 0.77010577917099,
"learning_rate": 1.952246899607826e-06,
"loss": 0.042,
"step": 2100
},
{
"epoch": 64.92307692307692,
"grad_norm": 1.2908334732055664,
"learning_rate": 1.9223547292983225e-06,
"loss": 0.0416,
"step": 2110
},
{
"epoch": 65.23076923076923,
"grad_norm": 0.7256491184234619,
"learning_rate": 1.8925495502418407e-06,
"loss": 0.0353,
"step": 2120
},
{
"epoch": 65.53846153846153,
"grad_norm": 1.212572455406189,
"learning_rate": 1.862835850988714e-06,
"loss": 0.0427,
"step": 2130
},
{
"epoch": 65.84615384615384,
"grad_norm": 1.154390811920166,
"learning_rate": 1.8332181063127543e-06,
"loss": 0.0396,
"step": 2140
},
{
"epoch": 66.15384615384616,
"grad_norm": 0.8889743685722351,
"learning_rate": 1.8037007765373677e-06,
"loss": 0.0381,
"step": 2150
},
{
"epoch": 66.46153846153847,
"grad_norm": 1.1566401720046997,
"learning_rate": 1.7742883068638447e-06,
"loss": 0.041,
"step": 2160
},
{
"epoch": 66.76923076923077,
"grad_norm": 0.9389579892158508,
"learning_rate": 1.74498512670193e-06,
"loss": 0.0385,
"step": 2170
},
{
"epoch": 67.07692307692308,
"grad_norm": 0.709125816822052,
"learning_rate": 1.7157956490027716e-06,
"loss": 0.0432,
"step": 2180
},
{
"epoch": 67.38461538461539,
"grad_norm": 1.1108027696609497,
"learning_rate": 1.686724269594343e-06,
"loss": 0.0395,
"step": 2190
},
{
"epoch": 67.6923076923077,
"grad_norm": 1.087064266204834,
"learning_rate": 1.6577753665194502e-06,
"loss": 0.0402,
"step": 2200
},
{
"epoch": 68.0,
"grad_norm": 3785425.75,
"learning_rate": 1.628953299376414e-06,
"loss": 0.0351,
"step": 2210
},
{
"epoch": 68.3076923076923,
"grad_norm": 0.9498631954193115,
"learning_rate": 1.6002624086625296e-06,
"loss": 0.0402,
"step": 2220
},
{
"epoch": 68.61538461538461,
"grad_norm": 0.6672590374946594,
"learning_rate": 1.5717070151204064e-06,
"loss": 0.0333,
"step": 2230
},
{
"epoch": 68.92307692307692,
"grad_norm": 0.671436607837677,
"learning_rate": 1.5432914190872757e-06,
"loss": 0.0387,
"step": 2240
},
{
"epoch": 69.23076923076923,
"grad_norm": 0.7344552874565125,
"learning_rate": 1.5150198998473802e-06,
"loss": 0.0345,
"step": 2250
},
{
"epoch": 69.53846153846153,
"grad_norm": 0.5938952565193176,
"learning_rate": 1.4868967149875257e-06,
"loss": 0.0343,
"step": 2260
},
{
"epoch": 69.84615384615384,
"grad_norm": 0.862509548664093,
"learning_rate": 1.4589260997559077e-06,
"loss": 0.0359,
"step": 2270
},
{
"epoch": 70.15384615384616,
"grad_norm": 0.8928817510604858,
"learning_rate": 1.4311122664242955e-06,
"loss": 0.0406,
"step": 2280
},
{
"epoch": 70.46153846153847,
"grad_norm": 0.81640625,
"learning_rate": 1.4034594036536816e-06,
"loss": 0.0332,
"step": 2290
},
{
"epoch": 70.76923076923077,
"grad_norm": 0.5396949648857117,
"learning_rate": 1.3759716758634833e-06,
"loss": 0.0389,
"step": 2300
},
{
"epoch": 71.07692307692308,
"grad_norm": 0.918347954750061,
"learning_rate": 1.3486532226044e-06,
"loss": 0.0329,
"step": 2310
},
{
"epoch": 71.38461538461539,
"grad_norm": 0.8731901049613953,
"learning_rate": 1.3215081579350058e-06,
"loss": 0.0355,
"step": 2320
},
{
"epoch": 71.6923076923077,
"grad_norm": 0.9578891396522522,
"learning_rate": 1.294540569802193e-06,
"loss": 0.037,
"step": 2330
},
{
"epoch": 72.0,
"grad_norm": 3753634.75,
"learning_rate": 1.2677545194255403e-06,
"loss": 0.0372,
"step": 2340
},
{
"epoch": 72.3076923076923,
"grad_norm": 0.9850757122039795,
"learning_rate": 1.2411540406857064e-06,
"loss": 0.0388,
"step": 2350
},
{
"epoch": 72.61538461538461,
"grad_norm": 1.0889559984207153,
"learning_rate": 1.214743139516946e-06,
"loss": 0.0329,
"step": 2360
},
{
"epoch": 72.92307692307692,
"grad_norm": 1.2561447620391846,
"learning_rate": 1.1885257933038282e-06,
"loss": 0.0334,
"step": 2370
},
{
"epoch": 73.23076923076923,
"grad_norm": 1.08214271068573,
"learning_rate": 1.1625059502822575e-06,
"loss": 0.0438,
"step": 2380
},
{
"epoch": 73.53846153846153,
"grad_norm": 0.6585337519645691,
"learning_rate": 1.1366875289448844e-06,
"loss": 0.0283,
"step": 2390
},
{
"epoch": 73.84615384615384,
"grad_norm": 1.001531720161438,
"learning_rate": 1.1110744174509952e-06,
"loss": 0.0405,
"step": 2400
},
{
"epoch": 74.15384615384616,
"grad_norm": 0.774890124797821,
"learning_rate": 1.0856704730409667e-06,
"loss": 0.035,
"step": 2410
},
{
"epoch": 74.46153846153847,
"grad_norm": 0.8112567663192749,
"learning_rate": 1.0604795214553867e-06,
"loss": 0.0371,
"step": 2420
},
{
"epoch": 74.76923076923077,
"grad_norm": 0.7539799809455872,
"learning_rate": 1.035505356358903e-06,
"loss": 0.0363,
"step": 2430
},
{
"epoch": 75.07692307692308,
"grad_norm": 1.047004222869873,
"learning_rate": 1.0107517387689168e-06,
"loss": 0.0305,
"step": 2440
},
{
"epoch": 75.38461538461539,
"grad_norm": 0.5877715945243835,
"learning_rate": 9.862223964891864e-07,
"loss": 0.032,
"step": 2450
},
{
"epoch": 75.6923076923077,
"grad_norm": 0.7707840204238892,
"learning_rate": 9.61921023548433e-07,
"loss": 0.0345,
"step": 2460
},
{
"epoch": 76.0,
"grad_norm": 8456043.0,
"learning_rate": 9.378512796440345e-07,
"loss": 0.0362,
"step": 2470
},
{
"epoch": 76.3076923076923,
"grad_norm": 1.2004197835922241,
"learning_rate": 9.140167895908867e-07,
"loss": 0.0318,
"step": 2480
},
{
"epoch": 76.61538461538461,
"grad_norm": 0.7192149758338928,
"learning_rate": 8.904211427755219e-07,
"loss": 0.0339,
"step": 2490
},
{
"epoch": 76.92307692307692,
"grad_norm": 0.8858992457389832,
"learning_rate": 8.670678926155588e-07,
"loss": 0.0365,
"step": 2500
}
],
"logging_steps": 10,
"max_steps": 3200,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.4736165944792064e+17,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}