{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 76.92307692307692,
  "eval_steps": 500,
  "global_step": 2500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.3076923076923077,
      "grad_norm": 5.159167766571045,
      "learning_rate": 7.8125e-08,
      "loss": 4.2745,
      "step": 10
    },
    {
      "epoch": 0.6153846153846154,
      "grad_norm": 5.482397079467773,
      "learning_rate": 1.5625e-07,
      "loss": 4.2677,
      "step": 20
    },
    {
      "epoch": 0.9230769230769231,
      "grad_norm": 5.341490268707275,
      "learning_rate": 2.3437500000000003e-07,
      "loss": 4.2277,
      "step": 30
    },
    {
      "epoch": 1.2307692307692308,
      "grad_norm": 5.534327983856201,
      "learning_rate": 3.125e-07,
      "loss": 4.2715,
      "step": 40
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 5.407191753387451,
      "learning_rate": 3.90625e-07,
      "loss": 4.2371,
      "step": 50
    },
    {
      "epoch": 1.8461538461538463,
      "grad_norm": 5.150612831115723,
      "learning_rate": 4.6875000000000006e-07,
      "loss": 4.1953,
      "step": 60
    },
    {
      "epoch": 2.1538461538461537,
      "grad_norm": 5.620100021362305,
      "learning_rate": 5.468750000000001e-07,
      "loss": 4.2908,
      "step": 70
    },
    {
      "epoch": 2.4615384615384617,
      "grad_norm": 5.80696964263916,
      "learning_rate": 6.25e-07,
      "loss": 4.2603,
      "step": 80
    },
    {
      "epoch": 2.769230769230769,
      "grad_norm": 5.940330982208252,
      "learning_rate": 7.03125e-07,
      "loss": 4.2049,
      "step": 90
    },
    {
      "epoch": 3.076923076923077,
      "grad_norm": 5.8099141120910645,
      "learning_rate": 7.8125e-07,
      "loss": 4.1638,
      "step": 100
    },
    {
      "epoch": 3.3846153846153846,
      "grad_norm": 5.388895511627197,
      "learning_rate": 8.59375e-07,
      "loss": 4.1409,
      "step": 110
    },
    {
      "epoch": 3.6923076923076925,
      "grad_norm": 5.919471263885498,
      "learning_rate": 9.375000000000001e-07,
      "loss": 4.1658,
      "step": 120
    },
    {
      "epoch": 4.0,
      "grad_norm": 13097.25390625,
      "learning_rate": 1.0156250000000001e-06,
      "loss": 4.1936,
      "step": 130
    },
    {
      "epoch": 4.3076923076923075,
      "grad_norm": 4.802829265594482,
      "learning_rate": 1.0937500000000001e-06,
      "loss": 4.021,
      "step": 140
    },
    {
      "epoch": 4.615384615384615,
      "grad_norm": 5.628367900848389,
      "learning_rate": 1.1718750000000001e-06,
      "loss": 4.019,
      "step": 150
    },
    {
      "epoch": 4.923076923076923,
      "grad_norm": 5.601304531097412,
      "learning_rate": 1.25e-06,
      "loss": 3.9075,
      "step": 160
    },
    {
      "epoch": 5.230769230769231,
      "grad_norm": 5.029603004455566,
      "learning_rate": 1.328125e-06,
      "loss": 3.8952,
      "step": 170
    },
    {
      "epoch": 5.538461538461538,
      "grad_norm": 5.625918388366699,
      "learning_rate": 1.40625e-06,
      "loss": 3.7809,
      "step": 180
    },
    {
      "epoch": 5.846153846153846,
      "grad_norm": 4.302311420440674,
      "learning_rate": 1.484375e-06,
      "loss": 3.5823,
      "step": 190
    },
    {
      "epoch": 6.153846153846154,
      "grad_norm": 4.250982284545898,
      "learning_rate": 1.5625e-06,
      "loss": 3.5112,
      "step": 200
    },
    {
      "epoch": 6.461538461538462,
      "grad_norm": 3.137059211730957,
      "learning_rate": 1.640625e-06,
      "loss": 3.3867,
      "step": 210
    },
    {
      "epoch": 6.769230769230769,
      "grad_norm": 3.2033724784851074,
      "learning_rate": 1.71875e-06,
      "loss": 3.2788,
      "step": 220
    },
    {
      "epoch": 7.076923076923077,
      "grad_norm": 2.8167309761047363,
      "learning_rate": 1.796875e-06,
      "loss": 3.1566,
      "step": 230
    },
    {
      "epoch": 7.384615384615385,
      "grad_norm": 2.167381525039673,
      "learning_rate": 1.8750000000000003e-06,
      "loss": 2.9642,
      "step": 240
    },
    {
      "epoch": 7.6923076923076925,
      "grad_norm": 2.2277944087982178,
      "learning_rate": 1.953125e-06,
      "loss": 2.8886,
      "step": 250
    },
    {
      "epoch": 8.0,
      "grad_norm": 60692.35546875,
      "learning_rate": 2.0312500000000002e-06,
      "loss": 2.7726,
      "step": 260
    },
    {
      "epoch": 8.307692307692308,
      "grad_norm": 1.6853564977645874,
      "learning_rate": 2.109375e-06,
      "loss": 2.7062,
      "step": 270
    },
    {
      "epoch": 8.615384615384615,
      "grad_norm": 1.5454535484313965,
      "learning_rate": 2.1875000000000002e-06,
      "loss": 2.5508,
      "step": 280
    },
    {
      "epoch": 8.923076923076923,
      "grad_norm": 1.2037118673324585,
      "learning_rate": 2.265625e-06,
      "loss": 2.4639,
      "step": 290
    },
    {
      "epoch": 9.23076923076923,
      "grad_norm": 1.0261240005493164,
      "learning_rate": 2.3437500000000002e-06,
      "loss": 2.4103,
      "step": 300
    },
    {
      "epoch": 9.538461538461538,
      "grad_norm": 0.9358808994293213,
      "learning_rate": 2.421875e-06,
      "loss": 2.3032,
      "step": 310
    },
    {
      "epoch": 9.846153846153847,
      "grad_norm": 0.7383924722671509,
      "learning_rate": 2.5e-06,
      "loss": 2.3002,
      "step": 320
    },
    {
      "epoch": 10.153846153846153,
      "grad_norm": 0.685702383518219,
      "learning_rate": 2.5781250000000004e-06,
      "loss": 2.2148,
      "step": 330
    },
    {
      "epoch": 10.461538461538462,
      "grad_norm": 0.6645168662071228,
      "learning_rate": 2.65625e-06,
      "loss": 2.1989,
      "step": 340
    },
    {
      "epoch": 10.76923076923077,
      "grad_norm": 0.7011102437973022,
      "learning_rate": 2.7343750000000004e-06,
      "loss": 2.1496,
      "step": 350
    },
    {
      "epoch": 11.076923076923077,
      "grad_norm": 0.5761039853096008,
      "learning_rate": 2.8125e-06,
      "loss": 2.1465,
      "step": 360
    },
    {
      "epoch": 11.384615384615385,
      "grad_norm": 0.562958836555481,
      "learning_rate": 2.8906250000000004e-06,
      "loss": 2.0871,
      "step": 370
    },
    {
      "epoch": 11.692307692307692,
      "grad_norm": 0.5713663101196289,
      "learning_rate": 2.96875e-06,
      "loss": 2.0732,
      "step": 380
    },
    {
      "epoch": 12.0,
      "grad_norm": 16827.7421875,
      "learning_rate": 3.0468750000000004e-06,
      "loss": 2.0684,
      "step": 390
    },
    {
      "epoch": 12.307692307692308,
      "grad_norm": 0.5909234285354614,
      "learning_rate": 3.125e-06,
      "loss": 2.0554,
      "step": 400
    },
    {
      "epoch": 12.615384615384615,
      "grad_norm": 0.5658320784568787,
      "learning_rate": 3.2031250000000004e-06,
      "loss": 1.9929,
      "step": 410
    },
    {
      "epoch": 12.923076923076923,
      "grad_norm": 0.5382928848266602,
      "learning_rate": 3.28125e-06,
      "loss": 1.9913,
      "step": 420
    },
    {
      "epoch": 13.23076923076923,
      "grad_norm": 0.7106872200965881,
      "learning_rate": 3.3593750000000003e-06,
      "loss": 1.9562,
      "step": 430
    },
    {
      "epoch": 13.538461538461538,
      "grad_norm": 0.5338084697723389,
      "learning_rate": 3.4375e-06,
      "loss": 1.9503,
      "step": 440
    },
    {
      "epoch": 13.846153846153847,
      "grad_norm": 0.524355411529541,
      "learning_rate": 3.5156250000000003e-06,
      "loss": 1.9499,
      "step": 450
    },
    {
      "epoch": 14.153846153846153,
      "grad_norm": 0.5286893248558044,
      "learning_rate": 3.59375e-06,
      "loss": 1.9225,
      "step": 460
    },
    {
      "epoch": 14.461538461538462,
      "grad_norm": 0.5003280639648438,
      "learning_rate": 3.6718750000000003e-06,
      "loss": 1.8688,
      "step": 470
    },
    {
      "epoch": 14.76923076923077,
      "grad_norm": 0.5744629502296448,
      "learning_rate": 3.7500000000000005e-06,
      "loss": 1.8728,
      "step": 480
    },
    {
      "epoch": 15.076923076923077,
      "grad_norm": 0.47028881311416626,
      "learning_rate": 3.828125000000001e-06,
      "loss": 1.818,
      "step": 490
    },
    {
      "epoch": 15.384615384615385,
      "grad_norm": 0.5076237320899963,
      "learning_rate": 3.90625e-06,
      "loss": 1.8375,
      "step": 500
    },
    {
      "epoch": 15.692307692307692,
      "grad_norm": 0.5628578066825867,
      "learning_rate": 3.984375e-06,
      "loss": 1.823,
      "step": 510
    },
    {
      "epoch": 16.0,
      "grad_norm": 35255.69921875,
      "learning_rate": 4.0625000000000005e-06,
      "loss": 1.8249,
      "step": 520
    },
    {
      "epoch": 16.307692307692307,
      "grad_norm": 0.5536410212516785,
      "learning_rate": 4.140625000000001e-06,
      "loss": 1.7998,
      "step": 530
    },
    {
      "epoch": 16.615384615384617,
      "grad_norm": 0.6619865298271179,
      "learning_rate": 4.21875e-06,
      "loss": 1.775,
      "step": 540
    },
    {
      "epoch": 16.923076923076923,
      "grad_norm": 0.6591458320617676,
      "learning_rate": 4.296875e-06,
      "loss": 1.7605,
      "step": 550
    },
    {
      "epoch": 17.23076923076923,
      "grad_norm": 0.8084316253662109,
      "learning_rate": 4.3750000000000005e-06,
      "loss": 1.7278,
      "step": 560
    },
    {
      "epoch": 17.53846153846154,
      "grad_norm": 0.6782126426696777,
      "learning_rate": 4.453125000000001e-06,
      "loss": 1.7261,
      "step": 570
    },
    {
      "epoch": 17.846153846153847,
      "grad_norm": 0.6113712191581726,
      "learning_rate": 4.53125e-06,
      "loss": 1.7178,
      "step": 580
    },
    {
      "epoch": 18.153846153846153,
      "grad_norm": 0.6165570020675659,
      "learning_rate": 4.609375e-06,
      "loss": 1.6865,
      "step": 590
    },
    {
      "epoch": 18.46153846153846,
      "grad_norm": 0.7881684303283691,
      "learning_rate": 4.6875000000000004e-06,
      "loss": 1.6793,
      "step": 600
    },
    {
      "epoch": 18.76923076923077,
      "grad_norm": 0.672874927520752,
      "learning_rate": 4.765625000000001e-06,
      "loss": 1.6661,
      "step": 610
    },
    {
      "epoch": 19.076923076923077,
      "grad_norm": 0.730848491191864,
      "learning_rate": 4.84375e-06,
      "loss": 1.6431,
      "step": 620
    },
    {
      "epoch": 19.384615384615383,
      "grad_norm": 0.6730669736862183,
      "learning_rate": 4.921875e-06,
      "loss": 1.6155,
      "step": 630
    },
    {
      "epoch": 19.692307692307693,
      "grad_norm": 0.6560551524162292,
      "learning_rate": 5e-06,
      "loss": 1.6395,
      "step": 640
    },
    {
      "epoch": 20.0,
      "grad_norm": 4129.79052734375,
      "learning_rate": 4.999811754597862e-06,
      "loss": 1.5897,
      "step": 650
    },
    {
      "epoch": 20.307692307692307,
      "grad_norm": 0.6594120860099792,
      "learning_rate": 4.999247046740511e-06,
      "loss": 1.5829,
      "step": 660
    },
    {
      "epoch": 20.615384615384617,
      "grad_norm": 0.893703818321228,
      "learning_rate": 4.998305961470874e-06,
      "loss": 1.558,
      "step": 670
    },
    {
      "epoch": 20.923076923076923,
      "grad_norm": 0.7144062519073486,
      "learning_rate": 4.996988640512931e-06,
      "loss": 1.5373,
      "step": 680
    },
    {
      "epoch": 21.23076923076923,
      "grad_norm": 0.8453310132026672,
      "learning_rate": 4.995295282250373e-06,
      "loss": 1.4909,
      "step": 690
    },
    {
      "epoch": 21.53846153846154,
      "grad_norm": 0.8293094635009766,
      "learning_rate": 4.993226141696726e-06,
      "loss": 1.4967,
      "step": 700
    },
    {
      "epoch": 21.846153846153847,
      "grad_norm": 0.7181118726730347,
      "learning_rate": 4.990781530456945e-06,
      "loss": 1.4857,
      "step": 710
    },
    {
      "epoch": 22.153846153846153,
      "grad_norm": 0.844008207321167,
      "learning_rate": 4.987961816680493e-06,
      "loss": 1.495,
      "step": 720
    },
    {
      "epoch": 22.46153846153846,
      "grad_norm": 0.8761960864067078,
      "learning_rate": 4.984767425005891e-06,
      "loss": 1.4224,
      "step": 730
    },
    {
      "epoch": 22.76923076923077,
      "grad_norm": 0.8939017057418823,
      "learning_rate": 4.981198836496776e-06,
      "loss": 1.4063,
      "step": 740
    },
    {
      "epoch": 23.076923076923077,
      "grad_norm": 0.828834593296051,
      "learning_rate": 4.97725658856945e-06,
      "loss": 1.429,
      "step": 750
    },
    {
      "epoch": 23.384615384615383,
      "grad_norm": 0.945023775100708,
      "learning_rate": 4.972941274911953e-06,
      "loss": 1.3588,
      "step": 760
    },
    {
      "epoch": 23.692307692307693,
      "grad_norm": 1.122968316078186,
      "learning_rate": 4.968253545394647e-06,
      "loss": 1.3309,
      "step": 770
    },
    {
      "epoch": 24.0,
      "grad_norm": 93171.5546875,
      "learning_rate": 4.9631941059723535e-06,
      "loss": 1.335,
      "step": 780
    },
    {
      "epoch": 24.307692307692307,
      "grad_norm": 1.0523442029953003,
      "learning_rate": 4.957763718578042e-06,
      "loss": 1.3347,
      "step": 790
    },
    {
      "epoch": 24.615384615384617,
      "grad_norm": 1.179585576057434,
      "learning_rate": 4.9519632010080765e-06,
      "loss": 1.2618,
      "step": 800
    },
    {
      "epoch": 24.923076923076923,
      "grad_norm": 0.854586124420166,
      "learning_rate": 4.9457934267990695e-06,
      "loss": 1.2321,
      "step": 810
    },
    {
      "epoch": 25.23076923076923,
      "grad_norm": 1.0027631521224976,
      "learning_rate": 4.939255325096322e-06,
      "loss": 1.2041,
      "step": 820
    },
    {
      "epoch": 25.53846153846154,
      "grad_norm": 1.045911192893982,
      "learning_rate": 4.932349880513901e-06,
      "loss": 1.1746,
      "step": 830
    },
    {
      "epoch": 25.846153846153847,
      "grad_norm": 0.9622510075569153,
      "learning_rate": 4.925078132986361e-06,
      "loss": 1.1846,
      "step": 840
    },
    {
      "epoch": 26.153846153846153,
      "grad_norm": 1.2767952680587769,
      "learning_rate": 4.917441177612131e-06,
      "loss": 1.1605,
      "step": 850
    },
    {
      "epoch": 26.46153846153846,
      "grad_norm": 0.9456568360328674,
      "learning_rate": 4.9094401644886e-06,
      "loss": 1.1373,
      "step": 860
    },
    {
      "epoch": 26.76923076923077,
      "grad_norm": 1.3368542194366455,
      "learning_rate": 4.901076298538915e-06,
      "loss": 1.0879,
      "step": 870
    },
    {
      "epoch": 27.076923076923077,
      "grad_norm": 1.4178813695907593,
      "learning_rate": 4.8923508393305224e-06,
      "loss": 1.0359,
      "step": 880
    },
    {
      "epoch": 27.384615384615383,
      "grad_norm": 1.4570515155792236,
      "learning_rate": 4.883265100885484e-06,
      "loss": 1.0313,
      "step": 890
    },
    {
      "epoch": 27.692307692307693,
      "grad_norm": 1.415483832359314,
      "learning_rate": 4.873820451482592e-06,
      "loss": 1.034,
      "step": 900
    },
    {
      "epoch": 28.0,
      "grad_norm": 1146730.75,
      "learning_rate": 4.864018313451304e-06,
      "loss": 0.8984,
      "step": 910
    },
    {
      "epoch": 28.307692307692307,
      "grad_norm": 1.3334933519363403,
      "learning_rate": 4.8538601629575525e-06,
      "loss": 0.9298,
      "step": 920
    },
    {
      "epoch": 28.615384615384617,
      "grad_norm": 1.1457735300064087,
      "learning_rate": 4.843347529781438e-06,
      "loss": 0.94,
      "step": 930
    },
    {
      "epoch": 28.923076923076923,
      "grad_norm": 1.3328795433044434,
      "learning_rate": 4.832481997086848e-06,
      "loss": 0.8961,
      "step": 940
    },
    {
      "epoch": 29.23076923076923,
      "grad_norm": 1.455394983291626,
      "learning_rate": 4.82126520118304e-06,
      "loss": 0.9099,
      "step": 950
    },
    {
      "epoch": 29.53846153846154,
      "grad_norm": 1.7109308242797852,
      "learning_rate": 4.809698831278217e-06,
      "loss": 0.8214,
      "step": 960
    },
    {
      "epoch": 29.846153846153847,
      "grad_norm": 1.392449140548706,
      "learning_rate": 4.797784629225145e-06,
      "loss": 0.813,
      "step": 970
    },
    {
      "epoch": 30.153846153846153,
      "grad_norm": 1.8153551816940308,
      "learning_rate": 4.7855243892588275e-06,
      "loss": 0.7938,
      "step": 980
    },
    {
      "epoch": 30.46153846153846,
      "grad_norm": 1.7134582996368408,
      "learning_rate": 4.772919957726306e-06,
      "loss": 0.8109,
      "step": 990
    },
    {
      "epoch": 30.76923076923077,
      "grad_norm": 1.7986242771148682,
      "learning_rate": 4.759973232808609e-06,
      "loss": 0.6657,
      "step": 1000
    },
    {
      "epoch": 31.076923076923077,
      "grad_norm": 1.5258599519729614,
      "learning_rate": 4.746686164234885e-06,
      "loss": 0.7157,
      "step": 1010
    },
    {
      "epoch": 31.384615384615383,
      "grad_norm": 1.645753264427185,
      "learning_rate": 4.7330607529887885e-06,
      "loss": 0.6876,
      "step": 1020
    },
    {
      "epoch": 31.692307692307693,
      "grad_norm": 1.4284254312515259,
      "learning_rate": 4.719099051007136e-06,
      "loss": 0.6913,
      "step": 1030
    },
    {
      "epoch": 32.0,
      "grad_norm": 322668.5625,
      "learning_rate": 4.704803160870888e-06,
      "loss": 0.6297,
      "step": 1040
    },
    {
      "epoch": 32.30769230769231,
      "grad_norm": 1.519518256187439,
      "learning_rate": 4.6901752354885166e-06,
      "loss": 0.617,
      "step": 1050
    },
    {
      "epoch": 32.61538461538461,
      "grad_norm": 1.3290555477142334,
      "learning_rate": 4.675217477771779e-06,
      "loss": 0.6476,
      "step": 1060
    },
    {
      "epoch": 32.92307692307692,
      "grad_norm": 1.5469777584075928,
      "learning_rate": 4.659932140303967e-06,
      "loss": 0.5633,
      "step": 1070
    },
    {
      "epoch": 33.23076923076923,
      "grad_norm": 1.5148197412490845,
      "learning_rate": 4.644321525000681e-06,
      "loss": 0.5595,
      "step": 1080
    },
    {
      "epoch": 33.53846153846154,
      "grad_norm": 1.7467495203018188,
      "learning_rate": 4.628387982763163e-06,
      "loss": 0.5691,
      "step": 1090
    },
    {
      "epoch": 33.84615384615385,
      "grad_norm": 1.9587457180023193,
      "learning_rate": 4.612133913124268e-06,
      "loss": 0.4897,
      "step": 1100
    },
    {
      "epoch": 34.15384615384615,
      "grad_norm": 1.3276329040527344,
      "learning_rate": 4.595561763887095e-06,
      "loss": 0.5177,
      "step": 1110
    },
    {
      "epoch": 34.46153846153846,
      "grad_norm": 1.5996724367141724,
      "learning_rate": 4.578674030756364e-06,
      "loss": 0.4505,
      "step": 1120
    },
    {
      "epoch": 34.76923076923077,
      "grad_norm": 1.8829643726348877,
      "learning_rate": 4.561473256962564e-06,
      "loss": 0.5024,
      "step": 1130
    },
    {
      "epoch": 35.07692307692308,
      "grad_norm": 2.2122015953063965,
      "learning_rate": 4.54396203287896e-06,
      "loss": 0.466,
      "step": 1140
    },
    {
      "epoch": 35.38461538461539,
      "grad_norm": 1.784812092781067,
      "learning_rate": 4.526142995631488e-06,
      "loss": 0.4057,
      "step": 1150
    },
    {
      "epoch": 35.69230769230769,
      "grad_norm": 1.3397154808044434,
      "learning_rate": 4.508018828701613e-06,
      "loss": 0.4513,
      "step": 1160
    },
    {
      "epoch": 36.0,
      "grad_norm": 368474.65625,
      "learning_rate": 4.489592261522209e-06,
      "loss": 0.3743,
      "step": 1170
    },
    {
      "epoch": 36.30769230769231,
      "grad_norm": 1.8128660917282104,
      "learning_rate": 4.470866069066516e-06,
      "loss": 0.3919,
      "step": 1180
    },
    {
      "epoch": 36.61538461538461,
      "grad_norm": 1.7692885398864746,
      "learning_rate": 4.451843071430236e-06,
      "loss": 0.3589,
      "step": 1190
    },
    {
      "epoch": 36.92307692307692,
      "grad_norm": 1.45668363571167,
      "learning_rate": 4.432526133406843e-06,
      "loss": 0.3578,
      "step": 1200
    },
    {
      "epoch": 37.23076923076923,
      "grad_norm": 1.255777359008789,
      "learning_rate": 4.412918164056148e-06,
      "loss": 0.3436,
      "step": 1210
    },
    {
      "epoch": 37.53846153846154,
      "grad_norm": 1.4039700031280518,
      "learning_rate": 4.393022116266212e-06,
      "loss": 0.3054,
      "step": 1220
    },
    {
      "epoch": 37.84615384615385,
      "grad_norm": 2.489480495452881,
      "learning_rate": 4.372840986308649e-06,
      "loss": 0.3145,
      "step": 1230
    },
    {
      "epoch": 38.15384615384615,
      "grad_norm": 2.235471725463867,
      "learning_rate": 4.352377813387398e-06,
      "loss": 0.3915,
      "step": 1240
    },
    {
      "epoch": 38.46153846153846,
      "grad_norm": 1.5027036666870117,
      "learning_rate": 4.331635679181032e-06,
      "loss": 0.2823,
      "step": 1250
    },
    {
      "epoch": 38.76923076923077,
      "grad_norm": 1.8041187524795532,
      "learning_rate": 4.3106177073786684e-06,
      "loss": 0.2837,
      "step": 1260
    },
    {
      "epoch": 39.07692307692308,
      "grad_norm": 1.359180212020874,
      "learning_rate": 4.289327063209548e-06,
      "loss": 0.277,
      "step": 1270
    },
    {
      "epoch": 39.38461538461539,
      "grad_norm": 1.5031533241271973,
      "learning_rate": 4.267766952966369e-06,
      "loss": 0.2959,
      "step": 1280
    },
    {
      "epoch": 39.69230769230769,
      "grad_norm": 1.5086129903793335,
      "learning_rate": 4.245940623522433e-06,
      "loss": 0.236,
      "step": 1290
    },
    {
      "epoch": 40.0,
      "grad_norm": 8115298.5,
      "learning_rate": 4.223851361842668e-06,
      "loss": 0.2243,
      "step": 1300
    },
    {
      "epoch": 40.30769230769231,
      "grad_norm": 1.59078049659729,
      "learning_rate": 4.201502494488633e-06,
      "loss": 0.2423,
      "step": 1310
    },
    {
      "epoch": 40.61538461538461,
      "grad_norm": 1.6530065536499023,
      "learning_rate": 4.178897387117547e-06,
      "loss": 0.2217,
      "step": 1320
    },
    {
      "epoch": 40.92307692307692,
      "grad_norm": 1.6133264303207397,
      "learning_rate": 4.15603944397543e-06,
      "loss": 0.2185,
      "step": 1330
    },
    {
      "epoch": 41.23076923076923,
      "grad_norm": 1.7328965663909912,
      "learning_rate": 4.132932107384442e-06,
      "loss": 0.2183,
      "step": 1340
    },
    {
      "epoch": 41.53846153846154,
      "grad_norm": 1.5189462900161743,
      "learning_rate": 4.109578857224478e-06,
      "loss": 0.2167,
      "step": 1350
    },
    {
      "epoch": 41.84615384615385,
      "grad_norm": 1.807603359222412,
      "learning_rate": 4.085983210409114e-06,
      "loss": 0.193,
      "step": 1360
    },
    {
      "epoch": 42.15384615384615,
      "grad_norm": 1.4981666803359985,
      "learning_rate": 4.062148720355967e-06,
      "loss": 0.1724,
      "step": 1370
    },
    {
      "epoch": 42.46153846153846,
      "grad_norm": 1.40492844581604,
      "learning_rate": 4.038078976451567e-06,
      "loss": 0.1658,
      "step": 1380
    },
    {
      "epoch": 42.76923076923077,
      "grad_norm": 1.4978539943695068,
      "learning_rate": 4.013777603510815e-06,
      "loss": 0.1555,
      "step": 1390
    },
    {
      "epoch": 43.07692307692308,
      "grad_norm": 1.2119354009628296,
      "learning_rate": 3.989248261231084e-06,
      "loss": 0.1833,
      "step": 1400
    },
    {
      "epoch": 43.38461538461539,
      "grad_norm": 1.857128381729126,
      "learning_rate": 3.964494643641097e-06,
      "loss": 0.1622,
      "step": 1410
    },
    {
      "epoch": 43.69230769230769,
      "grad_norm": 1.624523639678955,
      "learning_rate": 3.939520478544614e-06,
      "loss": 0.1499,
      "step": 1420
    },
    {
      "epoch": 44.0,
      "grad_norm": 53768.84375,
      "learning_rate": 3.914329526959033e-06,
      "loss": 0.1712,
      "step": 1430
    },
    {
      "epoch": 44.30769230769231,
      "grad_norm": 1.3771990537643433,
      "learning_rate": 3.888925582549006e-06,
      "loss": 0.1334,
      "step": 1440
    },
    {
      "epoch": 44.61538461538461,
      "grad_norm": 2.549403190612793,
      "learning_rate": 3.863312471055116e-06,
      "loss": 0.1833,
      "step": 1450
    },
    {
      "epoch": 44.92307692307692,
      "grad_norm": 1.4592410326004028,
      "learning_rate": 3.8374940497177435e-06,
      "loss": 0.1174,
      "step": 1460
    },
    {
      "epoch": 45.23076923076923,
      "grad_norm": 1.5769482851028442,
      "learning_rate": 3.8114742066961722e-06,
      "loss": 0.1207,
      "step": 1470
    },
    {
      "epoch": 45.53846153846154,
      "grad_norm": 1.1684468984603882,
      "learning_rate": 3.785256860483054e-06,
      "loss": 0.1275,
      "step": 1480
    },
    {
      "epoch": 45.84615384615385,
      "grad_norm": 1.1915509700775146,
      "learning_rate": 3.7588459593142944e-06,
      "loss": 0.1253,
      "step": 1490
    },
    {
      "epoch": 46.15384615384615,
      "grad_norm": 1.3312920331954956,
      "learning_rate": 3.7322454805744605e-06,
      "loss": 0.1159,
      "step": 1500
    },
    {
      "epoch": 46.46153846153846,
      "grad_norm": 1.1740134954452515,
      "learning_rate": 3.7054594301978075e-06,
      "loss": 0.1312,
      "step": 1510
    },
    {
      "epoch": 46.76923076923077,
      "grad_norm": 0.9904786348342896,
      "learning_rate": 3.6784918420649952e-06,
      "loss": 0.1171,
      "step": 1520
    },
    {
      "epoch": 47.07692307692308,
      "grad_norm": 1.7853026390075684,
      "learning_rate": 3.6513467773956002e-06,
      "loss": 0.1229,
      "step": 1530
    },
    {
      "epoch": 47.38461538461539,
      "grad_norm": 0.924367368221283,
      "learning_rate": 3.624028324136517e-06,
      "loss": 0.1087,
      "step": 1540
    },
    {
      "epoch": 47.69230769230769,
      "grad_norm": 1.3565295934677124,
      "learning_rate": 3.5965405963463197e-06,
      "loss": 0.113,
      "step": 1550
    },
    {
      "epoch": 48.0,
      "grad_norm": 27509738.0,
      "learning_rate": 3.5688877335757055e-06,
      "loss": 0.0949,
      "step": 1560
    },
    {
      "epoch": 48.30769230769231,
      "grad_norm": 1.4091086387634277,
      "learning_rate": 3.5410739002440938e-06,
      "loss": 0.095,
      "step": 1570
    },
    {
      "epoch": 48.61538461538461,
      "grad_norm": 1.1597000360488892,
      "learning_rate": 3.5131032850124745e-06,
      "loss": 0.0987,
      "step": 1580
    },
    {
      "epoch": 48.92307692307692,
      "grad_norm": 1.2181200981140137,
      "learning_rate": 3.484980100152621e-06,
      "loss": 0.0894,
      "step": 1590
    },
    {
      "epoch": 49.23076923076923,
      "grad_norm": 1.0059666633605957,
      "learning_rate": 3.4567085809127247e-06,
      "loss": 0.0962,
      "step": 1600
    },
    {
      "epoch": 49.53846153846154,
      "grad_norm": 1.6103034019470215,
      "learning_rate": 3.4282929848795944e-06,
      "loss": 0.0842,
      "step": 1610
    },
    {
      "epoch": 49.84615384615385,
      "grad_norm": 1.6124812364578247,
      "learning_rate": 3.399737591337471e-06,
      "loss": 0.0981,
      "step": 1620
    },
    {
      "epoch": 50.15384615384615,
      "grad_norm": 1.14884614944458,
      "learning_rate": 3.3710467006235865e-06,
      "loss": 0.0996,
      "step": 1630
    },
    {
      "epoch": 50.46153846153846,
      "grad_norm": 1.4607173204421997,
      "learning_rate": 3.3422246334805504e-06,
      "loss": 0.1034,
      "step": 1640
    },
    {
      "epoch": 50.76923076923077,
      "grad_norm": 1.092631459236145,
      "learning_rate": 3.313275730405658e-06,
      "loss": 0.0801,
      "step": 1650
    },
    {
      "epoch": 51.07692307692308,
      "grad_norm": 1.0797463655471802,
      "learning_rate": 3.2842043509972294e-06,
      "loss": 0.0795,
      "step": 1660
    },
    {
      "epoch": 51.38461538461539,
      "grad_norm": 1.503728985786438,
      "learning_rate": 3.2550148732980707e-06,
      "loss": 0.0744,
      "step": 1670
    },
    {
      "epoch": 51.69230769230769,
      "grad_norm": 1.0997803211212158,
      "learning_rate": 3.225711693136156e-06,
      "loss": 0.0741,
      "step": 1680
    },
    {
      "epoch": 52.0,
      "grad_norm": 10298226.0,
      "learning_rate": 3.196299223462633e-06,
      "loss": 0.0699,
      "step": 1690
    },
    {
      "epoch": 52.30769230769231,
      "grad_norm": 1.0617835521697998,
      "learning_rate": 3.1667818936872463e-06,
      "loss": 0.074,
      "step": 1700
    },
    {
      "epoch": 52.61538461538461,
      "grad_norm": 0.8478608131408691,
      "learning_rate": 3.137164149011287e-06,
      "loss": 0.0577,
      "step": 1710
    },
    {
      "epoch": 52.92307692307692,
      "grad_norm": 1.827232003211975,
      "learning_rate": 3.10745044975816e-06,
      "loss": 0.0871,
      "step": 1720
    },
    {
      "epoch": 53.23076923076923,
      "grad_norm": 1.0407119989395142,
      "learning_rate": 3.0776452707016784e-06,
      "loss": 0.0682,
      "step": 1730
    },
    {
      "epoch": 53.53846153846154,
      "grad_norm": 1.375914454460144,
      "learning_rate": 3.0477531003921745e-06,
      "loss": 0.0735,
      "step": 1740
    },
    {
      "epoch": 53.84615384615385,
      "grad_norm": 1.3044782876968384,
      "learning_rate": 3.0177784404805466e-06,
      "loss": 0.0617,
      "step": 1750
    },
    {
      "epoch": 54.15384615384615,
      "grad_norm": 0.6784080862998962,
      "learning_rate": 2.9877258050403214e-06,
      "loss": 0.0494,
      "step": 1760
    },
    {
      "epoch": 54.46153846153846,
      "grad_norm": 1.0913127660751343,
      "learning_rate": 2.957599719887853e-06,
      "loss": 0.0685,
      "step": 1770
    },
    {
      "epoch": 54.76923076923077,
      "grad_norm": 1.2988967895507812,
      "learning_rate": 2.9274047219007533e-06,
      "loss": 0.069,
      "step": 1780
    },
    {
      "epoch": 55.07692307692308,
      "grad_norm": 1.0163013935089111,
      "learning_rate": 2.8971453583346536e-06,
      "loss": 0.0545,
      "step": 1790
    },
    {
      "epoch": 55.38461538461539,
      "grad_norm": 0.7982223033905029,
      "learning_rate": 2.8668261861384045e-06,
      "loss": 0.0509,
      "step": 1800
    },
    {
      "epoch": 55.69230769230769,
      "grad_norm": 1.5185413360595703,
      "learning_rate": 2.8364517712678157e-06,
      "loss": 0.0708,
      "step": 1810
    },
    {
      "epoch": 56.0,
      "grad_norm": 54588.5390625,
      "learning_rate": 2.806026687998041e-06,
      "loss": 0.0531,
      "step": 1820
    },
    {
      "epoch": 56.30769230769231,
      "grad_norm": 0.9050544500350952,
      "learning_rate": 2.775555518234708e-06,
      "loss": 0.0529,
      "step": 1830
    },
    {
      "epoch": 56.61538461538461,
      "grad_norm": 1.0293651819229126,
      "learning_rate": 2.7450428508239024e-06,
      "loss": 0.0572,
      "step": 1840
    },
    {
      "epoch": 56.92307692307692,
      "grad_norm": 1.1838538646697998,
      "learning_rate": 2.7144932808611002e-06,
      "loss": 0.0616,
      "step": 1850
    },
    {
      "epoch": 57.23076923076923,
      "grad_norm": 1.2836631536483765,
      "learning_rate": 2.683911408999169e-06,
      "loss": 0.0536,
      "step": 1860
    },
    {
      "epoch": 57.53846153846154,
      "grad_norm": 1.0823341608047485,
      "learning_rate": 2.6533018407555216e-06,
      "loss": 0.0476,
      "step": 1870
    },
    {
      "epoch": 57.84615384615385,
      "grad_norm": 1.0420117378234863,
      "learning_rate": 2.6226691858185454e-06,
      "loss": 0.0527,
      "step": 1880
    },
    {
      "epoch": 58.15384615384615,
      "grad_norm": 1.0533287525177002,
      "learning_rate": 2.5920180573533975e-06,
      "loss": 0.0623,
      "step": 1890
    },
    {
      "epoch": 58.46153846153846,
      "grad_norm": 0.9571561217308044,
      "learning_rate": 2.561353071307281e-06,
      "loss": 0.0498,
      "step": 1900
    },
    {
      "epoch": 58.76923076923077,
      "grad_norm": 1.224555253982544,
      "learning_rate": 2.5306788457143e-06,
      "loss": 0.0447,
      "step": 1910
    },
    {
      "epoch": 59.07692307692308,
      "grad_norm": 1.900484561920166,
      "learning_rate": 2.5e-06,
      "loss": 0.0578,
      "step": 1920
    },
    {
      "epoch": 59.38461538461539,
      "grad_norm": 0.9753430485725403,
      "learning_rate": 2.4693211542857005e-06,
      "loss": 0.052,
      "step": 1930
    },
    {
      "epoch": 59.69230769230769,
      "grad_norm": 1.0865504741668701,
      "learning_rate": 2.43864692869272e-06,
      "loss": 0.0447,
      "step": 1940
    },
    {
      "epoch": 60.0,
      "grad_norm": 39835.109375,
      "learning_rate": 2.407981942646603e-06,
      "loss": 0.0477,
      "step": 1950
    },
    {
      "epoch": 60.30769230769231,
      "grad_norm": 1.1388822793960571,
      "learning_rate": 2.377330814181455e-06,
      "loss": 0.0516,
      "step": 1960
    },
    {
      "epoch": 60.61538461538461,
      "grad_norm": 1.2782968282699585,
      "learning_rate": 2.346698159244479e-06,
      "loss": 0.0501,
      "step": 1970
    },
    {
      "epoch": 60.92307692307692,
      "grad_norm": 1.6132394075393677,
      "learning_rate": 2.3160885910008317e-06,
      "loss": 0.0408,
      "step": 1980
    },
    {
      "epoch": 61.23076923076923,
      "grad_norm": 0.8268479108810425,
      "learning_rate": 2.2855067191389006e-06,
      "loss": 0.0407,
      "step": 1990
    },
    {
      "epoch": 61.53846153846154,
      "grad_norm": 0.9518911838531494,
      "learning_rate": 2.2549571491760985e-06,
      "loss": 0.0455,
      "step": 2000
    },
    {
      "epoch": 61.84615384615385,
      "grad_norm": 1.2631924152374268,
      "learning_rate": 2.2244444817652923e-06,
      "loss": 0.0462,
      "step": 2010
    },
    {
      "epoch": 62.15384615384615,
      "grad_norm": 1.0491702556610107,
      "learning_rate": 2.19397331200196e-06,
      "loss": 0.0384,
      "step": 2020
    },
    {
      "epoch": 62.46153846153846,
      "grad_norm": 1.37276291847229,
      "learning_rate": 2.1635482287321848e-06,
      "loss": 0.0384,
      "step": 2030
    },
    {
      "epoch": 62.76923076923077,
      "grad_norm": 0.911716103553772,
      "learning_rate": 2.133173813861596e-06,
      "loss": 0.0489,
      "step": 2040
    },
    {
      "epoch": 63.07692307692308,
      "grad_norm": 0.8453769683837891,
      "learning_rate": 2.102854641665347e-06,
      "loss": 0.0519,
      "step": 2050
    },
    {
      "epoch": 63.38461538461539,
      "grad_norm": 1.1368571519851685,
      "learning_rate": 2.072595278099247e-06,
      "loss": 0.0383,
      "step": 2060
    },
    {
      "epoch": 63.69230769230769,
      "grad_norm": 1.6040756702423096,
      "learning_rate": 2.042400280112148e-06,
      "loss": 0.0421,
      "step": 2070
    },
    {
      "epoch": 64.0,
      "grad_norm": 664656.875,
      "learning_rate": 2.01227419495968e-06,
      "loss": 0.0473,
      "step": 2080
    },
    {
      "epoch": 64.3076923076923,
      "grad_norm": 0.9847516417503357,
      "learning_rate": 1.982221559519454e-06,
      "loss": 0.042,
      "step": 2090
    },
    {
      "epoch": 64.61538461538461,
      "grad_norm": 0.77010577917099,
      "learning_rate": 1.952246899607826e-06,
      "loss": 0.042,
      "step": 2100
    },
    {
      "epoch": 64.92307692307692,
      "grad_norm": 1.2908334732055664,
      "learning_rate": 1.9223547292983225e-06,
      "loss": 0.0416,
      "step": 2110
    },
    {
      "epoch": 65.23076923076923,
      "grad_norm": 0.7256491184234619,
      "learning_rate": 1.8925495502418407e-06,
      "loss": 0.0353,
      "step": 2120
    },
    {
      "epoch": 65.53846153846153,
      "grad_norm": 1.212572455406189,
      "learning_rate": 1.862835850988714e-06,
      "loss": 0.0427,
      "step": 2130
    },
    {
      "epoch": 65.84615384615384,
      "grad_norm": 1.154390811920166,
      "learning_rate": 1.8332181063127543e-06,
      "loss": 0.0396,
      "step": 2140
    },
    {
      "epoch": 66.15384615384616,
      "grad_norm": 0.8889743685722351,
      "learning_rate": 1.8037007765373677e-06,
      "loss": 0.0381,
      "step": 2150
    },
    {
      "epoch": 66.46153846153847,
      "grad_norm": 1.1566401720046997,
      "learning_rate": 1.7742883068638447e-06,
      "loss": 0.041,
      "step": 2160
    },
    {
      "epoch": 66.76923076923077,
      "grad_norm": 0.9389579892158508,
      "learning_rate": 1.74498512670193e-06,
      "loss": 0.0385,
      "step": 2170
    },
    {
      "epoch": 67.07692307692308,
      "grad_norm": 0.709125816822052,
      "learning_rate": 1.7157956490027716e-06,
      "loss": 0.0432,
      "step": 2180
    },
    {
      "epoch": 67.38461538461539,
      "grad_norm": 1.1108027696609497,
      "learning_rate": 1.686724269594343e-06,
      "loss": 0.0395,
      "step": 2190
    },
    {
      "epoch": 67.6923076923077,
      "grad_norm": 1.087064266204834,
      "learning_rate": 1.6577753665194502e-06,
      "loss": 0.0402,
      "step": 2200
    },
    {
      "epoch": 68.0,
      "grad_norm": 3785425.75,
      "learning_rate": 1.628953299376414e-06,
      "loss": 0.0351,
      "step": 2210
    },
    {
      "epoch": 68.3076923076923,
      "grad_norm": 0.9498631954193115,
      "learning_rate": 1.6002624086625296e-06,
      "loss": 0.0402,
      "step": 2220
    },
    {
      "epoch": 68.61538461538461,
      "grad_norm": 0.6672590374946594,
      "learning_rate": 1.5717070151204064e-06,
      "loss": 0.0333,
      "step": 2230
    },
    {
      "epoch": 68.92307692307692,
      "grad_norm": 0.671436607837677,
      "learning_rate": 1.5432914190872757e-06,
      "loss": 0.0387,
      "step": 2240
    },
    {
      "epoch": 69.23076923076923,
      "grad_norm": 0.7344552874565125,
      "learning_rate": 1.5150198998473802e-06,
      "loss": 0.0345,
      "step": 2250
    },
    {
      "epoch": 69.53846153846153,
      "grad_norm": 0.5938952565193176,
      "learning_rate": 1.4868967149875257e-06,
      "loss": 0.0343,
      "step": 2260
    },
    {
      "epoch": 69.84615384615384,
      "grad_norm": 0.862509548664093,
      "learning_rate": 1.4589260997559077e-06,
      "loss": 0.0359,
      "step": 2270
    },
    {
      "epoch": 70.15384615384616,
      "grad_norm": 0.8928817510604858,
      "learning_rate": 1.4311122664242955e-06,
      "loss": 0.0406,
      "step": 2280
    },
    {
      "epoch": 70.46153846153847,
      "grad_norm": 0.81640625,
      "learning_rate": 1.4034594036536816e-06,
      "loss": 0.0332,
      "step": 2290
    },
    {
      "epoch": 70.76923076923077,
      "grad_norm": 0.5396949648857117,
      "learning_rate": 1.3759716758634833e-06,
      "loss": 0.0389,
      "step": 2300
    },
    {
      "epoch": 71.07692307692308,
      "grad_norm": 0.918347954750061,
      "learning_rate": 1.3486532226044e-06,
      "loss": 0.0329,
      "step": 2310
    },
    {
      "epoch": 71.38461538461539,
      "grad_norm": 0.8731901049613953,
      "learning_rate": 1.3215081579350058e-06,
      "loss": 0.0355,
      "step": 2320
    },
    {
      "epoch": 71.6923076923077,
      "grad_norm": 0.9578891396522522,
      "learning_rate": 1.294540569802193e-06,
      "loss": 0.037,
      "step": 2330
    },
    {
      "epoch": 72.0,
      "grad_norm": 3753634.75,
      "learning_rate": 1.2677545194255403e-06,
      "loss": 0.0372,
      "step": 2340
    },
    {
      "epoch": 72.3076923076923,
      "grad_norm": 0.9850757122039795,
      "learning_rate": 1.2411540406857064e-06,
      "loss": 0.0388,
      "step": 2350
    },
    {
      "epoch": 72.61538461538461,
      "grad_norm": 1.0889559984207153,
      "learning_rate": 1.214743139516946e-06,
      "loss": 0.0329,
      "step": 2360
    },
    {
      "epoch": 72.92307692307692,
      "grad_norm": 1.2561447620391846,
      "learning_rate": 1.1885257933038282e-06,
      "loss": 0.0334,
      "step": 2370
    },
    {
      "epoch": 73.23076923076923,
      "grad_norm": 1.08214271068573,
      "learning_rate": 1.1625059502822575e-06,
      "loss": 0.0438,
      "step": 2380
    },
    {
      "epoch": 73.53846153846153,
      "grad_norm": 0.6585337519645691,
      "learning_rate": 1.1366875289448844e-06,
      "loss": 0.0283,
      "step": 2390
    },
    {
      "epoch": 73.84615384615384,
      "grad_norm": 1.001531720161438,
      "learning_rate": 1.1110744174509952e-06,
      "loss": 0.0405,
      "step": 2400
    },
    {
      "epoch": 74.15384615384616,
      "grad_norm": 0.774890124797821,
      "learning_rate": 1.0856704730409667e-06,
      "loss": 0.035,
      "step": 2410
    },
    {
      "epoch": 74.46153846153847,
      "grad_norm": 0.8112567663192749,
      "learning_rate": 1.0604795214553867e-06,
      "loss": 0.0371,
      "step": 2420
    },
    {
      "epoch": 74.76923076923077,
      "grad_norm": 0.7539799809455872,
      "learning_rate": 1.035505356358903e-06,
      "loss": 0.0363,
      "step": 2430
    },
    {
      "epoch": 75.07692307692308,
      "grad_norm": 1.047004222869873,
      "learning_rate": 1.0107517387689168e-06,
      "loss": 0.0305,
      "step": 2440
    },
    {
      "epoch": 75.38461538461539,
      "grad_norm": 0.5877715945243835,
      "learning_rate": 9.862223964891864e-07,
      "loss": 0.032,
      "step": 2450
    },
    {
      "epoch": 75.6923076923077,
      "grad_norm": 0.7707840204238892,
      "learning_rate": 9.61921023548433e-07,
      "loss": 0.0345,
      "step": 2460
    },
    {
      "epoch": 76.0,
      "grad_norm": 8456043.0,
      "learning_rate": 9.378512796440345e-07,
      "loss": 0.0362,
      "step": 2470
    },
    {
      "epoch": 76.3076923076923,
      "grad_norm": 1.2004197835922241,
      "learning_rate": 9.140167895908867e-07,
      "loss": 0.0318,
      "step": 2480
    },
    {
      "epoch": 76.61538461538461,
      "grad_norm": 0.7192149758338928,
      "learning_rate": 8.904211427755219e-07,
      "loss": 0.0339,
      "step": 2490
    },
    {
      "epoch": 76.92307692307692,
      "grad_norm": 0.8858992457389832,
      "learning_rate": 8.670678926155588e-07,
      "loss": 0.0365,
      "step": 2500
    }
  ],
  "logging_steps": 10,
  "max_steps": 3200,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 100,
  "save_steps": 50,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 4.4736165944792064e+17,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}