End of training

Browse files

Files changed (7) hide show

README.md +3 -2
all_results.json +13 -0
eval_results.json +8 -0
train_results.json +8 -0
trainer_state.json +2442 -0
training_eval_loss.png +0 -0
training_loss.png +0 -0

README.md CHANGED Viewed

@@ -4,6 +4,7 @@ license: llama3.2
 base_model: meta-llama/Llama-3.2-1B
 tags:
 - llama-factory
 - generated_from_trainer
 model-index:
 - name: mcmaster-llama3-1b-full-pt
@@ -15,9 +16,9 @@ should probably proofread and complete it, then remove this comment. -->
 # mcmaster-llama3-1b-full-pt
-This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on an unknown dataset.
 It achieves the following results on the evaluation set:
-- Loss: 0.0320
 ## Model description

 base_model: meta-llama/Llama-3.2-1B
 tags:
 - llama-factory
+- full
 - generated_from_trainer
 model-index:
 - name: mcmaster-llama3-1b-full-pt
 # mcmaster-llama3-1b-full-pt
+This model is a fine-tuned version of [meta-llama/Llama-3.2-1B](https://huggingface.co/meta-llama/Llama-3.2-1B) on the mcmaster dataset.
 It achieves the following results on the evaluation set:
+- Loss: 0.0319
 ## Model description

all_results.json ADDED Viewed

	@@ -0,0 +1,13 @@

+{
+    "epoch": 4.998698642870422,
+    "eval_loss": 0.03189178928732872,
+    "eval_runtime": 212.4751,
+    "eval_samples_per_second": 22.501,
+    "eval_steps_per_second": 22.501,
+    "perplexity": 1.0324057818949677,
+    "total_flos": 5.14290499398402e+18,
+    "train_loss": 0.19145491501161208,
+    "train_runtime": 31931.328,
+    "train_samples_per_second": 6.737,
+    "train_steps_per_second": 0.105
+}

eval_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 4.998698642870422,
+    "eval_loss": 0.03189178928732872,
+    "eval_runtime": 212.4751,
+    "eval_samples_per_second": 22.501,
+    "eval_steps_per_second": 22.501,
+    "perplexity": 1.0324057818949677
+}

train_results.json ADDED Viewed

	@@ -0,0 +1,8 @@

+{
+    "epoch": 4.998698642870422,
+    "total_flos": 5.14290499398402e+18,
+    "train_loss": 0.19145491501161208,
+    "train_runtime": 31931.328,
+    "train_samples_per_second": 6.737,
+    "train_steps_per_second": 0.105
+}

trainer_state.json ADDED Viewed

	@@ -0,0 +1,2442 @@

+{
+  "best_metric": null,
+  "best_model_checkpoint": null,
+  "epoch": 4.998698642870422,
+  "eval_steps": 500,
+  "global_step": 3360,
+  "is_hyper_param_search": false,
+  "is_local_process_zero": true,
+  "is_world_process_zero": true,
+  "log_history": [
+    {
+      "epoch": 0.014872652909462726,
+      "grad_norm": 1.0425002574920654,
+      "learning_rate": 8.928571428571428e-06,
+      "loss": 0.4346,
+      "step": 10
+    },
+    {
+      "epoch": 0.02974530581892545,
+      "grad_norm": 0.5861272811889648,
+      "learning_rate": 1.7857142857142855e-05,
+      "loss": 0.3527,
+      "step": 20
+    },
+    {
+      "epoch": 0.044617958728388175,
+      "grad_norm": 0.5629558563232422,
+      "learning_rate": 2.6785714285714284e-05,
+      "loss": 0.2922,
+      "step": 30
+    },
+    {
+      "epoch": 0.0594906116378509,
+      "grad_norm": 0.49566933512687683,
+      "learning_rate": 3.571428571428571e-05,
+      "loss": 0.281,
+      "step": 40
+    },
+    {
+      "epoch": 0.07436326454731362,
+      "grad_norm": 0.45387113094329834,
+      "learning_rate": 4.4642857142857136e-05,
+      "loss": 0.2689,
+      "step": 50
+    },
+    {
+      "epoch": 0.08923591745677635,
+      "grad_norm": 0.43913352489471436,
+      "learning_rate": 5.357142857142857e-05,
+      "loss": 0.2506,
+      "step": 60
+    },
+    {
+      "epoch": 0.10410857036623908,
+      "grad_norm": 0.7242547869682312,
+      "learning_rate": 6.25e-05,
+      "loss": 0.229,
+      "step": 70
+    },
+    {
+      "epoch": 0.1189812232757018,
+      "grad_norm": 0.5109072923660278,
+      "learning_rate": 7.142857142857142e-05,
+      "loss": 0.2373,
+      "step": 80
+    },
+    {
+      "epoch": 0.13385387618516453,
+      "grad_norm": 0.5291035175323486,
+      "learning_rate": 8.035714285714285e-05,
+      "loss": 0.2405,
+      "step": 90
+    },
+    {
+      "epoch": 0.14872652909462725,
+      "grad_norm": 0.48036572337150574,
+      "learning_rate": 8.928571428571427e-05,
+      "loss": 0.2274,
+      "step": 100
+    },
+    {
+      "epoch": 0.16359918200409,
+      "grad_norm": 0.3294093906879425,
+      "learning_rate": 9.82142857142857e-05,
+      "loss": 0.2038,
+      "step": 110
+    },
+    {
+      "epoch": 0.1784718349135527,
+      "grad_norm": 0.49968525767326355,
+      "learning_rate": 0.00010714285714285714,
+      "loss": 0.2084,
+      "step": 120
+    },
+    {
+      "epoch": 0.19334448782301544,
+      "grad_norm": 0.32227209210395813,
+      "learning_rate": 0.00011607142857142857,
+      "loss": 0.1981,
+      "step": 130
+    },
+    {
+      "epoch": 0.20821714073247816,
+      "grad_norm": 0.37266677618026733,
+      "learning_rate": 0.000125,
+      "loss": 0.2192,
+      "step": 140
+    },
+    {
+      "epoch": 0.22308979364194087,
+      "grad_norm": 0.5228686928749084,
+      "learning_rate": 0.00013392857142857144,
+      "loss": 0.2014,
+      "step": 150
+    },
+    {
+      "epoch": 0.2379624465514036,
+      "grad_norm": 0.4202245771884918,
+      "learning_rate": 0.00014285714285714284,
+      "loss": 0.1912,
+      "step": 160
+    },
+    {
+      "epoch": 0.25283509946086635,
+      "grad_norm": 0.45801258087158203,
+      "learning_rate": 0.00015178571428571427,
+      "loss": 0.212,
+      "step": 170
+    },
+    {
+      "epoch": 0.26770775237032907,
+      "grad_norm": 0.4326329827308655,
+      "learning_rate": 0.0001607142857142857,
+      "loss": 0.1973,
+      "step": 180
+    },
+    {
+      "epoch": 0.2825804052797918,
+      "grad_norm": 0.38971471786499023,
+      "learning_rate": 0.0001696428571428571,
+      "loss": 0.1907,
+      "step": 190
+    },
+    {
+      "epoch": 0.2974530581892545,
+      "grad_norm": 0.3728097975254059,
+      "learning_rate": 0.00017857142857142854,
+      "loss": 0.192,
+      "step": 200
+    },
+    {
+      "epoch": 0.3123257110987172,
+      "grad_norm": 0.34695690870285034,
+      "learning_rate": 0.00018749999999999998,
+      "loss": 0.1855,
+      "step": 210
+    },
+    {
+      "epoch": 0.32719836400818,
+      "grad_norm": 0.41753408312797546,
+      "learning_rate": 0.0001964285714285714,
+      "loss": 0.1883,
+      "step": 220
+    },
+    {
+      "epoch": 0.3420710169176427,
+      "grad_norm": 0.27681878209114075,
+      "learning_rate": 0.00020535714285714284,
+      "loss": 0.1809,
+      "step": 230
+    },
+    {
+      "epoch": 0.3569436698271054,
+      "grad_norm": 2.382871150970459,
+      "learning_rate": 0.00021428571428571427,
+      "loss": 0.1735,
+      "step": 240
+    },
+    {
+      "epoch": 0.3718163227365681,
+      "grad_norm": 160.2670440673828,
+      "learning_rate": 0.0002232142857142857,
+      "loss": 1.2159,
+      "step": 250
+    },
+    {
+      "epoch": 0.3866889756460309,
+      "grad_norm": 21.60050392150879,
+      "learning_rate": 0.00023214285714285714,
+      "loss": 5.4026,
+      "step": 260
+    },
+    {
+      "epoch": 0.4015616285554936,
+      "grad_norm": 13.928524017333984,
+      "learning_rate": 0.00024107142857142857,
+      "loss": 4.3573,
+      "step": 270
+    },
+    {
+      "epoch": 0.4164342814649563,
+      "grad_norm": 5.3707685470581055,
+      "learning_rate": 0.00025,
+      "loss": 3.2782,
+      "step": 280
+    },
+    {
+      "epoch": 0.431306934374419,
+      "grad_norm": 5.556903839111328,
+      "learning_rate": 0.0002589285714285714,
+      "loss": 2.8033,
+      "step": 290
+    },
+    {
+      "epoch": 0.44617958728388174,
+      "grad_norm": 2.512521505355835,
+      "learning_rate": 0.00026785714285714287,
+      "loss": 2.5486,
+      "step": 300
+    },
+    {
+      "epoch": 0.4610522401933445,
+      "grad_norm": 3.592169761657715,
+      "learning_rate": 0.0002767857142857143,
+      "loss": 2.2779,
+      "step": 310
+    },
+    {
+      "epoch": 0.4759248931028072,
+      "grad_norm": 2.791459321975708,
+      "learning_rate": 0.0002857142857142857,
+      "loss": 2.1011,
+      "step": 320
+    },
+    {
+      "epoch": 0.49079754601226994,
+      "grad_norm": 1.1407463550567627,
+      "learning_rate": 0.0002946428571428571,
+      "loss": 1.9929,
+      "step": 330
+    },
+    {
+      "epoch": 0.5056701989217327,
+      "grad_norm": 1.795841097831726,
+      "learning_rate": 0.0002999987048597728,
+      "loss": 1.8818,
+      "step": 340
+    },
+    {
+      "epoch": 0.5205428518311954,
+      "grad_norm": 1.4798821210861206,
+      "learning_rate": 0.00029998413478906613,
+      "loss": 1.772,
+      "step": 350
+    },
+    {
+      "epoch": 0.5354155047406581,
+      "grad_norm": 1.5337024927139282,
+      "learning_rate": 0.0002999533773001224,
+      "loss": 1.6782,
+      "step": 360
+    },
+    {
+      "epoch": 0.5502881576501208,
+      "grad_norm": 1.332065463066101,
+      "learning_rate": 0.00029990643571252174,
+      "loss": 1.6035,
+      "step": 370
+    },
+    {
+      "epoch": 0.5651608105595836,
+      "grad_norm": 1.0516103506088257,
+      "learning_rate": 0.00029984331509255415,
+      "loss": 1.5053,
+      "step": 380
+    },
+    {
+      "epoch": 0.5800334634690463,
+      "grad_norm": 1.034192442893982,
+      "learning_rate": 0.00029976402225267247,
+      "loss": 1.3906,
+      "step": 390
+    },
+    {
+      "epoch": 0.594906116378509,
+      "grad_norm": 1.2757515907287598,
+      "learning_rate": 0.0002996685657507577,
+      "loss": 1.2592,
+      "step": 400
+    },
+    {
+      "epoch": 0.6097787692879717,
+      "grad_norm": 0.8252782225608826,
+      "learning_rate": 0.000299556955889195,
+      "loss": 1.0907,
+      "step": 410
+    },
+    {
+      "epoch": 0.6246514221974344,
+      "grad_norm": 1.020588994026184,
+      "learning_rate": 0.0002994292047137618,
+      "loss": 0.9035,
+      "step": 420
+    },
+    {
+      "epoch": 0.6395240751068972,
+      "grad_norm": 0.5973761677742004,
+      "learning_rate": 0.0002992853260123278,
+      "loss": 0.7538,
+      "step": 430
+    },
+    {
+      "epoch": 0.65439672801636,
+      "grad_norm": 0.6886543035507202,
+      "learning_rate": 0.0002991253353133668,
+      "loss": 0.6621,
+      "step": 440
+    },
+    {
+      "epoch": 0.6692693809258227,
+      "grad_norm": 0.44221287965774536,
+      "learning_rate": 0.00029894924988428087,
+      "loss": 0.59,
+      "step": 450
+    },
+    {
+      "epoch": 0.6841420338352854,
+      "grad_norm": 0.7888408899307251,
+      "learning_rate": 0.00029875708872953677,
+      "loss": 0.539,
+      "step": 460
+    },
+    {
+      "epoch": 0.6990146867447481,
+      "grad_norm": 0.43110209703445435,
+      "learning_rate": 0.00029854887258861447,
+      "loss": 0.4903,
+      "step": 470
+    },
+    {
+      "epoch": 0.7138873396542108,
+      "grad_norm": 0.41334015130996704,
+      "learning_rate": 0.0002983246239337692,
+      "loss": 0.4488,
+      "step": 480
+    },
+    {
+      "epoch": 0.7287599925636735,
+      "grad_norm": 0.3482460379600525,
+      "learning_rate": 0.0002980843669676061,
+      "loss": 0.4165,
+      "step": 490
+    },
+    {
+      "epoch": 0.7436326454731362,
+      "grad_norm": 0.3593901991844177,
+      "learning_rate": 0.0002978281276204675,
+      "loss": 0.3821,
+      "step": 500
+    },
+    {
+      "epoch": 0.7436326454731362,
+      "eval_loss": 0.37597203254699707,
+      "eval_runtime": 212.4955,
+      "eval_samples_per_second": 22.499,
+      "eval_steps_per_second": 22.499,
+      "step": 500
+    },
+    {
+      "epoch": 0.758505298382599,
+      "grad_norm": 0.4221905469894409,
+      "learning_rate": 0.00029755593354763516,
+      "loss": 0.3627,
+      "step": 510
+    },
+    {
+      "epoch": 0.7733779512920618,
+      "grad_norm": 0.31105437874794006,
+      "learning_rate": 0.0002972678141263449,
+      "loss": 0.3346,
+      "step": 520
+    },
+    {
+      "epoch": 0.7882506042015245,
+      "grad_norm": 0.2600822150707245,
+      "learning_rate": 0.000296963800452616,
+      "loss": 0.3217,
+      "step": 530
+    },
+    {
+      "epoch": 0.8031232571109872,
+      "grad_norm": 0.21437157690525055,
+      "learning_rate": 0.0002966439253378957,
+      "loss": 0.3095,
+      "step": 540
+    },
+    {
+      "epoch": 0.8179959100204499,
+      "grad_norm": 0.22641418874263763,
+      "learning_rate": 0.000296308223305517,
+      "loss": 0.2866,
+      "step": 550
+    },
+    {
+      "epoch": 0.8328685629299126,
+      "grad_norm": 0.2200980931520462,
+      "learning_rate": 0.00029595673058697357,
+      "loss": 0.2579,
+      "step": 560
+    },
+    {
+      "epoch": 0.8477412158393753,
+      "grad_norm": 0.21351036429405212,
+      "learning_rate": 0.0002955894851180086,
+      "loss": 0.2727,
+      "step": 570
+    },
+    {
+      "epoch": 0.862613868748838,
+      "grad_norm": 0.2137759029865265,
+      "learning_rate": 0.0002952065265345211,
+      "loss": 0.2621,
+      "step": 580
+    },
+    {
+      "epoch": 0.8774865216583008,
+      "grad_norm": 0.18923349678516388,
+      "learning_rate": 0.00029480789616828765,
+      "loss": 0.2647,
+      "step": 590
+    },
+    {
+      "epoch": 0.8923591745677635,
+      "grad_norm": 0.1697588562965393,
+      "learning_rate": 0.00029439363704250176,
+      "loss": 0.2434,
+      "step": 600
+    },
+    {
+      "epoch": 0.9072318274772263,
+      "grad_norm": 0.15528830885887146,
+      "learning_rate": 0.0002939637938671306,
+      "loss": 0.2293,
+      "step": 610
+    },
+    {
+      "epoch": 0.922104480386689,
+      "grad_norm": 0.43390974402427673,
+      "learning_rate": 0.0002935184130340893,
+      "loss": 0.228,
+      "step": 620
+    },
+    {
+      "epoch": 0.9369771332961517,
+      "grad_norm": 0.2026420682668686,
+      "learning_rate": 0.000293057542612234,
+      "loss": 0.2355,
+      "step": 630
+    },
+    {
+      "epoch": 0.9518497862056144,
+      "grad_norm": 0.16864228248596191,
+      "learning_rate": 0.00029258123234217435,
+      "loss": 0.2213,
+      "step": 640
+    },
+    {
+      "epoch": 0.9667224391150772,
+      "grad_norm": 0.15947186946868896,
+      "learning_rate": 0.0002920895336309044,
+      "loss": 0.2079,
+      "step": 650
+    },
+    {
+      "epoch": 0.9815950920245399,
+      "grad_norm": 0.21965055167675018,
+      "learning_rate": 0.0002915824995462551,
+      "loss": 0.2002,
+      "step": 660
+    },
+    {
+      "epoch": 0.9964677449340026,
+      "grad_norm": 0.23223313689231873,
+      "learning_rate": 0.00029106018481116626,
+      "loss": 0.1983,
+      "step": 670
+    },
+    {
+      "epoch": 1.0117122141662018,
+      "grad_norm": 0.26117920875549316,
+      "learning_rate": 0.00029052264579778063,
+      "loss": 0.2175,
+      "step": 680
+    },
+    {
+      "epoch": 1.0265848670756645,
+      "grad_norm": 0.176736518740654,
+      "learning_rate": 0.00028996994052135996,
+      "loss": 0.1831,
+      "step": 690
+    },
+    {
+      "epoch": 1.0414575199851273,
+      "grad_norm": 0.17873461544513702,
+      "learning_rate": 0.0002894021286340233,
+      "loss": 0.1784,
+      "step": 700
+    },
+    {
+      "epoch": 1.05633017289459,
+      "grad_norm": 0.2646450996398926,
+      "learning_rate": 0.0002888192714183092,
+      "loss": 0.1784,
+      "step": 710
+    },
+    {
+      "epoch": 1.0712028258040527,
+      "grad_norm": 0.16840551793575287,
+      "learning_rate": 0.00028822143178056114,
+      "loss": 0.1726,
+      "step": 720
+    },
+    {
+      "epoch": 1.0860754787135156,
+      "grad_norm": 0.1423952877521515,
+      "learning_rate": 0.0002876086742441387,
+      "loss": 0.1608,
+      "step": 730
+    },
+    {
+      "epoch": 1.1009481316229783,
+      "grad_norm": 0.16237640380859375,
+      "learning_rate": 0.0002869810649424535,
+      "loss": 0.179,
+      "step": 740
+    },
+    {
+      "epoch": 1.115820784532441,
+      "grad_norm": 0.158773735165596,
+      "learning_rate": 0.0002863386716118316,
+      "loss": 0.1742,
+      "step": 750
+    },
+    {
+      "epoch": 1.1306934374419038,
+      "grad_norm": 0.17627516388893127,
+      "learning_rate": 0.0002856815635842029,
+      "loss": 0.1821,
+      "step": 760
+    },
+    {
+      "epoch": 1.1455660903513665,
+      "grad_norm": 0.23613831400871277,
+      "learning_rate": 0.00028500981177961816,
+      "loss": 0.156,
+      "step": 770
+    },
+    {
+      "epoch": 1.1604387432608292,
+      "grad_norm": 0.16501256823539734,
+      "learning_rate": 0.0002843234886985951,
+      "loss": 0.1517,
+      "step": 780
+    },
+    {
+      "epoch": 1.175311396170292,
+      "grad_norm": 0.2365158647298813,
+      "learning_rate": 0.00028362266841429345,
+      "loss": 0.1391,
+      "step": 790
+    },
+    {
+      "epoch": 1.1901840490797546,
+      "grad_norm": 0.17508777976036072,
+      "learning_rate": 0.00028290742656452014,
+      "loss": 0.1434,
+      "step": 800
+    },
+    {
+      "epoch": 1.2050567019892173,
+      "grad_norm": 0.145797461271286,
+      "learning_rate": 0.0002821778403435663,
+      "loss": 0.1607,
+      "step": 810
+    },
+    {
+      "epoch": 1.21992935489868,
+      "grad_norm": 0.15968403220176697,
+      "learning_rate": 0.00028143398849387577,
+      "loss": 0.1536,
+      "step": 820
+    },
+    {
+      "epoch": 1.2348020078081428,
+      "grad_norm": 0.1553070992231369,
+      "learning_rate": 0.00028067595129754647,
+      "loss": 0.1481,
+      "step": 830
+    },
+    {
+      "epoch": 1.2496746607176055,
+      "grad_norm": 0.1769135743379593,
+      "learning_rate": 0.0002799038105676658,
+      "loss": 0.1285,
+      "step": 840
+    },
+    {
+      "epoch": 1.2645473136270682,
+      "grad_norm": 0.1639111191034317,
+      "learning_rate": 0.0002791176496394808,
+      "loss": 0.144,
+      "step": 850
+    },
+    {
+      "epoch": 1.279419966536531,
+      "grad_norm": 0.19045153260231018,
+      "learning_rate": 0.00027831755336140416,
+      "loss": 0.1347,
+      "step": 860
+    },
+    {
+      "epoch": 1.2942926194459936,
+      "grad_norm": 0.18079642951488495,
+      "learning_rate": 0.00027750360808585637,
+      "loss": 0.1254,
+      "step": 870
+    },
+    {
+      "epoch": 1.3091652723554563,
+      "grad_norm": 0.18368874490261078,
+      "learning_rate": 0.00027667590165994613,
+      "loss": 0.1289,
+      "step": 880
+    },
+    {
+      "epoch": 1.324037925264919,
+      "grad_norm": 0.20005619525909424,
+      "learning_rate": 0.00027583452341598935,
+      "loss": 0.1246,
+      "step": 890
+    },
+    {
+      "epoch": 1.338910578174382,
+      "grad_norm": 0.1317131668329239,
+      "learning_rate": 0.0002749795641618673,
+      "loss": 0.1238,
+      "step": 900
+    },
+    {
+      "epoch": 1.3537832310838445,
+      "grad_norm": 0.15287995338439941,
+      "learning_rate": 0.00027411111617122656,
+      "loss": 0.1224,
+      "step": 910
+    },
+    {
+      "epoch": 1.3686558839933074,
+      "grad_norm": 0.1613466739654541,
+      "learning_rate": 0.0002732292731735196,
+      "loss": 0.1178,
+      "step": 920
+    },
+    {
+      "epoch": 1.3835285369027701,
+      "grad_norm": 0.1685304194688797,
+      "learning_rate": 0.000272334130343889,
+      "loss": 0.1201,
+      "step": 930
+    },
+    {
+      "epoch": 1.3984011898122328,
+      "grad_norm": 0.19208119809627533,
+      "learning_rate": 0.0002714257842928956,
+      "loss": 0.1103,
+      "step": 940
+    },
+    {
+      "epoch": 1.4132738427216955,
+      "grad_norm": 0.17899583280086517,
+      "learning_rate": 0.00027050433305609125,
+      "loss": 0.1128,
+      "step": 950
+    },
+    {
+      "epoch": 1.4281464956311583,
+      "grad_norm": 0.19848547875881195,
+      "learning_rate": 0.0002695698760834384,
+      "loss": 0.1112,
+      "step": 960
+    },
+    {
+      "epoch": 1.443019148540621,
+      "grad_norm": 0.1710231602191925,
+      "learning_rate": 0.0002686225142285762,
+      "loss": 0.1107,
+      "step": 970
+    },
+    {
+      "epoch": 1.4578918014500837,
+      "grad_norm": 0.1552249938249588,
+      "learning_rate": 0.0002676623497379363,
+      "loss": 0.0984,
+      "step": 980
+    },
+    {
+      "epoch": 1.4727644543595464,
+      "grad_norm": 0.1702568084001541,
+      "learning_rate": 0.0002666894862397072,
+      "loss": 0.1109,
+      "step": 990
+    },
+    {
+      "epoch": 1.487637107269009,
+      "grad_norm": 0.12360525131225586,
+      "learning_rate": 0.00026570402873264996,
+      "loss": 0.1018,
+      "step": 1000
+    },
+    {
+      "epoch": 1.487637107269009,
+      "eval_loss": 0.10193677991628647,
+      "eval_runtime": 212.152,
+      "eval_samples_per_second": 22.536,
+      "eval_steps_per_second": 22.536,
+      "step": 1000
+    },
+    {
+      "epoch": 1.5025097601784718,
+      "grad_norm": 0.14356306195259094,
+      "learning_rate": 0.0002647060835747659,
+      "loss": 0.101,
+      "step": 1010
+    },
+    {
+      "epoch": 1.5173824130879345,
+      "grad_norm": 0.12723973393440247,
+      "learning_rate": 0.00026369575847181795,
+      "loss": 0.095,
+      "step": 1020
+    },
+    {
+      "epoch": 1.5322550659973972,
+      "grad_norm": 0.12857410311698914,
+      "learning_rate": 0.0002626731624657058,
+      "loss": 0.0915,
+      "step": 1030
+    },
+    {
+      "epoch": 1.54712771890686,
+      "grad_norm": 0.1593610793352127,
+      "learning_rate": 0.0002616384059226977,
+      "loss": 0.0993,
+      "step": 1040
+    },
+    {
+      "epoch": 1.562000371816323,
+      "grad_norm": 0.11687605082988739,
+      "learning_rate": 0.0002605916005215186,
+      "loss": 0.0894,
+      "step": 1050
+    },
+    {
+      "epoch": 1.5768730247257854,
+      "grad_norm": 0.1873299479484558,
+      "learning_rate": 0.0002595328592412969,
+      "loss": 0.097,
+      "step": 1060
+    },
+    {
+      "epoch": 1.5917456776352483,
+      "grad_norm": 0.1516319364309311,
+      "learning_rate": 0.00025846229634937136,
+      "loss": 0.0931,
+      "step": 1070
+    },
+    {
+      "epoch": 1.6066183305447108,
+      "grad_norm": 0.1431397646665573,
+      "learning_rate": 0.0002573800273889577,
+      "loss": 0.0918,
+      "step": 1080
+    },
+    {
+      "epoch": 1.6214909834541738,
+      "grad_norm": 0.17975349724292755,
+      "learning_rate": 0.0002562861691666793,
+      "loss": 0.0892,
+      "step": 1090
+    },
+    {
+      "epoch": 1.6363636363636362,
+      "grad_norm": 0.1414797306060791,
+      "learning_rate": 0.0002551808397399597,
+      "loss": 0.0952,
+      "step": 1100
+    },
+    {
+      "epoch": 1.6512362892730992,
+      "grad_norm": 0.151850625872612,
+      "learning_rate": 0.0002540641584042812,
+      "loss": 0.1008,
+      "step": 1110
+    },
+    {
+      "epoch": 1.6661089421825617,
+      "grad_norm": 0.1266675442457199,
+      "learning_rate": 0.00025293624568031,
+      "loss": 0.0782,
+      "step": 1120
+    },
+    {
+      "epoch": 1.6809815950920246,
+      "grad_norm": 0.12076599150896072,
+      "learning_rate": 0.0002517972233008882,
+      "loss": 0.0772,
+      "step": 1130
+    },
+    {
+      "epoch": 1.6958542480014873,
+      "grad_norm": 0.125094935297966,
+      "learning_rate": 0.0002506472141978955,
+      "loss": 0.0837,
+      "step": 1140
+    },
+    {
+      "epoch": 1.71072690091095,
+      "grad_norm": 0.13272984325885773,
+      "learning_rate": 0.0002494863424889819,
+      "loss": 0.0736,
+      "step": 1150
+    },
+    {
+      "epoch": 1.7255995538204127,
+      "grad_norm": 0.16893050074577332,
+      "learning_rate": 0.00024831473346417153,
+      "loss": 0.0856,
+      "step": 1160
+    },
+    {
+      "epoch": 1.7404722067298755,
+      "grad_norm": 0.11702137440443039,
+      "learning_rate": 0.00024713251357234053,
+      "loss": 0.0799,
+      "step": 1170
+    },
+    {
+      "epoch": 1.7553448596393382,
+      "grad_norm": 0.13682794570922852,
+      "learning_rate": 0.00024593981040756997,
+      "loss": 0.089,
+      "step": 1180
+    },
+    {
+      "epoch": 1.7702175125488009,
+      "grad_norm": 0.13676613569259644,
+      "learning_rate": 0.0002447367526953746,
+      "loss": 0.0797,
+      "step": 1190
+    },
+    {
+      "epoch": 1.7850901654582636,
+      "grad_norm": 0.13324877619743347,
+      "learning_rate": 0.00024352347027881003,
+      "loss": 0.0792,
+      "step": 1200
+    },
+    {
+      "epoch": 1.7999628183677263,
+      "grad_norm": 0.11255478858947754,
+      "learning_rate": 0.00024230009410445893,
+      "loss": 0.0763,
+      "step": 1210
+    },
+    {
+      "epoch": 1.814835471277189,
+      "grad_norm": 0.10950371623039246,
+      "learning_rate": 0.0002410667562082985,
+      "loss": 0.0663,
+      "step": 1220
+    },
+    {
+      "epoch": 1.8297081241866517,
+      "grad_norm": 0.11777317523956299,
+      "learning_rate": 0.00023982358970145004,
+      "loss": 0.0694,
+      "step": 1230
+    },
+    {
+      "epoch": 1.8445807770961147,
+      "grad_norm": 0.1194106712937355,
+      "learning_rate": 0.00023857072875581244,
+      "loss": 0.0703,
+      "step": 1240
+    },
+    {
+      "epoch": 1.8594534300055772,
+      "grad_norm": 0.11233114451169968,
+      "learning_rate": 0.00023730830858958177,
+      "loss": 0.0655,
+      "step": 1250
+    },
+    {
+      "epoch": 1.87432608291504,
+      "grad_norm": 0.11641702055931091,
+      "learning_rate": 0.00023603646545265687,
+      "loss": 0.0645,
+      "step": 1260
+    },
+    {
+      "epoch": 1.8891987358245026,
+      "grad_norm": 0.1414889097213745,
+      "learning_rate": 0.00023475533661193495,
+      "loss": 0.068,
+      "step": 1270
+    },
+    {
+      "epoch": 1.9040713887339655,
+      "grad_norm": 0.10632241517305374,
+      "learning_rate": 0.00023346506033649614,
+      "loss": 0.064,
+      "step": 1280
+    },
+    {
+      "epoch": 1.918944041643428,
+      "grad_norm": 0.10176625102758408,
+      "learning_rate": 0.0002321657758826807,
+      "loss": 0.062,
+      "step": 1290
+    },
+    {
+      "epoch": 1.933816694552891,
+      "grad_norm": 0.09434150904417038,
+      "learning_rate": 0.00023085762347905943,
+      "loss": 0.0684,
+      "step": 1300
+    },
+    {
+      "epoch": 1.9486893474623534,
+      "grad_norm": 0.12967799603939056,
+      "learning_rate": 0.00022954074431129915,
+      "loss": 0.0605,
+      "step": 1310
+    },
+    {
+      "epoch": 1.9635620003718164,
+      "grad_norm": 0.1181391179561615,
+      "learning_rate": 0.0002282152805069247,
+      "loss": 0.0654,
+      "step": 1320
+    },
+    {
+      "epoch": 1.978434653281279,
+      "grad_norm": 0.10801093280315399,
+      "learning_rate": 0.00022688137511997977,
+      "loss": 0.07,
+      "step": 1330
+    },
+    {
+      "epoch": 1.9933073061907418,
+      "grad_norm": 0.11437591165304184,
+      "learning_rate": 0.00022553917211558713,
+      "loss": 0.0578,
+      "step": 1340
+    },
+    {
+      "epoch": 2.008551775422941,
+      "grad_norm": 0.11018254607915878,
+      "learning_rate": 0.0002241888163544111,
+      "loss": 0.0565,
+      "step": 1350
+    },
+    {
+      "epoch": 2.0234244283324037,
+      "grad_norm": 0.08331198990345001,
+      "learning_rate": 0.0002228304535770228,
+      "loss": 0.0399,
+      "step": 1360
+    },
+    {
+      "epoch": 2.0382970812418666,
+      "grad_norm": 0.09547814726829529,
+      "learning_rate": 0.00022146423038817102,
+      "loss": 0.0438,
+      "step": 1370
+    },
+    {
+      "epoch": 2.053169734151329,
+      "grad_norm": 0.10641171038150787,
+      "learning_rate": 0.00022009029424095928,
+      "loss": 0.0384,
+      "step": 1380
+    },
+    {
+      "epoch": 2.068042387060792,
+      "grad_norm": 0.10844069719314575,
+      "learning_rate": 0.0002187087934209318,
+      "loss": 0.044,
+      "step": 1390
+    },
+    {
+      "epoch": 2.0829150399702545,
+      "grad_norm": 0.10333788394927979,
+      "learning_rate": 0.00021731987703006933,
+      "loss": 0.041,
+      "step": 1400
+    },
+    {
+      "epoch": 2.0977876928797174,
+      "grad_norm": 0.10635129362344742,
+      "learning_rate": 0.0002159236949706967,
+      "loss": 0.04,
+      "step": 1410
+    },
+    {
+      "epoch": 2.11266034578918,
+      "grad_norm": 0.09010270237922668,
+      "learning_rate": 0.00021452039792930474,
+      "loss": 0.0402,
+      "step": 1420
+    },
+    {
+      "epoch": 2.127532998698643,
+      "grad_norm": 0.09274252504110336,
+      "learning_rate": 0.00021311013736028658,
+      "loss": 0.0384,
+      "step": 1430
+    },
+    {
+      "epoch": 2.1424056516081054,
+      "grad_norm": 0.08550871163606644,
+      "learning_rate": 0.00021169306546959174,
+      "loss": 0.0428,
+      "step": 1440
+    },
+    {
+      "epoch": 2.1572783045175683,
+      "grad_norm": 0.10152186453342438,
+      "learning_rate": 0.00021026933519829896,
+      "loss": 0.0442,
+      "step": 1450
+    },
+    {
+      "epoch": 2.1721509574270312,
+      "grad_norm": 0.08528181910514832,
+      "learning_rate": 0.00020883910020610957,
+      "loss": 0.0375,
+      "step": 1460
+    },
+    {
+      "epoch": 2.1870236103364937,
+      "grad_norm": 0.09736708551645279,
+      "learning_rate": 0.00020740251485476345,
+      "loss": 0.0387,
+      "step": 1470
+    },
+    {
+      "epoch": 2.2018962632459567,
+      "grad_norm": 0.09133671224117279,
+      "learning_rate": 0.00020595973419137908,
+      "loss": 0.0373,
+      "step": 1480
+    },
+    {
+      "epoch": 2.216768916155419,
+      "grad_norm": 0.08406363427639008,
+      "learning_rate": 0.00020451091393171964,
+      "loss": 0.0381,
+      "step": 1490
+    },
+    {
+      "epoch": 2.231641569064882,
+      "grad_norm": 0.08503925055265427,
+      "learning_rate": 0.00020305621044338718,
+      "loss": 0.0376,
+      "step": 1500
+    },
+    {
+      "epoch": 2.231641569064882,
+      "eval_loss": 0.051042910665273666,
+      "eval_runtime": 212.5441,
+      "eval_samples_per_second": 22.494,
+      "eval_steps_per_second": 22.494,
+      "step": 1500
+    },
+    {
+      "epoch": 2.2465142219743446,
+      "grad_norm": 0.09201103448867798,
+      "learning_rate": 0.00020159578072894606,
+      "loss": 0.0393,
+      "step": 1510
+    },
+    {
+      "epoch": 2.2613868748838075,
+      "grad_norm": 0.09499834477901459,
+      "learning_rate": 0.00020012978240897814,
+      "loss": 0.0346,
+      "step": 1520
+    },
+    {
+      "epoch": 2.27625952779327,
+      "grad_norm": 0.09396501630544662,
+      "learning_rate": 0.00019865837370507106,
+      "loss": 0.039,
+      "step": 1530
+    },
+    {
+      "epoch": 2.291132180702733,
+      "grad_norm": 0.08983522653579712,
+      "learning_rate": 0.00019718171342274205,
+      "loss": 0.0387,
+      "step": 1540
+    },
+    {
+      "epoch": 2.3060048336121954,
+      "grad_norm": 0.1118871420621872,
+      "learning_rate": 0.00019569996093429814,
+      "loss": 0.0379,
+      "step": 1550
+    },
+    {
+      "epoch": 2.3208774865216584,
+      "grad_norm": 0.08434595167636871,
+      "learning_rate": 0.00019421327616163563,
+      "loss": 0.0372,
+      "step": 1560
+    },
+    {
+      "epoch": 2.335750139431121,
+      "grad_norm": 0.0915694460272789,
+      "learning_rate": 0.00019272181955898017,
+      "loss": 0.036,
+      "step": 1570
+    },
+    {
+      "epoch": 2.350622792340584,
+      "grad_norm": 0.08459066599607468,
+      "learning_rate": 0.0001912257520955692,
+      "loss": 0.0363,
+      "step": 1580
+    },
+    {
+      "epoch": 2.3654954452500463,
+      "grad_norm": 0.09195558726787567,
+      "learning_rate": 0.00018972523523827907,
+      "loss": 0.0389,
+      "step": 1590
+    },
+    {
+      "epoch": 2.3803680981595092,
+      "grad_norm": 0.09830203652381897,
+      "learning_rate": 0.0001882204309341982,
+      "loss": 0.0373,
+      "step": 1600
+    },
+    {
+      "epoch": 2.3952407510689717,
+      "grad_norm": 0.08541320264339447,
+      "learning_rate": 0.00018671150159314855,
+      "loss": 0.0342,
+      "step": 1610
+    },
+    {
+      "epoch": 2.4101134039784347,
+      "grad_norm": 0.08817029744386673,
+      "learning_rate": 0.00018519861007015729,
+      "loss": 0.0371,
+      "step": 1620
+    },
+    {
+      "epoch": 2.4249860568878976,
+      "grad_norm": 0.08839129656553268,
+      "learning_rate": 0.00018368191964788,
+      "loss": 0.0355,
+      "step": 1630
+    },
+    {
+      "epoch": 2.43985870979736,
+      "grad_norm": 0.08589951694011688,
+      "learning_rate": 0.00018216159401897812,
+      "loss": 0.0339,
+      "step": 1640
+    },
+    {
+      "epoch": 2.454731362706823,
+      "grad_norm": 0.09998754411935806,
+      "learning_rate": 0.00018063779726845203,
+      "loss": 0.0339,
+      "step": 1650
+    },
+    {
+      "epoch": 2.4696040156162855,
+      "grad_norm": 0.08363664150238037,
+      "learning_rate": 0.0001791106938559317,
+      "loss": 0.0357,
+      "step": 1660
+    },
+    {
+      "epoch": 2.4844766685257484,
+      "grad_norm": 0.08930620551109314,
+      "learning_rate": 0.00017758044859792705,
+      "loss": 0.0347,
+      "step": 1670
+    },
+    {
+      "epoch": 2.499349321435211,
+      "grad_norm": 0.08270251750946045,
+      "learning_rate": 0.00017604722665003956,
+      "loss": 0.0332,
+      "step": 1680
+    },
+    {
+      "epoch": 2.514221974344674,
+      "grad_norm": 0.09085123986005783,
+      "learning_rate": 0.00017451119348913744,
+      "loss": 0.0357,
+      "step": 1690
+    },
+    {
+      "epoch": 2.5290946272541364,
+      "grad_norm": 0.0897296592593193,
+      "learning_rate": 0.00017297251489549638,
+      "loss": 0.0368,
+      "step": 1700
+    },
+    {
+      "epoch": 2.5439672801635993,
+      "grad_norm": 0.07172433286905289,
+      "learning_rate": 0.000171431356934907,
+      "loss": 0.0371,
+      "step": 1710
+    },
+    {
+      "epoch": 2.558839933073062,
+      "grad_norm": 0.0848449245095253,
+      "learning_rate": 0.0001698878859407519,
+      "loss": 0.032,
+      "step": 1720
+    },
+    {
+      "epoch": 2.5737125859825247,
+      "grad_norm": 0.08270355314016342,
+      "learning_rate": 0.00016834226849605371,
+      "loss": 0.0333,
+      "step": 1730
+    },
+    {
+      "epoch": 2.588585238891987,
+      "grad_norm": 0.07130729407072067,
+      "learning_rate": 0.00016679467141549617,
+      "loss": 0.0324,
+      "step": 1740
+    },
+    {
+      "epoch": 2.60345789180145,
+      "grad_norm": 0.07863139361143112,
+      "learning_rate": 0.00016524526172742026,
+      "loss": 0.0295,
+      "step": 1750
+    },
+    {
+      "epoch": 2.6183305447109126,
+      "grad_norm": 0.08600688725709915,
+      "learning_rate": 0.00016369420665579725,
+      "loss": 0.0342,
+      "step": 1760
+    },
+    {
+      "epoch": 2.6332031976203756,
+      "grad_norm": 0.10146727412939072,
+      "learning_rate": 0.0001621416736021805,
+      "loss": 0.032,
+      "step": 1770
+    },
+    {
+      "epoch": 2.648075850529838,
+      "grad_norm": 0.0812121257185936,
+      "learning_rate": 0.00016058783012763844,
+      "loss": 0.0341,
+      "step": 1780
+    },
+    {
+      "epoch": 2.662948503439301,
+      "grad_norm": 0.0973149985074997,
+      "learning_rate": 0.00015903284393466987,
+      "loss": 0.0313,
+      "step": 1790
+    },
+    {
+      "epoch": 2.677821156348764,
+      "grad_norm": 0.0835902988910675,
+      "learning_rate": 0.00015747688284910457,
+      "loss": 0.0298,
+      "step": 1800
+    },
+    {
+      "epoch": 2.6926938092582264,
+      "grad_norm": 0.07972200214862823,
+      "learning_rate": 0.00015592011480198992,
+      "loss": 0.0346,
+      "step": 1810
+    },
+    {
+      "epoch": 2.707566462167689,
+      "grad_norm": 0.07594762742519379,
+      "learning_rate": 0.0001543627078114667,
+      "loss": 0.0338,
+      "step": 1820
+    },
+    {
+      "epoch": 2.722439115077152,
+      "grad_norm": 0.07757771015167236,
+      "learning_rate": 0.00015280482996463533,
+      "loss": 0.0315,
+      "step": 1830
+    },
+    {
+      "epoch": 2.737311767986615,
+      "grad_norm": 0.06432707607746124,
+      "learning_rate": 0.00015124664939941457,
+      "loss": 0.0319,
+      "step": 1840
+    },
+    {
+      "epoch": 2.7521844208960773,
+      "grad_norm": 0.07696104794740677,
+      "learning_rate": 0.00014968833428639474,
+      "loss": 0.0301,
+      "step": 1850
+    },
+    {
+      "epoch": 2.7670570738055402,
+      "grad_norm": 0.07426641881465912,
+      "learning_rate": 0.00014813005281068774,
+      "loss": 0.0285,
+      "step": 1860
+    },
+    {
+      "epoch": 2.7819297267150027,
+      "grad_norm": 0.0765393078327179,
+      "learning_rate": 0.00014657197315377495,
+      "loss": 0.0313,
+      "step": 1870
+    },
+    {
+      "epoch": 2.7968023796244657,
+      "grad_norm": 0.07151610404253006,
+      "learning_rate": 0.00014501426347535598,
+      "loss": 0.03,
+      "step": 1880
+    },
+    {
+      "epoch": 2.811675032533928,
+      "grad_norm": 0.07834175229072571,
+      "learning_rate": 0.0001434570918951996,
+      "loss": 0.0286,
+      "step": 1890
+    },
+    {
+      "epoch": 2.826547685443391,
+      "grad_norm": 0.09932053834199905,
+      "learning_rate": 0.00014190062647499892,
+      "loss": 0.0307,
+      "step": 1900
+    },
+    {
+      "epoch": 2.8414203383528536,
+      "grad_norm": 0.08595503121614456,
+      "learning_rate": 0.00014034503520023297,
+      "loss": 0.0306,
+      "step": 1910
+    },
+    {
+      "epoch": 2.8562929912623165,
+      "grad_norm": 0.08349858224391937,
+      "learning_rate": 0.00013879048596203636,
+      "loss": 0.0306,
+      "step": 1920
+    },
+    {
+      "epoch": 2.871165644171779,
+      "grad_norm": 0.07905739545822144,
+      "learning_rate": 0.0001372371465390794,
+      "loss": 0.0305,
+      "step": 1930
+    },
+    {
+      "epoch": 2.886038297081242,
+      "grad_norm": 0.06820567697286606,
+      "learning_rate": 0.0001356851845794598,
+      "loss": 0.0276,
+      "step": 1940
+    },
+    {
+      "epoch": 2.9009109499907044,
+      "grad_norm": 0.07227708399295807,
+      "learning_rate": 0.00013413476758260934,
+      "loss": 0.0267,
+      "step": 1950
+    },
+    {
+      "epoch": 2.9157836029001674,
+      "grad_norm": 0.09035148471593857,
+      "learning_rate": 0.00013258606288121542,
+      "loss": 0.0287,
+      "step": 1960
+    },
+    {
+      "epoch": 2.9306562558096303,
+      "grad_norm": 0.08626757562160492,
+      "learning_rate": 0.00013103923762316198,
+      "loss": 0.0298,
+      "step": 1970
+    },
+    {
+      "epoch": 2.945528908719093,
+      "grad_norm": 0.0765102431178093,
+      "learning_rate": 0.00012949445875348902,
+      "loss": 0.0274,
+      "step": 1980
+    },
+    {
+      "epoch": 2.9604015616285553,
+      "grad_norm": 0.08610813319683075,
+      "learning_rate": 0.00012795189299637483,
+      "loss": 0.0283,
+      "step": 1990
+    },
+    {
+      "epoch": 2.975274214538018,
+      "grad_norm": 0.08020433783531189,
+      "learning_rate": 0.00012641170683714222,
+      "loss": 0.0267,
+      "step": 2000
+    },
+    {
+      "epoch": 2.975274214538018,
+      "eval_loss": 0.0338360071182251,
+      "eval_runtime": 212.7237,
+      "eval_samples_per_second": 22.475,
+      "eval_steps_per_second": 22.475,
+      "step": 2000
+    },
+    {
+      "epoch": 2.990146867447481,
+      "grad_norm": 0.06885667890310287,
+      "learning_rate": 0.00012487406650428954,
+      "loss": 0.0277,
+      "step": 2010
+    },
+    {
+      "epoch": 3.00539133667968,
+      "grad_norm": 0.07658534497022629,
+      "learning_rate": 0.00012333913795155053,
+      "loss": 0.0251,
+      "step": 2020
+    },
+    {
+      "epoch": 3.020263989589143,
+      "grad_norm": 0.06449634581804276,
+      "learning_rate": 0.00012180708683998321,
+      "loss": 0.0147,
+      "step": 2030
+    },
+    {
+      "epoch": 3.0351366424986055,
+      "grad_norm": 0.06312290579080582,
+      "learning_rate": 0.00012027807852009038,
+      "loss": 0.0157,
+      "step": 2040
+    },
+    {
+      "epoch": 3.0500092954080684,
+      "grad_norm": 0.07343071699142456,
+      "learning_rate": 0.00011875227801397381,
+      "loss": 0.0149,
+      "step": 2050
+    },
+    {
+      "epoch": 3.064881948317531,
+      "grad_norm": 0.06489036977291107,
+      "learning_rate": 0.00011722984999752392,
+      "loss": 0.0155,
+      "step": 2060
+    },
+    {
+      "epoch": 3.079754601226994,
+      "grad_norm": 0.06041651591658592,
+      "learning_rate": 0.00011571095878264658,
+      "loss": 0.0139,
+      "step": 2070
+    },
+    {
+      "epoch": 3.094627254136457,
+      "grad_norm": 0.07048339396715164,
+      "learning_rate": 0.00011419576829952933,
+      "loss": 0.014,
+      "step": 2080
+    },
+    {
+      "epoch": 3.1094999070459193,
+      "grad_norm": 0.05680292099714279,
+      "learning_rate": 0.00011268444207894902,
+      "loss": 0.0133,
+      "step": 2090
+    },
+    {
+      "epoch": 3.124372559955382,
+      "grad_norm": 0.0727318823337555,
+      "learning_rate": 0.00011117714323462186,
+      "loss": 0.0147,
+      "step": 2100
+    },
+    {
+      "epoch": 3.1392452128648447,
+      "grad_norm": 0.054686855524778366,
+      "learning_rate": 0.00010967403444559963,
+      "loss": 0.0143,
+      "step": 2110
+    },
+    {
+      "epoch": 3.1541178657743076,
+      "grad_norm": 0.05729954317212105,
+      "learning_rate": 0.00010817527793871143,
+      "loss": 0.0134,
+      "step": 2120
+    },
+    {
+      "epoch": 3.16899051868377,
+      "grad_norm": 0.08314567804336548,
+      "learning_rate": 0.00010668103547105553,
+      "loss": 0.0148,
+      "step": 2130
+    },
+    {
+      "epoch": 3.183863171593233,
+      "grad_norm": 0.05523039028048515,
+      "learning_rate": 0.00010519146831254088,
+      "loss": 0.0129,
+      "step": 2140
+    },
+    {
+      "epoch": 3.1987358245026956,
+      "grad_norm": 0.05546917766332626,
+      "learning_rate": 0.00010370673722848183,
+      "loss": 0.0139,
+      "step": 2150
+    },
+    {
+      "epoch": 3.2136084774121585,
+      "grad_norm": 0.05486704409122467,
+      "learning_rate": 0.00010222700246224735,
+      "loss": 0.0135,
+      "step": 2160
+    },
+    {
+      "epoch": 3.228481130321621,
+      "grad_norm": 0.05656208097934723,
+      "learning_rate": 0.00010075242371796585,
+      "loss": 0.0125,
+      "step": 2170
+    },
+    {
+      "epoch": 3.243353783231084,
+      "grad_norm": 0.053801827132701874,
+      "learning_rate": 9.928316014328916e-05,
+      "loss": 0.0141,
+      "step": 2180
+    },
+    {
+      "epoch": 3.2582264361405464,
+      "grad_norm": 0.061040911823511124,
+      "learning_rate": 9.781937031221589e-05,
+      "loss": 0.0136,
+      "step": 2190
+    },
+    {
+      "epoch": 3.2730990890500093,
+      "grad_norm": 0.05558522418141365,
+      "learning_rate": 9.636121220797707e-05,
+      "loss": 0.0138,
+      "step": 2200
+    },
+    {
+      "epoch": 3.287971741959472,
+      "grad_norm": 0.055547308176755905,
+      "learning_rate": 9.490884320598516e-05,
+      "loss": 0.0136,
+      "step": 2210
+    },
+    {
+      "epoch": 3.3028443948689348,
+      "grad_norm": 0.061592597514390945,
+      "learning_rate": 9.34624200568492e-05,
+      "loss": 0.014,
+      "step": 2220
+    },
+    {
+      "epoch": 3.3177170477783973,
+      "grad_norm": 0.05287894979119301,
+      "learning_rate": 9.202209886945698e-05,
+      "loss": 0.0125,
+      "step": 2230
+    },
+    {
+      "epoch": 3.33258970068786,
+      "grad_norm": 0.06365808844566345,
+      "learning_rate": 9.058803509412646e-05,
+      "loss": 0.0139,
+      "step": 2240
+    },
+    {
+      "epoch": 3.347462353597323,
+      "grad_norm": 0.05474059656262398,
+      "learning_rate": 8.916038350582876e-05,
+      "loss": 0.0141,
+      "step": 2250
+    },
+    {
+      "epoch": 3.3623350065067856,
+      "grad_norm": 0.054872963577508926,
+      "learning_rate": 8.773929818748315e-05,
+      "loss": 0.0135,
+      "step": 2260
+    },
+    {
+      "epoch": 3.3772076594162486,
+      "grad_norm": 0.05935963988304138,
+      "learning_rate": 8.632493251332793e-05,
+      "loss": 0.0128,
+      "step": 2270
+    },
+    {
+      "epoch": 3.392080312325711,
+      "grad_norm": 0.06830602139234543,
+      "learning_rate": 8.491743913236628e-05,
+      "loss": 0.0133,
+      "step": 2280
+    },
+    {
+      "epoch": 3.406952965235174,
+      "grad_norm": 0.057178862392902374,
+      "learning_rate": 8.351696995189218e-05,
+      "loss": 0.0121,
+      "step": 2290
+    },
+    {
+      "epoch": 3.4218256181446365,
+      "grad_norm": 0.06827449798583984,
+      "learning_rate": 8.212367612109464e-05,
+      "loss": 0.0127,
+      "step": 2300
+    },
+    {
+      "epoch": 3.4366982710540994,
+      "grad_norm": 0.04981634393334389,
+      "learning_rate": 8.073770801474495e-05,
+      "loss": 0.0132,
+      "step": 2310
+    },
+    {
+      "epoch": 3.451570923963562,
+      "grad_norm": 0.052124422043561935,
+      "learning_rate": 7.935921521696702e-05,
+      "loss": 0.0129,
+      "step": 2320
+    },
+    {
+      "epoch": 3.466443576873025,
+      "grad_norm": 0.05991722270846367,
+      "learning_rate": 7.798834650509306e-05,
+      "loss": 0.0128,
+      "step": 2330
+    },
+    {
+      "epoch": 3.4813162297824873,
+      "grad_norm": 0.05946414917707443,
+      "learning_rate": 7.662524983360665e-05,
+      "loss": 0.0127,
+      "step": 2340
+    },
+    {
+      "epoch": 3.4961888826919503,
+      "grad_norm": 0.05650801584124565,
+      "learning_rate": 7.527007231817389e-05,
+      "loss": 0.0127,
+      "step": 2350
+    },
+    {
+      "epoch": 3.5110615356014128,
+      "grad_norm": 0.04841410368680954,
+      "learning_rate": 7.392296021976614e-05,
+      "loss": 0.0122,
+      "step": 2360
+    },
+    {
+      "epoch": 3.5259341885108757,
+      "grad_norm": 0.05933946743607521,
+      "learning_rate": 7.258405892887398e-05,
+      "loss": 0.0121,
+      "step": 2370
+    },
+    {
+      "epoch": 3.540806841420338,
+      "grad_norm": 0.05451497435569763,
+      "learning_rate": 7.125351294981598e-05,
+      "loss": 0.0127,
+      "step": 2380
+    },
+    {
+      "epoch": 3.555679494329801,
+      "grad_norm": 0.05574881285429001,
+      "learning_rate": 6.993146588514225e-05,
+      "loss": 0.0124,
+      "step": 2390
+    },
+    {
+      "epoch": 3.5705521472392636,
+      "grad_norm": 0.057919006794691086,
+      "learning_rate": 6.86180604201361e-05,
+      "loss": 0.0119,
+      "step": 2400
+    },
+    {
+      "epoch": 3.5854248001487266,
+      "grad_norm": 0.051368821412324905,
+      "learning_rate": 6.731343830741433e-05,
+      "loss": 0.0126,
+      "step": 2410
+    },
+    {
+      "epoch": 3.6002974530581895,
+      "grad_norm": 0.06351654976606369,
+      "learning_rate": 6.6017740351628e-05,
+      "loss": 0.0135,
+      "step": 2420
+    },
+    {
+      "epoch": 3.615170105967652,
+      "grad_norm": 0.053709421306848526,
+      "learning_rate": 6.473110639426616e-05,
+      "loss": 0.0122,
+      "step": 2430
+    },
+    {
+      "epoch": 3.6300427588771145,
+      "grad_norm": 0.061445701867341995,
+      "learning_rate": 6.345367529856254e-05,
+      "loss": 0.0132,
+      "step": 2440
+    },
+    {
+      "epoch": 3.6449154117865774,
+      "grad_norm": 0.0678747370839119,
+      "learning_rate": 6.218558493450893e-05,
+      "loss": 0.0125,
+      "step": 2450
+    },
+    {
+      "epoch": 3.6597880646960403,
+      "grad_norm": 0.05095114931464195,
+      "learning_rate": 6.0926972163974775e-05,
+      "loss": 0.012,
+      "step": 2460
+    },
+    {
+      "epoch": 3.674660717605503,
+      "grad_norm": 0.05740583688020706,
+      "learning_rate": 5.9677972825936254e-05,
+      "loss": 0.0125,
+      "step": 2470
+    },
+    {
+      "epoch": 3.6895333705149658,
+      "grad_norm": 0.05399662256240845,
+      "learning_rate": 5.8438721721815536e-05,
+      "loss": 0.0134,
+      "step": 2480
+    },
+    {
+      "epoch": 3.7044060234244283,
+      "grad_norm": 0.056056030094623566,
+      "learning_rate": 5.720935260093177e-05,
+      "loss": 0.0125,
+      "step": 2490
+    },
+    {
+      "epoch": 3.719278676333891,
+      "grad_norm": 0.046866290271282196,
+      "learning_rate": 5.598999814606618e-05,
+      "loss": 0.0118,
+      "step": 2500
+    },
+    {
+      "epoch": 3.719278676333891,
+      "eval_loss": 0.031009526923298836,
+      "eval_runtime": 212.3923,
+      "eval_samples_per_second": 22.51,
+      "eval_steps_per_second": 22.51,
+      "step": 2500
+    },
+    {
+      "epoch": 3.7341513292433537,
+      "grad_norm": 0.046400755643844604,
+      "learning_rate": 5.4780789959141524e-05,
+      "loss": 0.0122,
+      "step": 2510
+    },
+    {
+      "epoch": 3.7490239821528166,
+      "grad_norm": 0.05211547762155533,
+      "learning_rate": 5.358185854701909e-05,
+      "loss": 0.0122,
+      "step": 2520
+    },
+    {
+      "epoch": 3.763896635062279,
+      "grad_norm": 0.0429752878844738,
+      "learning_rate": 5.239333330741298e-05,
+      "loss": 0.0124,
+      "step": 2530
+    },
+    {
+      "epoch": 3.778769287971742,
+      "grad_norm": 0.05008607730269432,
+      "learning_rate": 5.121534251492486e-05,
+      "loss": 0.0125,
+      "step": 2540
+    },
+    {
+      "epoch": 3.7936419408812045,
+      "grad_norm": 0.046397943049669266,
+      "learning_rate": 5.004801330719941e-05,
+      "loss": 0.0111,
+      "step": 2550
+    },
+    {
+      "epoch": 3.8085145937906675,
+      "grad_norm": 0.05960022658109665,
+      "learning_rate": 4.8891471671202675e-05,
+      "loss": 0.0117,
+      "step": 2560
+    },
+    {
+      "epoch": 3.82338724670013,
+      "grad_norm": 0.04353282228112221,
+      "learning_rate": 4.7745842429624795e-05,
+      "loss": 0.0119,
+      "step": 2570
+    },
+    {
+      "epoch": 3.838259899609593,
+      "grad_norm": 0.05057670921087265,
+      "learning_rate": 4.661124922740794e-05,
+      "loss": 0.0116,
+      "step": 2580
+    },
+    {
+      "epoch": 3.853132552519056,
+      "grad_norm": 0.04886782541871071,
+      "learning_rate": 4.548781451840179e-05,
+      "loss": 0.0113,
+      "step": 2590
+    },
+    {
+      "epoch": 3.8680052054285183,
+      "grad_norm": 0.055182382464408875,
+      "learning_rate": 4.437565955214723e-05,
+      "loss": 0.0116,
+      "step": 2600
+    },
+    {
+      "epoch": 3.882877858337981,
+      "grad_norm": 0.048834457993507385,
+      "learning_rate": 4.3274904360790505e-05,
+      "loss": 0.0121,
+      "step": 2610
+    },
+    {
+      "epoch": 3.8977505112474438,
+      "grad_norm": 0.05025951564311981,
+      "learning_rate": 4.218566774612802e-05,
+      "loss": 0.0112,
+      "step": 2620
+    },
+    {
+      "epoch": 3.9126231641569067,
+      "grad_norm": 0.05054251477122307,
+      "learning_rate": 4.1108067266784746e-05,
+      "loss": 0.0112,
+      "step": 2630
+    },
+    {
+      "epoch": 3.927495817066369,
+      "grad_norm": 0.05326022952795029,
+      "learning_rate": 4.004221922552608e-05,
+      "loss": 0.0119,
+      "step": 2640
+    },
+    {
+      "epoch": 3.9423684699758317,
+      "grad_norm": 0.05668502673506737,
+      "learning_rate": 3.898823865670579e-05,
+      "loss": 0.0114,
+      "step": 2650
+    },
+    {
+      "epoch": 3.9572411228852946,
+      "grad_norm": 0.054235439747571945,
+      "learning_rate": 3.794623931385062e-05,
+      "loss": 0.0119,
+      "step": 2660
+    },
+    {
+      "epoch": 3.9721137757947576,
+      "grad_norm": 0.05231969431042671,
+      "learning_rate": 3.6916333657383024e-05,
+      "loss": 0.0108,
+      "step": 2670
+    },
+    {
+      "epoch": 3.98698642870422,
+      "grad_norm": 0.057500049471855164,
+      "learning_rate": 3.5898632842483746e-05,
+      "loss": 0.011,
+      "step": 2680
+    },
+    {
+      "epoch": 4.002230897936419,
+      "grad_norm": 0.04203633964061737,
+      "learning_rate": 3.489324670709494e-05,
+      "loss": 0.0113,
+      "step": 2690
+    },
+    {
+      "epoch": 4.017103550845882,
+      "grad_norm": 0.029648838564753532,
+      "learning_rate": 3.390028376006589e-05,
+      "loss": 0.0059,
+      "step": 2700
+    },
+    {
+      "epoch": 4.031976203755345,
+      "grad_norm": 0.03779765963554382,
+      "learning_rate": 3.2919851169441625e-05,
+      "loss": 0.006,
+      "step": 2710
+    },
+    {
+      "epoch": 4.046848856664807,
+      "grad_norm": 0.040116600692272186,
+      "learning_rate": 3.195205475089667e-05,
+      "loss": 0.0058,
+      "step": 2720
+    },
+    {
+      "epoch": 4.06172150957427,
+      "grad_norm": 0.030058899894356728,
+      "learning_rate": 3.099699895631474e-05,
+      "loss": 0.0056,
+      "step": 2730
+    },
+    {
+      "epoch": 4.076594162483733,
+      "grad_norm": 0.03675166517496109,
+      "learning_rate": 3.0054786862515257e-05,
+      "loss": 0.0058,
+      "step": 2740
+    },
+    {
+      "epoch": 4.091466815393196,
+      "grad_norm": 0.03470413759350777,
+      "learning_rate": 2.912552016012879e-05,
+      "loss": 0.0057,
+      "step": 2750
+    },
+    {
+      "epoch": 4.106339468302658,
+      "grad_norm": 0.03222460299730301,
+      "learning_rate": 2.8209299142621522e-05,
+      "loss": 0.0057,
+      "step": 2760
+    },
+    {
+      "epoch": 4.121212121212121,
+      "grad_norm": 0.036458127200603485,
+      "learning_rate": 2.7306222695471173e-05,
+      "loss": 0.0056,
+      "step": 2770
+    },
+    {
+      "epoch": 4.136084774121584,
+      "grad_norm": 0.035760316997766495,
+      "learning_rate": 2.641638828549425e-05,
+      "loss": 0.0055,
+      "step": 2780
+    },
+    {
+      "epoch": 4.150957427031047,
+      "grad_norm": 0.04281270503997803,
+      "learning_rate": 2.5539891950326875e-05,
+      "loss": 0.0056,
+      "step": 2790
+    },
+    {
+      "epoch": 4.165830079940509,
+      "grad_norm": 0.030339548364281654,
+      "learning_rate": 2.4676828288059558e-05,
+      "loss": 0.0057,
+      "step": 2800
+    },
+    {
+      "epoch": 4.180702732849972,
+      "grad_norm": 0.03753247857093811,
+      "learning_rate": 2.382729044702748e-05,
+      "loss": 0.0058,
+      "step": 2810
+    },
+    {
+      "epoch": 4.195575385759435,
+      "grad_norm": 0.035988811403512955,
+      "learning_rate": 2.299137011575738e-05,
+      "loss": 0.0055,
+      "step": 2820
+    },
+    {
+      "epoch": 4.210448038668898,
+      "grad_norm": 0.0344134196639061,
+      "learning_rate": 2.2169157513071566e-05,
+      "loss": 0.0057,
+      "step": 2830
+    },
+    {
+      "epoch": 4.22532069157836,
+      "grad_norm": 0.03696177527308464,
+      "learning_rate": 2.136074137835107e-05,
+      "loss": 0.0056,
+      "step": 2840
+    },
+    {
+      "epoch": 4.240193344487823,
+      "grad_norm": 0.03733756020665169,
+      "learning_rate": 2.056620896195804e-05,
+      "loss": 0.0057,
+      "step": 2850
+    },
+    {
+      "epoch": 4.255065997397286,
+      "grad_norm": 0.03630942478775978,
+      "learning_rate": 1.978564601581919e-05,
+      "loss": 0.0056,
+      "step": 2860
+    },
+    {
+      "epoch": 4.269938650306749,
+      "grad_norm": 0.03577449545264244,
+      "learning_rate": 1.9019136784170635e-05,
+      "loss": 0.0055,
+      "step": 2870
+    },
+    {
+      "epoch": 4.284811303216211,
+      "grad_norm": 0.03209745138883591,
+      "learning_rate": 1.82667639944657e-05,
+      "loss": 0.0054,
+      "step": 2880
+    },
+    {
+      "epoch": 4.299683956125674,
+      "grad_norm": 0.03668665885925293,
+      "learning_rate": 1.752860884844646e-05,
+      "loss": 0.0055,
+      "step": 2890
+    },
+    {
+      "epoch": 4.314556609035137,
+      "grad_norm": 0.03498975560069084,
+      "learning_rate": 1.680475101337959e-05,
+      "loss": 0.0055,
+      "step": 2900
+    },
+    {
+      "epoch": 4.3294292619445995,
+      "grad_norm": 0.04088146984577179,
+      "learning_rate": 1.60952686134583e-05,
+      "loss": 0.0055,
+      "step": 2910
+    },
+    {
+      "epoch": 4.3443019148540625,
+      "grad_norm": 0.035557616502046585,
+      "learning_rate": 1.5400238221370413e-05,
+      "loss": 0.0056,
+      "step": 2920
+    },
+    {
+      "epoch": 4.3591745677635245,
+      "grad_norm": 0.03443196415901184,
+      "learning_rate": 1.4719734850034277e-05,
+      "loss": 0.0056,
+      "step": 2930
+    },
+    {
+      "epoch": 4.3740472206729875,
+      "grad_norm": 0.03481742739677429,
+      "learning_rate": 1.4053831944502508e-05,
+      "loss": 0.0057,
+      "step": 2940
+    },
+    {
+      "epoch": 4.38891987358245,
+      "grad_norm": 0.03648516163229942,
+      "learning_rate": 1.340260137403557e-05,
+      "loss": 0.0053,
+      "step": 2950
+    },
+    {
+      "epoch": 4.403792526491913,
+      "grad_norm": 0.03400832787156105,
+      "learning_rate": 1.2766113424344814e-05,
+      "loss": 0.0055,
+      "step": 2960
+    },
+    {
+      "epoch": 4.418665179401375,
+      "grad_norm": 0.03558880090713501,
+      "learning_rate": 1.21444367900069e-05,
+      "loss": 0.0055,
+      "step": 2970
+    },
+    {
+      "epoch": 4.433537832310838,
+      "grad_norm": 0.035319775342941284,
+      "learning_rate": 1.1537638567049729e-05,
+      "loss": 0.0055,
+      "step": 2980
+    },
+    {
+      "epoch": 4.448410485220301,
+      "grad_norm": 0.03432595729827881,
+      "learning_rate": 1.0945784245710848e-05,
+      "loss": 0.0054,
+      "step": 2990
+    },
+    {
+      "epoch": 4.463283138129764,
+      "grad_norm": 0.03571225702762604,
+      "learning_rate": 1.036893770336938e-05,
+      "loss": 0.0055,
+      "step": 3000
+    },
+    {
+      "epoch": 4.463283138129764,
+      "eval_loss": 0.03200867399573326,
+      "eval_runtime": 212.5457,
+      "eval_samples_per_second": 22.494,
+      "eval_steps_per_second": 22.494,
+      "step": 3000
+    },
+    {
+      "epoch": 4.478155791039226,
+      "grad_norm": 0.040391724556684494,
+      "learning_rate": 9.807161197651742e-06,
+      "loss": 0.0056,
+      "step": 3010
+    },
+    {
+      "epoch": 4.493028443948689,
+      "grad_norm": 0.03410281240940094,
+      "learning_rate": 9.260515359712517e-06,
+      "loss": 0.0055,
+      "step": 3020
+    },
+    {
+      "epoch": 4.507901096858152,
+      "grad_norm": 0.03447275608778,
+      "learning_rate": 8.729059187690479e-06,
+      "loss": 0.0054,
+      "step": 3030
+    },
+    {
+      "epoch": 4.522773749767615,
+      "grad_norm": 0.032652657479047775,
+      "learning_rate": 8.212850040341273e-06,
+      "loss": 0.0055,
+      "step": 3040
+    },
+    {
+      "epoch": 4.537646402677078,
+      "grad_norm": 0.035828616470098495,
+      "learning_rate": 7.711943630846684e-06,
+      "loss": 0.0053,
+      "step": 3050
+    },
+    {
+      "epoch": 4.55251905558654,
+      "grad_norm": 0.03351854532957077,
+      "learning_rate": 7.226394020801645e-06,
+      "loss": 0.0054,
+      "step": 3060
+    },
+    {
+      "epoch": 4.567391708496003,
+      "grad_norm": 0.03872072696685791,
+      "learning_rate": 6.7562536143796254e-06,
+      "loss": 0.0056,
+      "step": 3070
+    },
+    {
+      "epoch": 4.582264361405466,
+      "grad_norm": 0.03518550843000412,
+      "learning_rate": 6.301573152676664e-06,
+      "loss": 0.0054,
+      "step": 3080
+    },
+    {
+      "epoch": 4.597137014314928,
+      "grad_norm": 0.0351685993373394,
+      "learning_rate": 5.862401708235076e-06,
+      "loss": 0.0052,
+      "step": 3090
+    },
+    {
+      "epoch": 4.612009667224391,
+      "grad_norm": 0.0348668210208416,
+      "learning_rate": 5.438786679747081e-06,
+      "loss": 0.0055,
+      "step": 3100
+    },
+    {
+      "epoch": 4.626882320133854,
+      "grad_norm": 0.03660331293940544,
+      "learning_rate": 5.030773786939319e-06,
+      "loss": 0.0055,
+      "step": 3110
+    },
+    {
+      "epoch": 4.641754973043317,
+      "grad_norm": 0.04046601429581642,
+      "learning_rate": 4.638407065638322e-06,
+      "loss": 0.0054,
+      "step": 3120
+    },
+    {
+      "epoch": 4.65662762595278,
+      "grad_norm": 0.03230154886841774,
+      "learning_rate": 4.261728863017827e-06,
+      "loss": 0.0054,
+      "step": 3130
+    },
+    {
+      "epoch": 4.671500278862242,
+      "grad_norm": 0.034297142177820206,
+      "learning_rate": 3.900779833028472e-06,
+      "loss": 0.0054,
+      "step": 3140
+    },
+    {
+      "epoch": 4.686372931771705,
+      "grad_norm": 0.03240946680307388,
+      "learning_rate": 3.5555989320099952e-06,
+      "loss": 0.0053,
+      "step": 3150
+    },
+    {
+      "epoch": 4.701245584681168,
+      "grad_norm": 0.04137023165822029,
+      "learning_rate": 3.2262234144868116e-06,
+      "loss": 0.0054,
+      "step": 3160
+    },
+    {
+      "epoch": 4.7161182375906305,
+      "grad_norm": 0.030783316120505333,
+      "learning_rate": 2.912688829147214e-06,
+      "loss": 0.0052,
+      "step": 3170
+    },
+    {
+      "epoch": 4.730990890500093,
+      "grad_norm": 0.03588159382343292,
+      "learning_rate": 2.6150290150067588e-06,
+      "loss": 0.0055,
+      "step": 3180
+    },
+    {
+      "epoch": 4.7458635434095555,
+      "grad_norm": 0.03300805762410164,
+      "learning_rate": 2.3332760977559873e-06,
+      "loss": 0.0053,
+      "step": 3190
+    },
+    {
+      "epoch": 4.7607361963190185,
+      "grad_norm": 0.03986676409840584,
+      "learning_rate": 2.0674604862932654e-06,
+      "loss": 0.0055,
+      "step": 3200
+    },
+    {
+      "epoch": 4.775608849228481,
+      "grad_norm": 0.03252493590116501,
+      "learning_rate": 1.8176108694427927e-06,
+      "loss": 0.0052,
+      "step": 3210
+    },
+    {
+      "epoch": 4.790481502137943,
+      "grad_norm": 0.03938417136669159,
+      "learning_rate": 1.583754212858329e-06,
+      "loss": 0.0054,
+      "step": 3220
+    },
+    {
+      "epoch": 4.805354155047406,
+      "grad_norm": 0.03552339971065521,
+      "learning_rate": 1.3659157561127732e-06,
+      "loss": 0.0057,
+      "step": 3230
+    },
+    {
+      "epoch": 4.820226807956869,
+      "grad_norm": 0.03480495885014534,
+      "learning_rate": 1.1641190099741904e-06,
+      "loss": 0.0053,
+      "step": 3240
+    },
+    {
+      "epoch": 4.835099460866332,
+      "grad_norm": 0.03451026231050491,
+      "learning_rate": 9.783857538683603e-07,
+      "loss": 0.0053,
+      "step": 3250
+    },
+    {
+      "epoch": 4.849972113775795,
+      "grad_norm": 0.033308371901512146,
+      "learning_rate": 8.087360335281235e-07,
+      "loss": 0.0055,
+      "step": 3260
+    },
+    {
+      "epoch": 4.864844766685257,
+      "grad_norm": 0.035610370337963104,
+      "learning_rate": 6.551881588299279e-07,
+      "loss": 0.0054,
+      "step": 3270
+    },
+    {
+      "epoch": 4.87971741959472,
+      "grad_norm": 0.030910024419426918,
+      "learning_rate": 5.177587018176777e-07,
+      "loss": 0.0054,
+      "step": 3280
+    },
+    {
+      "epoch": 4.894590072504183,
+      "grad_norm": 0.034942276775836945,
+      "learning_rate": 3.964624949141626e-07,
+      "loss": 0.0054,
+      "step": 3290
+    },
+    {
+      "epoch": 4.909462725413646,
+      "grad_norm": 0.03491232544183731,
+      "learning_rate": 2.913126293202228e-07,
+      "loss": 0.0053,
+      "step": 3300
+    },
+    {
+      "epoch": 4.924335378323108,
+      "grad_norm": 0.0331818163394928,
+      "learning_rate": 2.0232045360184523e-07,
+      "loss": 0.0051,
+      "step": 3310
+    },
+    {
+      "epoch": 4.939208031232571,
+      "grad_norm": 0.034393060952425,
+      "learning_rate": 1.2949557246537678e-07,
+      "loss": 0.0053,
+      "step": 3320
+    },
+    {
+      "epoch": 4.954080684142034,
+      "grad_norm": 0.03940508887171745,
+      "learning_rate": 7.284584572085361e-08,
+      "loss": 0.0052,
+      "step": 3330
+    },
+    {
+      "epoch": 4.968953337051497,
+      "grad_norm": 0.03125544637441635,
+      "learning_rate": 3.237738743372964e-08,
+      "loss": 0.0052,
+      "step": 3340
+    },
+    {
+      "epoch": 4.983825989960959,
+      "grad_norm": 0.03558258339762688,
+      "learning_rate": 8.094565265054365e-09,
+      "loss": 0.0054,
+      "step": 3350
+    },
+    {
+      "epoch": 4.998698642870422,
+      "grad_norm": 0.03360743075609207,
+      "learning_rate": 0.0,
+      "loss": 0.0054,
+      "step": 3360
+    },
+    {
+      "epoch": 4.998698642870422,
+      "step": 3360,
+      "total_flos": 5.14290499398402e+18,
+      "train_loss": 0.19145491501161208,
+      "train_runtime": 31931.328,
+      "train_samples_per_second": 6.737,
+      "train_steps_per_second": 0.105
+    }
+  ],
+  "logging_steps": 10,
+  "max_steps": 3360,
+  "num_input_tokens_seen": 0,
+  "num_train_epochs": 5,
+  "save_steps": 500,
+  "stateful_callbacks": {
+    "TrainerControl": {
+      "args": {
+        "should_epoch_stop": false,
+        "should_evaluate": false,
+        "should_log": false,
+        "should_save": true,
+        "should_training_stop": true
+      },
+      "attributes": {}
+    }
+  },
+  "total_flos": 5.14290499398402e+18,
+  "train_batch_size": 8,
+  "trial_name": null,
+  "trial_params": null
+}

training_eval_loss.png ADDED Viewed

training_loss.png ADDED Viewed