{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.998698642870422, "eval_steps": 500, "global_step": 3360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014872652909462726, "grad_norm": 1.0425002574920654, "learning_rate": 8.928571428571428e-06, "loss": 0.4346, "step": 10 }, { "epoch": 0.02974530581892545, "grad_norm": 0.5861272811889648, "learning_rate": 1.7857142857142855e-05, "loss": 0.3527, "step": 20 }, { "epoch": 0.044617958728388175, "grad_norm": 0.5629558563232422, "learning_rate": 2.6785714285714284e-05, "loss": 0.2922, "step": 30 }, { "epoch": 0.0594906116378509, "grad_norm": 0.49566933512687683, "learning_rate": 3.571428571428571e-05, "loss": 0.281, "step": 40 }, { "epoch": 0.07436326454731362, "grad_norm": 0.45387113094329834, "learning_rate": 4.4642857142857136e-05, "loss": 0.2689, "step": 50 }, { "epoch": 0.08923591745677635, "grad_norm": 0.43913352489471436, "learning_rate": 5.357142857142857e-05, "loss": 0.2506, "step": 60 }, { "epoch": 0.10410857036623908, "grad_norm": 0.7242547869682312, "learning_rate": 6.25e-05, "loss": 0.229, "step": 70 }, { "epoch": 0.1189812232757018, "grad_norm": 0.5109072923660278, "learning_rate": 7.142857142857142e-05, "loss": 0.2373, "step": 80 }, { "epoch": 0.13385387618516453, "grad_norm": 0.5291035175323486, "learning_rate": 8.035714285714285e-05, "loss": 0.2405, "step": 90 }, { "epoch": 0.14872652909462725, "grad_norm": 0.48036572337150574, "learning_rate": 8.928571428571427e-05, "loss": 0.2274, "step": 100 }, { "epoch": 0.16359918200409, "grad_norm": 0.3294093906879425, "learning_rate": 9.82142857142857e-05, "loss": 0.2038, "step": 110 }, { "epoch": 0.1784718349135527, "grad_norm": 0.49968525767326355, "learning_rate": 0.00010714285714285714, "loss": 0.2084, "step": 120 }, { "epoch": 0.19334448782301544, "grad_norm": 0.32227209210395813, "learning_rate": 0.00011607142857142857, "loss": 0.1981, "step": 130 }, { "epoch": 0.20821714073247816, "grad_norm": 0.37266677618026733, "learning_rate": 0.000125, "loss": 0.2192, "step": 140 }, { "epoch": 0.22308979364194087, "grad_norm": 0.5228686928749084, "learning_rate": 0.00013392857142857144, "loss": 0.2014, "step": 150 }, { "epoch": 0.2379624465514036, "grad_norm": 0.4202245771884918, "learning_rate": 0.00014285714285714284, "loss": 0.1912, "step": 160 }, { "epoch": 0.25283509946086635, "grad_norm": 0.45801258087158203, "learning_rate": 0.00015178571428571427, "loss": 0.212, "step": 170 }, { "epoch": 0.26770775237032907, "grad_norm": 0.4326329827308655, "learning_rate": 0.0001607142857142857, "loss": 0.1973, "step": 180 }, { "epoch": 0.2825804052797918, "grad_norm": 0.38971471786499023, "learning_rate": 0.0001696428571428571, "loss": 0.1907, "step": 190 }, { "epoch": 0.2974530581892545, "grad_norm": 0.3728097975254059, "learning_rate": 0.00017857142857142854, "loss": 0.192, "step": 200 }, { "epoch": 0.3123257110987172, "grad_norm": 0.34695690870285034, "learning_rate": 0.00018749999999999998, "loss": 0.1855, "step": 210 }, { "epoch": 0.32719836400818, "grad_norm": 0.41753408312797546, "learning_rate": 0.0001964285714285714, "loss": 0.1883, "step": 220 }, { "epoch": 0.3420710169176427, "grad_norm": 0.27681878209114075, "learning_rate": 0.00020535714285714284, "loss": 0.1809, "step": 230 }, { "epoch": 0.3569436698271054, "grad_norm": 2.382871150970459, "learning_rate": 0.00021428571428571427, "loss": 0.1735, "step": 240 }, { "epoch": 0.3718163227365681, "grad_norm": 160.2670440673828, "learning_rate": 0.0002232142857142857, "loss": 1.2159, "step": 250 }, { "epoch": 0.3866889756460309, "grad_norm": 21.60050392150879, "learning_rate": 0.00023214285714285714, "loss": 5.4026, "step": 260 }, { "epoch": 0.4015616285554936, "grad_norm": 13.928524017333984, "learning_rate": 0.00024107142857142857, "loss": 4.3573, "step": 270 }, { "epoch": 0.4164342814649563, "grad_norm": 5.3707685470581055, "learning_rate": 0.00025, "loss": 3.2782, "step": 280 }, { "epoch": 0.431306934374419, "grad_norm": 5.556903839111328, "learning_rate": 0.0002589285714285714, "loss": 2.8033, "step": 290 }, { "epoch": 0.44617958728388174, "grad_norm": 2.512521505355835, "learning_rate": 0.00026785714285714287, "loss": 2.5486, "step": 300 }, { "epoch": 0.4610522401933445, "grad_norm": 3.592169761657715, "learning_rate": 0.0002767857142857143, "loss": 2.2779, "step": 310 }, { "epoch": 0.4759248931028072, "grad_norm": 2.791459321975708, "learning_rate": 0.0002857142857142857, "loss": 2.1011, "step": 320 }, { "epoch": 0.49079754601226994, "grad_norm": 1.1407463550567627, "learning_rate": 0.0002946428571428571, "loss": 1.9929, "step": 330 }, { "epoch": 0.5056701989217327, "grad_norm": 1.795841097831726, "learning_rate": 0.0002999987048597728, "loss": 1.8818, "step": 340 }, { "epoch": 0.5205428518311954, "grad_norm": 1.4798821210861206, "learning_rate": 0.00029998413478906613, "loss": 1.772, "step": 350 }, { "epoch": 0.5354155047406581, "grad_norm": 1.5337024927139282, "learning_rate": 0.0002999533773001224, "loss": 1.6782, "step": 360 }, { "epoch": 0.5502881576501208, "grad_norm": 1.332065463066101, "learning_rate": 0.00029990643571252174, "loss": 1.6035, "step": 370 }, { "epoch": 0.5651608105595836, "grad_norm": 1.0516103506088257, "learning_rate": 0.00029984331509255415, "loss": 1.5053, "step": 380 }, { "epoch": 0.5800334634690463, "grad_norm": 1.034192442893982, "learning_rate": 0.00029976402225267247, "loss": 1.3906, "step": 390 }, { "epoch": 0.594906116378509, "grad_norm": 1.2757515907287598, "learning_rate": 0.0002996685657507577, "loss": 1.2592, "step": 400 }, { "epoch": 0.6097787692879717, "grad_norm": 0.8252782225608826, "learning_rate": 0.000299556955889195, "loss": 1.0907, "step": 410 }, { "epoch": 0.6246514221974344, "grad_norm": 1.020588994026184, "learning_rate": 0.0002994292047137618, "loss": 0.9035, "step": 420 }, { "epoch": 0.6395240751068972, "grad_norm": 0.5973761677742004, "learning_rate": 0.0002992853260123278, "loss": 0.7538, "step": 430 }, { "epoch": 0.65439672801636, "grad_norm": 0.6886543035507202, "learning_rate": 0.0002991253353133668, "loss": 0.6621, "step": 440 }, { "epoch": 0.6692693809258227, "grad_norm": 0.44221287965774536, "learning_rate": 0.00029894924988428087, "loss": 0.59, "step": 450 }, { "epoch": 0.6841420338352854, "grad_norm": 0.7888408899307251, "learning_rate": 0.00029875708872953677, "loss": 0.539, "step": 460 }, { "epoch": 0.6990146867447481, "grad_norm": 0.43110209703445435, "learning_rate": 0.00029854887258861447, "loss": 0.4903, "step": 470 }, { "epoch": 0.7138873396542108, "grad_norm": 0.41334015130996704, "learning_rate": 0.0002983246239337692, "loss": 0.4488, "step": 480 }, { "epoch": 0.7287599925636735, "grad_norm": 0.3482460379600525, "learning_rate": 0.0002980843669676061, "loss": 0.4165, "step": 490 }, { "epoch": 0.7436326454731362, "grad_norm": 0.3593901991844177, "learning_rate": 0.0002978281276204675, "loss": 0.3821, "step": 500 }, { "epoch": 0.7436326454731362, "eval_loss": 0.37597203254699707, "eval_runtime": 212.4955, "eval_samples_per_second": 22.499, "eval_steps_per_second": 22.499, "step": 500 }, { "epoch": 0.758505298382599, "grad_norm": 0.4221905469894409, "learning_rate": 0.00029755593354763516, "loss": 0.3627, "step": 510 }, { "epoch": 0.7733779512920618, "grad_norm": 0.31105437874794006, "learning_rate": 0.0002972678141263449, "loss": 0.3346, "step": 520 }, { "epoch": 0.7882506042015245, "grad_norm": 0.2600822150707245, "learning_rate": 0.000296963800452616, "loss": 0.3217, "step": 530 }, { "epoch": 0.8031232571109872, "grad_norm": 0.21437157690525055, "learning_rate": 0.0002966439253378957, "loss": 0.3095, "step": 540 }, { "epoch": 0.8179959100204499, "grad_norm": 0.22641418874263763, "learning_rate": 0.000296308223305517, "loss": 0.2866, "step": 550 }, { "epoch": 0.8328685629299126, "grad_norm": 0.2200980931520462, "learning_rate": 0.00029595673058697357, "loss": 0.2579, "step": 560 }, { "epoch": 0.8477412158393753, "grad_norm": 0.21351036429405212, "learning_rate": 0.0002955894851180086, "loss": 0.2727, "step": 570 }, { "epoch": 0.862613868748838, "grad_norm": 0.2137759029865265, "learning_rate": 0.0002952065265345211, "loss": 0.2621, "step": 580 }, { "epoch": 0.8774865216583008, "grad_norm": 0.18923349678516388, "learning_rate": 0.00029480789616828765, "loss": 0.2647, "step": 590 }, { "epoch": 0.8923591745677635, "grad_norm": 0.1697588562965393, "learning_rate": 0.00029439363704250176, "loss": 0.2434, "step": 600 }, { "epoch": 0.9072318274772263, "grad_norm": 0.15528830885887146, "learning_rate": 0.0002939637938671306, "loss": 0.2293, "step": 610 }, { "epoch": 0.922104480386689, "grad_norm": 0.43390974402427673, "learning_rate": 0.0002935184130340893, "loss": 0.228, "step": 620 }, { "epoch": 0.9369771332961517, "grad_norm": 0.2026420682668686, "learning_rate": 0.000293057542612234, "loss": 0.2355, "step": 630 }, { "epoch": 0.9518497862056144, "grad_norm": 0.16864228248596191, "learning_rate": 0.00029258123234217435, "loss": 0.2213, "step": 640 }, { "epoch": 0.9667224391150772, "grad_norm": 0.15947186946868896, "learning_rate": 0.0002920895336309044, "loss": 0.2079, "step": 650 }, { "epoch": 0.9815950920245399, "grad_norm": 0.21965055167675018, "learning_rate": 0.0002915824995462551, "loss": 0.2002, "step": 660 }, { "epoch": 0.9964677449340026, "grad_norm": 0.23223313689231873, "learning_rate": 0.00029106018481116626, "loss": 0.1983, "step": 670 }, { "epoch": 1.0117122141662018, "grad_norm": 0.26117920875549316, "learning_rate": 0.00029052264579778063, "loss": 0.2175, "step": 680 }, { "epoch": 1.0265848670756645, "grad_norm": 0.176736518740654, "learning_rate": 0.00028996994052135996, "loss": 0.1831, "step": 690 }, { "epoch": 1.0414575199851273, "grad_norm": 0.17873461544513702, "learning_rate": 0.0002894021286340233, "loss": 0.1784, "step": 700 }, { "epoch": 1.05633017289459, "grad_norm": 0.2646450996398926, "learning_rate": 0.0002888192714183092, "loss": 0.1784, "step": 710 }, { "epoch": 1.0712028258040527, "grad_norm": 0.16840551793575287, "learning_rate": 0.00028822143178056114, "loss": 0.1726, "step": 720 }, { "epoch": 1.0860754787135156, "grad_norm": 0.1423952877521515, "learning_rate": 0.0002876086742441387, "loss": 0.1608, "step": 730 }, { "epoch": 1.1009481316229783, "grad_norm": 0.16237640380859375, "learning_rate": 0.0002869810649424535, "loss": 0.179, "step": 740 }, { "epoch": 1.115820784532441, "grad_norm": 0.158773735165596, "learning_rate": 0.0002863386716118316, "loss": 0.1742, "step": 750 }, { "epoch": 1.1306934374419038, "grad_norm": 0.17627516388893127, "learning_rate": 0.0002856815635842029, "loss": 0.1821, "step": 760 }, { "epoch": 1.1455660903513665, "grad_norm": 0.23613831400871277, "learning_rate": 0.00028500981177961816, "loss": 0.156, "step": 770 }, { "epoch": 1.1604387432608292, "grad_norm": 0.16501256823539734, "learning_rate": 0.0002843234886985951, "loss": 0.1517, "step": 780 }, { "epoch": 1.175311396170292, "grad_norm": 0.2365158647298813, "learning_rate": 0.00028362266841429345, "loss": 0.1391, "step": 790 }, { "epoch": 1.1901840490797546, "grad_norm": 0.17508777976036072, "learning_rate": 0.00028290742656452014, "loss": 0.1434, "step": 800 }, { "epoch": 1.2050567019892173, "grad_norm": 0.145797461271286, "learning_rate": 0.0002821778403435663, "loss": 0.1607, "step": 810 }, { "epoch": 1.21992935489868, "grad_norm": 0.15968403220176697, "learning_rate": 0.00028143398849387577, "loss": 0.1536, "step": 820 }, { "epoch": 1.2348020078081428, "grad_norm": 0.1553070992231369, "learning_rate": 0.00028067595129754647, "loss": 0.1481, "step": 830 }, { "epoch": 1.2496746607176055, "grad_norm": 0.1769135743379593, "learning_rate": 0.0002799038105676658, "loss": 0.1285, "step": 840 }, { "epoch": 1.2645473136270682, "grad_norm": 0.1639111191034317, "learning_rate": 0.0002791176496394808, "loss": 0.144, "step": 850 }, { "epoch": 1.279419966536531, "grad_norm": 0.19045153260231018, "learning_rate": 0.00027831755336140416, "loss": 0.1347, "step": 860 }, { "epoch": 1.2942926194459936, "grad_norm": 0.18079642951488495, "learning_rate": 0.00027750360808585637, "loss": 0.1254, "step": 870 }, { "epoch": 1.3091652723554563, "grad_norm": 0.18368874490261078, "learning_rate": 0.00027667590165994613, "loss": 0.1289, "step": 880 }, { "epoch": 1.324037925264919, "grad_norm": 0.20005619525909424, "learning_rate": 0.00027583452341598935, "loss": 0.1246, "step": 890 }, { "epoch": 1.338910578174382, "grad_norm": 0.1317131668329239, "learning_rate": 0.0002749795641618673, "loss": 0.1238, "step": 900 }, { "epoch": 1.3537832310838445, "grad_norm": 0.15287995338439941, "learning_rate": 0.00027411111617122656, "loss": 0.1224, "step": 910 }, { "epoch": 1.3686558839933074, "grad_norm": 0.1613466739654541, "learning_rate": 0.0002732292731735196, "loss": 0.1178, "step": 920 }, { "epoch": 1.3835285369027701, "grad_norm": 0.1685304194688797, "learning_rate": 0.000272334130343889, "loss": 0.1201, "step": 930 }, { "epoch": 1.3984011898122328, "grad_norm": 0.19208119809627533, "learning_rate": 0.0002714257842928956, "loss": 0.1103, "step": 940 }, { "epoch": 1.4132738427216955, "grad_norm": 0.17899583280086517, "learning_rate": 0.00027050433305609125, "loss": 0.1128, "step": 950 }, { "epoch": 1.4281464956311583, "grad_norm": 0.19848547875881195, "learning_rate": 0.0002695698760834384, "loss": 0.1112, "step": 960 }, { "epoch": 1.443019148540621, "grad_norm": 0.1710231602191925, "learning_rate": 0.0002686225142285762, "loss": 0.1107, "step": 970 }, { "epoch": 1.4578918014500837, "grad_norm": 0.1552249938249588, "learning_rate": 0.0002676623497379363, "loss": 0.0984, "step": 980 }, { "epoch": 1.4727644543595464, "grad_norm": 0.1702568084001541, "learning_rate": 0.0002666894862397072, "loss": 0.1109, "step": 990 }, { "epoch": 1.487637107269009, "grad_norm": 0.12360525131225586, "learning_rate": 0.00026570402873264996, "loss": 0.1018, "step": 1000 }, { "epoch": 1.487637107269009, "eval_loss": 0.10193677991628647, "eval_runtime": 212.152, "eval_samples_per_second": 22.536, "eval_steps_per_second": 22.536, "step": 1000 }, { "epoch": 1.5025097601784718, "grad_norm": 0.14356306195259094, "learning_rate": 0.0002647060835747659, "loss": 0.101, "step": 1010 }, { "epoch": 1.5173824130879345, "grad_norm": 0.12723973393440247, "learning_rate": 0.00026369575847181795, "loss": 0.095, "step": 1020 }, { "epoch": 1.5322550659973972, "grad_norm": 0.12857410311698914, "learning_rate": 0.0002626731624657058, "loss": 0.0915, "step": 1030 }, { "epoch": 1.54712771890686, "grad_norm": 0.1593610793352127, "learning_rate": 0.0002616384059226977, "loss": 0.0993, "step": 1040 }, { "epoch": 1.562000371816323, "grad_norm": 0.11687605082988739, "learning_rate": 0.0002605916005215186, "loss": 0.0894, "step": 1050 }, { "epoch": 1.5768730247257854, "grad_norm": 0.1873299479484558, "learning_rate": 0.0002595328592412969, "loss": 0.097, "step": 1060 }, { "epoch": 1.5917456776352483, "grad_norm": 0.1516319364309311, "learning_rate": 0.00025846229634937136, "loss": 0.0931, "step": 1070 }, { "epoch": 1.6066183305447108, "grad_norm": 0.1431397646665573, "learning_rate": 0.0002573800273889577, "loss": 0.0918, "step": 1080 }, { "epoch": 1.6214909834541738, "grad_norm": 0.17975349724292755, "learning_rate": 0.0002562861691666793, "loss": 0.0892, "step": 1090 }, { "epoch": 1.6363636363636362, "grad_norm": 0.1414797306060791, "learning_rate": 0.0002551808397399597, "loss": 0.0952, "step": 1100 }, { "epoch": 1.6512362892730992, "grad_norm": 0.151850625872612, "learning_rate": 0.0002540641584042812, "loss": 0.1008, "step": 1110 }, { "epoch": 1.6661089421825617, "grad_norm": 0.1266675442457199, "learning_rate": 0.00025293624568031, "loss": 0.0782, "step": 1120 }, { "epoch": 1.6809815950920246, "grad_norm": 0.12076599150896072, "learning_rate": 0.0002517972233008882, "loss": 0.0772, "step": 1130 }, { "epoch": 1.6958542480014873, "grad_norm": 0.125094935297966, "learning_rate": 0.0002506472141978955, "loss": 0.0837, "step": 1140 }, { "epoch": 1.71072690091095, "grad_norm": 0.13272984325885773, "learning_rate": 0.0002494863424889819, "loss": 0.0736, "step": 1150 }, { "epoch": 1.7255995538204127, "grad_norm": 0.16893050074577332, "learning_rate": 0.00024831473346417153, "loss": 0.0856, "step": 1160 }, { "epoch": 1.7404722067298755, "grad_norm": 0.11702137440443039, "learning_rate": 0.00024713251357234053, "loss": 0.0799, "step": 1170 }, { "epoch": 1.7553448596393382, "grad_norm": 0.13682794570922852, "learning_rate": 0.00024593981040756997, "loss": 0.089, "step": 1180 }, { "epoch": 1.7702175125488009, "grad_norm": 0.13676613569259644, "learning_rate": 0.0002447367526953746, "loss": 0.0797, "step": 1190 }, { "epoch": 1.7850901654582636, "grad_norm": 0.13324877619743347, "learning_rate": 0.00024352347027881003, "loss": 0.0792, "step": 1200 }, { "epoch": 1.7999628183677263, "grad_norm": 0.11255478858947754, "learning_rate": 0.00024230009410445893, "loss": 0.0763, "step": 1210 }, { "epoch": 1.814835471277189, "grad_norm": 0.10950371623039246, "learning_rate": 0.0002410667562082985, "loss": 0.0663, "step": 1220 }, { "epoch": 1.8297081241866517, "grad_norm": 0.11777317523956299, "learning_rate": 0.00023982358970145004, "loss": 0.0694, "step": 1230 }, { "epoch": 1.8445807770961147, "grad_norm": 0.1194106712937355, "learning_rate": 0.00023857072875581244, "loss": 0.0703, "step": 1240 }, { "epoch": 1.8594534300055772, "grad_norm": 0.11233114451169968, "learning_rate": 0.00023730830858958177, "loss": 0.0655, "step": 1250 }, { "epoch": 1.87432608291504, "grad_norm": 0.11641702055931091, "learning_rate": 0.00023603646545265687, "loss": 0.0645, "step": 1260 }, { "epoch": 1.8891987358245026, "grad_norm": 0.1414889097213745, "learning_rate": 0.00023475533661193495, "loss": 0.068, "step": 1270 }, { "epoch": 1.9040713887339655, "grad_norm": 0.10632241517305374, "learning_rate": 0.00023346506033649614, "loss": 0.064, "step": 1280 }, { "epoch": 1.918944041643428, "grad_norm": 0.10176625102758408, "learning_rate": 0.0002321657758826807, "loss": 0.062, "step": 1290 }, { "epoch": 1.933816694552891, "grad_norm": 0.09434150904417038, "learning_rate": 0.00023085762347905943, "loss": 0.0684, "step": 1300 }, { "epoch": 1.9486893474623534, "grad_norm": 0.12967799603939056, "learning_rate": 0.00022954074431129915, "loss": 0.0605, "step": 1310 }, { "epoch": 1.9635620003718164, "grad_norm": 0.1181391179561615, "learning_rate": 0.0002282152805069247, "loss": 0.0654, "step": 1320 }, { "epoch": 1.978434653281279, "grad_norm": 0.10801093280315399, "learning_rate": 0.00022688137511997977, "loss": 0.07, "step": 1330 }, { "epoch": 1.9933073061907418, "grad_norm": 0.11437591165304184, "learning_rate": 0.00022553917211558713, "loss": 0.0578, "step": 1340 }, { "epoch": 2.008551775422941, "grad_norm": 0.11018254607915878, "learning_rate": 0.0002241888163544111, "loss": 0.0565, "step": 1350 }, { "epoch": 2.0234244283324037, "grad_norm": 0.08331198990345001, "learning_rate": 0.0002228304535770228, "loss": 0.0399, "step": 1360 }, { "epoch": 2.0382970812418666, "grad_norm": 0.09547814726829529, "learning_rate": 0.00022146423038817102, "loss": 0.0438, "step": 1370 }, { "epoch": 2.053169734151329, "grad_norm": 0.10641171038150787, "learning_rate": 0.00022009029424095928, "loss": 0.0384, "step": 1380 }, { "epoch": 2.068042387060792, "grad_norm": 0.10844069719314575, "learning_rate": 0.0002187087934209318, "loss": 0.044, "step": 1390 }, { "epoch": 2.0829150399702545, "grad_norm": 0.10333788394927979, "learning_rate": 0.00021731987703006933, "loss": 0.041, "step": 1400 }, { "epoch": 2.0977876928797174, "grad_norm": 0.10635129362344742, "learning_rate": 0.0002159236949706967, "loss": 0.04, "step": 1410 }, { "epoch": 2.11266034578918, "grad_norm": 0.09010270237922668, "learning_rate": 0.00021452039792930474, "loss": 0.0402, "step": 1420 }, { "epoch": 2.127532998698643, "grad_norm": 0.09274252504110336, "learning_rate": 0.00021311013736028658, "loss": 0.0384, "step": 1430 }, { "epoch": 2.1424056516081054, "grad_norm": 0.08550871163606644, "learning_rate": 0.00021169306546959174, "loss": 0.0428, "step": 1440 }, { "epoch": 2.1572783045175683, "grad_norm": 0.10152186453342438, "learning_rate": 0.00021026933519829896, "loss": 0.0442, "step": 1450 }, { "epoch": 2.1721509574270312, "grad_norm": 0.08528181910514832, "learning_rate": 0.00020883910020610957, "loss": 0.0375, "step": 1460 }, { "epoch": 2.1870236103364937, "grad_norm": 0.09736708551645279, "learning_rate": 0.00020740251485476345, "loss": 0.0387, "step": 1470 }, { "epoch": 2.2018962632459567, "grad_norm": 0.09133671224117279, "learning_rate": 0.00020595973419137908, "loss": 0.0373, "step": 1480 }, { "epoch": 2.216768916155419, "grad_norm": 0.08406363427639008, "learning_rate": 0.00020451091393171964, "loss": 0.0381, "step": 1490 }, { "epoch": 2.231641569064882, "grad_norm": 0.08503925055265427, "learning_rate": 0.00020305621044338718, "loss": 0.0376, "step": 1500 }, { "epoch": 2.231641569064882, "eval_loss": 0.051042910665273666, "eval_runtime": 212.5441, "eval_samples_per_second": 22.494, "eval_steps_per_second": 22.494, "step": 1500 }, { "epoch": 2.2465142219743446, "grad_norm": 0.09201103448867798, "learning_rate": 0.00020159578072894606, "loss": 0.0393, "step": 1510 }, { "epoch": 2.2613868748838075, "grad_norm": 0.09499834477901459, "learning_rate": 0.00020012978240897814, "loss": 0.0346, "step": 1520 }, { "epoch": 2.27625952779327, "grad_norm": 0.09396501630544662, "learning_rate": 0.00019865837370507106, "loss": 0.039, "step": 1530 }, { "epoch": 2.291132180702733, "grad_norm": 0.08983522653579712, "learning_rate": 0.00019718171342274205, "loss": 0.0387, "step": 1540 }, { "epoch": 2.3060048336121954, "grad_norm": 0.1118871420621872, "learning_rate": 0.00019569996093429814, "loss": 0.0379, "step": 1550 }, { "epoch": 2.3208774865216584, "grad_norm": 0.08434595167636871, "learning_rate": 0.00019421327616163563, "loss": 0.0372, "step": 1560 }, { "epoch": 2.335750139431121, "grad_norm": 0.0915694460272789, "learning_rate": 0.00019272181955898017, "loss": 0.036, "step": 1570 }, { "epoch": 2.350622792340584, "grad_norm": 0.08459066599607468, "learning_rate": 0.0001912257520955692, "loss": 0.0363, "step": 1580 }, { "epoch": 2.3654954452500463, "grad_norm": 0.09195558726787567, "learning_rate": 0.00018972523523827907, "loss": 0.0389, "step": 1590 }, { "epoch": 2.3803680981595092, "grad_norm": 0.09830203652381897, "learning_rate": 0.0001882204309341982, "loss": 0.0373, "step": 1600 }, { "epoch": 2.3952407510689717, "grad_norm": 0.08541320264339447, "learning_rate": 0.00018671150159314855, "loss": 0.0342, "step": 1610 }, { "epoch": 2.4101134039784347, "grad_norm": 0.08817029744386673, "learning_rate": 0.00018519861007015729, "loss": 0.0371, "step": 1620 }, { "epoch": 2.4249860568878976, "grad_norm": 0.08839129656553268, "learning_rate": 0.00018368191964788, "loss": 0.0355, "step": 1630 }, { "epoch": 2.43985870979736, "grad_norm": 0.08589951694011688, "learning_rate": 0.00018216159401897812, "loss": 0.0339, "step": 1640 }, { "epoch": 2.454731362706823, "grad_norm": 0.09998754411935806, "learning_rate": 0.00018063779726845203, "loss": 0.0339, "step": 1650 }, { "epoch": 2.4696040156162855, "grad_norm": 0.08363664150238037, "learning_rate": 0.0001791106938559317, "loss": 0.0357, "step": 1660 }, { "epoch": 2.4844766685257484, "grad_norm": 0.08930620551109314, "learning_rate": 0.00017758044859792705, "loss": 0.0347, "step": 1670 }, { "epoch": 2.499349321435211, "grad_norm": 0.08270251750946045, "learning_rate": 0.00017604722665003956, "loss": 0.0332, "step": 1680 }, { "epoch": 2.514221974344674, "grad_norm": 0.09085123986005783, "learning_rate": 0.00017451119348913744, "loss": 0.0357, "step": 1690 }, { "epoch": 2.5290946272541364, "grad_norm": 0.0897296592593193, "learning_rate": 0.00017297251489549638, "loss": 0.0368, "step": 1700 }, { "epoch": 2.5439672801635993, "grad_norm": 0.07172433286905289, "learning_rate": 0.000171431356934907, "loss": 0.0371, "step": 1710 }, { "epoch": 2.558839933073062, "grad_norm": 0.0848449245095253, "learning_rate": 0.0001698878859407519, "loss": 0.032, "step": 1720 }, { "epoch": 2.5737125859825247, "grad_norm": 0.08270355314016342, "learning_rate": 0.00016834226849605371, "loss": 0.0333, "step": 1730 }, { "epoch": 2.588585238891987, "grad_norm": 0.07130729407072067, "learning_rate": 0.00016679467141549617, "loss": 0.0324, "step": 1740 }, { "epoch": 2.60345789180145, "grad_norm": 0.07863139361143112, "learning_rate": 0.00016524526172742026, "loss": 0.0295, "step": 1750 }, { "epoch": 2.6183305447109126, "grad_norm": 0.08600688725709915, "learning_rate": 0.00016369420665579725, "loss": 0.0342, "step": 1760 }, { "epoch": 2.6332031976203756, "grad_norm": 0.10146727412939072, "learning_rate": 0.0001621416736021805, "loss": 0.032, "step": 1770 }, { "epoch": 2.648075850529838, "grad_norm": 0.0812121257185936, "learning_rate": 0.00016058783012763844, "loss": 0.0341, "step": 1780 }, { "epoch": 2.662948503439301, "grad_norm": 0.0973149985074997, "learning_rate": 0.00015903284393466987, "loss": 0.0313, "step": 1790 }, { "epoch": 2.677821156348764, "grad_norm": 0.0835902988910675, "learning_rate": 0.00015747688284910457, "loss": 0.0298, "step": 1800 }, { "epoch": 2.6926938092582264, "grad_norm": 0.07972200214862823, "learning_rate": 0.00015592011480198992, "loss": 0.0346, "step": 1810 }, { "epoch": 2.707566462167689, "grad_norm": 0.07594762742519379, "learning_rate": 0.0001543627078114667, "loss": 0.0338, "step": 1820 }, { "epoch": 2.722439115077152, "grad_norm": 0.07757771015167236, "learning_rate": 0.00015280482996463533, "loss": 0.0315, "step": 1830 }, { "epoch": 2.737311767986615, "grad_norm": 0.06432707607746124, "learning_rate": 0.00015124664939941457, "loss": 0.0319, "step": 1840 }, { "epoch": 2.7521844208960773, "grad_norm": 0.07696104794740677, "learning_rate": 0.00014968833428639474, "loss": 0.0301, "step": 1850 }, { "epoch": 2.7670570738055402, "grad_norm": 0.07426641881465912, "learning_rate": 0.00014813005281068774, "loss": 0.0285, "step": 1860 }, { "epoch": 2.7819297267150027, "grad_norm": 0.0765393078327179, "learning_rate": 0.00014657197315377495, "loss": 0.0313, "step": 1870 }, { "epoch": 2.7968023796244657, "grad_norm": 0.07151610404253006, "learning_rate": 0.00014501426347535598, "loss": 0.03, "step": 1880 }, { "epoch": 2.811675032533928, "grad_norm": 0.07834175229072571, "learning_rate": 0.0001434570918951996, "loss": 0.0286, "step": 1890 }, { "epoch": 2.826547685443391, "grad_norm": 0.09932053834199905, "learning_rate": 0.00014190062647499892, "loss": 0.0307, "step": 1900 }, { "epoch": 2.8414203383528536, "grad_norm": 0.08595503121614456, "learning_rate": 0.00014034503520023297, "loss": 0.0306, "step": 1910 }, { "epoch": 2.8562929912623165, "grad_norm": 0.08349858224391937, "learning_rate": 0.00013879048596203636, "loss": 0.0306, "step": 1920 }, { "epoch": 2.871165644171779, "grad_norm": 0.07905739545822144, "learning_rate": 0.0001372371465390794, "loss": 0.0305, "step": 1930 }, { "epoch": 2.886038297081242, "grad_norm": 0.06820567697286606, "learning_rate": 0.0001356851845794598, "loss": 0.0276, "step": 1940 }, { "epoch": 2.9009109499907044, "grad_norm": 0.07227708399295807, "learning_rate": 0.00013413476758260934, "loss": 0.0267, "step": 1950 }, { "epoch": 2.9157836029001674, "grad_norm": 0.09035148471593857, "learning_rate": 0.00013258606288121542, "loss": 0.0287, "step": 1960 }, { "epoch": 2.9306562558096303, "grad_norm": 0.08626757562160492, "learning_rate": 0.00013103923762316198, "loss": 0.0298, "step": 1970 }, { "epoch": 2.945528908719093, "grad_norm": 0.0765102431178093, "learning_rate": 0.00012949445875348902, "loss": 0.0274, "step": 1980 }, { "epoch": 2.9604015616285553, "grad_norm": 0.08610813319683075, "learning_rate": 0.00012795189299637483, "loss": 0.0283, "step": 1990 }, { "epoch": 2.975274214538018, "grad_norm": 0.08020433783531189, "learning_rate": 0.00012641170683714222, "loss": 0.0267, "step": 2000 }, { "epoch": 2.975274214538018, "eval_loss": 0.0338360071182251, "eval_runtime": 212.7237, "eval_samples_per_second": 22.475, "eval_steps_per_second": 22.475, "step": 2000 }, { "epoch": 2.990146867447481, "grad_norm": 0.06885667890310287, "learning_rate": 0.00012487406650428954, "loss": 0.0277, "step": 2010 }, { "epoch": 3.00539133667968, "grad_norm": 0.07658534497022629, "learning_rate": 0.00012333913795155053, "loss": 0.0251, "step": 2020 }, { "epoch": 3.020263989589143, "grad_norm": 0.06449634581804276, "learning_rate": 0.00012180708683998321, "loss": 0.0147, "step": 2030 }, { "epoch": 3.0351366424986055, "grad_norm": 0.06312290579080582, "learning_rate": 0.00012027807852009038, "loss": 0.0157, "step": 2040 }, { "epoch": 3.0500092954080684, "grad_norm": 0.07343071699142456, "learning_rate": 0.00011875227801397381, "loss": 0.0149, "step": 2050 }, { "epoch": 3.064881948317531, "grad_norm": 0.06489036977291107, "learning_rate": 0.00011722984999752392, "loss": 0.0155, "step": 2060 }, { "epoch": 3.079754601226994, "grad_norm": 0.06041651591658592, "learning_rate": 0.00011571095878264658, "loss": 0.0139, "step": 2070 }, { "epoch": 3.094627254136457, "grad_norm": 0.07048339396715164, "learning_rate": 0.00011419576829952933, "loss": 0.014, "step": 2080 }, { "epoch": 3.1094999070459193, "grad_norm": 0.05680292099714279, "learning_rate": 0.00011268444207894902, "loss": 0.0133, "step": 2090 }, { "epoch": 3.124372559955382, "grad_norm": 0.0727318823337555, "learning_rate": 0.00011117714323462186, "loss": 0.0147, "step": 2100 }, { "epoch": 3.1392452128648447, "grad_norm": 0.054686855524778366, "learning_rate": 0.00010967403444559963, "loss": 0.0143, "step": 2110 }, { "epoch": 3.1541178657743076, "grad_norm": 0.05729954317212105, "learning_rate": 0.00010817527793871143, "loss": 0.0134, "step": 2120 }, { "epoch": 3.16899051868377, "grad_norm": 0.08314567804336548, "learning_rate": 0.00010668103547105553, "loss": 0.0148, "step": 2130 }, { "epoch": 3.183863171593233, "grad_norm": 0.05523039028048515, "learning_rate": 0.00010519146831254088, "loss": 0.0129, "step": 2140 }, { "epoch": 3.1987358245026956, "grad_norm": 0.05546917766332626, "learning_rate": 0.00010370673722848183, "loss": 0.0139, "step": 2150 }, { "epoch": 3.2136084774121585, "grad_norm": 0.05486704409122467, "learning_rate": 0.00010222700246224735, "loss": 0.0135, "step": 2160 }, { "epoch": 3.228481130321621, "grad_norm": 0.05656208097934723, "learning_rate": 0.00010075242371796585, "loss": 0.0125, "step": 2170 }, { "epoch": 3.243353783231084, "grad_norm": 0.053801827132701874, "learning_rate": 9.928316014328916e-05, "loss": 0.0141, "step": 2180 }, { "epoch": 3.2582264361405464, "grad_norm": 0.061040911823511124, "learning_rate": 9.781937031221589e-05, "loss": 0.0136, "step": 2190 }, { "epoch": 3.2730990890500093, "grad_norm": 0.05558522418141365, "learning_rate": 9.636121220797707e-05, "loss": 0.0138, "step": 2200 }, { "epoch": 3.287971741959472, "grad_norm": 0.055547308176755905, "learning_rate": 9.490884320598516e-05, "loss": 0.0136, "step": 2210 }, { "epoch": 3.3028443948689348, "grad_norm": 0.061592597514390945, "learning_rate": 9.34624200568492e-05, "loss": 0.014, "step": 2220 }, { "epoch": 3.3177170477783973, "grad_norm": 0.05287894979119301, "learning_rate": 9.202209886945698e-05, "loss": 0.0125, "step": 2230 }, { "epoch": 3.33258970068786, "grad_norm": 0.06365808844566345, "learning_rate": 9.058803509412646e-05, "loss": 0.0139, "step": 2240 }, { "epoch": 3.347462353597323, "grad_norm": 0.05474059656262398, "learning_rate": 8.916038350582876e-05, "loss": 0.0141, "step": 2250 }, { "epoch": 3.3623350065067856, "grad_norm": 0.054872963577508926, "learning_rate": 8.773929818748315e-05, "loss": 0.0135, "step": 2260 }, { "epoch": 3.3772076594162486, "grad_norm": 0.05935963988304138, "learning_rate": 8.632493251332793e-05, "loss": 0.0128, "step": 2270 }, { "epoch": 3.392080312325711, "grad_norm": 0.06830602139234543, "learning_rate": 8.491743913236628e-05, "loss": 0.0133, "step": 2280 }, { "epoch": 3.406952965235174, "grad_norm": 0.057178862392902374, "learning_rate": 8.351696995189218e-05, "loss": 0.0121, "step": 2290 }, { "epoch": 3.4218256181446365, "grad_norm": 0.06827449798583984, "learning_rate": 8.212367612109464e-05, "loss": 0.0127, "step": 2300 }, { "epoch": 3.4366982710540994, "grad_norm": 0.04981634393334389, "learning_rate": 8.073770801474495e-05, "loss": 0.0132, "step": 2310 }, { "epoch": 3.451570923963562, "grad_norm": 0.052124422043561935, "learning_rate": 7.935921521696702e-05, "loss": 0.0129, "step": 2320 }, { "epoch": 3.466443576873025, "grad_norm": 0.05991722270846367, "learning_rate": 7.798834650509306e-05, "loss": 0.0128, "step": 2330 }, { "epoch": 3.4813162297824873, "grad_norm": 0.05946414917707443, "learning_rate": 7.662524983360665e-05, "loss": 0.0127, "step": 2340 }, { "epoch": 3.4961888826919503, "grad_norm": 0.05650801584124565, "learning_rate": 7.527007231817389e-05, "loss": 0.0127, "step": 2350 }, { "epoch": 3.5110615356014128, "grad_norm": 0.04841410368680954, "learning_rate": 7.392296021976614e-05, "loss": 0.0122, "step": 2360 }, { "epoch": 3.5259341885108757, "grad_norm": 0.05933946743607521, "learning_rate": 7.258405892887398e-05, "loss": 0.0121, "step": 2370 }, { "epoch": 3.540806841420338, "grad_norm": 0.05451497435569763, "learning_rate": 7.125351294981598e-05, "loss": 0.0127, "step": 2380 }, { "epoch": 3.555679494329801, "grad_norm": 0.05574881285429001, "learning_rate": 6.993146588514225e-05, "loss": 0.0124, "step": 2390 }, { "epoch": 3.5705521472392636, "grad_norm": 0.057919006794691086, "learning_rate": 6.86180604201361e-05, "loss": 0.0119, "step": 2400 }, { "epoch": 3.5854248001487266, "grad_norm": 0.051368821412324905, "learning_rate": 6.731343830741433e-05, "loss": 0.0126, "step": 2410 }, { "epoch": 3.6002974530581895, "grad_norm": 0.06351654976606369, "learning_rate": 6.6017740351628e-05, "loss": 0.0135, "step": 2420 }, { "epoch": 3.615170105967652, "grad_norm": 0.053709421306848526, "learning_rate": 6.473110639426616e-05, "loss": 0.0122, "step": 2430 }, { "epoch": 3.6300427588771145, "grad_norm": 0.061445701867341995, "learning_rate": 6.345367529856254e-05, "loss": 0.0132, "step": 2440 }, { "epoch": 3.6449154117865774, "grad_norm": 0.0678747370839119, "learning_rate": 6.218558493450893e-05, "loss": 0.0125, "step": 2450 }, { "epoch": 3.6597880646960403, "grad_norm": 0.05095114931464195, "learning_rate": 6.0926972163974775e-05, "loss": 0.012, "step": 2460 }, { "epoch": 3.674660717605503, "grad_norm": 0.05740583688020706, "learning_rate": 5.9677972825936254e-05, "loss": 0.0125, "step": 2470 }, { "epoch": 3.6895333705149658, "grad_norm": 0.05399662256240845, "learning_rate": 5.8438721721815536e-05, "loss": 0.0134, "step": 2480 }, { "epoch": 3.7044060234244283, "grad_norm": 0.056056030094623566, "learning_rate": 5.720935260093177e-05, "loss": 0.0125, "step": 2490 }, { "epoch": 3.719278676333891, "grad_norm": 0.046866290271282196, "learning_rate": 5.598999814606618e-05, "loss": 0.0118, "step": 2500 }, { "epoch": 3.719278676333891, "eval_loss": 0.031009526923298836, "eval_runtime": 212.3923, "eval_samples_per_second": 22.51, "eval_steps_per_second": 22.51, "step": 2500 }, { "epoch": 3.7341513292433537, "grad_norm": 0.046400755643844604, "learning_rate": 5.4780789959141524e-05, "loss": 0.0122, "step": 2510 }, { "epoch": 3.7490239821528166, "grad_norm": 0.05211547762155533, "learning_rate": 5.358185854701909e-05, "loss": 0.0122, "step": 2520 }, { "epoch": 3.763896635062279, "grad_norm": 0.0429752878844738, "learning_rate": 5.239333330741298e-05, "loss": 0.0124, "step": 2530 }, { "epoch": 3.778769287971742, "grad_norm": 0.05008607730269432, "learning_rate": 5.121534251492486e-05, "loss": 0.0125, "step": 2540 }, { "epoch": 3.7936419408812045, "grad_norm": 0.046397943049669266, "learning_rate": 5.004801330719941e-05, "loss": 0.0111, "step": 2550 }, { "epoch": 3.8085145937906675, "grad_norm": 0.05960022658109665, "learning_rate": 4.8891471671202675e-05, "loss": 0.0117, "step": 2560 }, { "epoch": 3.82338724670013, "grad_norm": 0.04353282228112221, "learning_rate": 4.7745842429624795e-05, "loss": 0.0119, "step": 2570 }, { "epoch": 3.838259899609593, "grad_norm": 0.05057670921087265, "learning_rate": 4.661124922740794e-05, "loss": 0.0116, "step": 2580 }, { "epoch": 3.853132552519056, "grad_norm": 0.04886782541871071, "learning_rate": 4.548781451840179e-05, "loss": 0.0113, "step": 2590 }, { "epoch": 3.8680052054285183, "grad_norm": 0.055182382464408875, "learning_rate": 4.437565955214723e-05, "loss": 0.0116, "step": 2600 }, { "epoch": 3.882877858337981, "grad_norm": 0.048834457993507385, "learning_rate": 4.3274904360790505e-05, "loss": 0.0121, "step": 2610 }, { "epoch": 3.8977505112474438, "grad_norm": 0.05025951564311981, "learning_rate": 4.218566774612802e-05, "loss": 0.0112, "step": 2620 }, { "epoch": 3.9126231641569067, "grad_norm": 0.05054251477122307, "learning_rate": 4.1108067266784746e-05, "loss": 0.0112, "step": 2630 }, { "epoch": 3.927495817066369, "grad_norm": 0.05326022952795029, "learning_rate": 4.004221922552608e-05, "loss": 0.0119, "step": 2640 }, { "epoch": 3.9423684699758317, "grad_norm": 0.05668502673506737, "learning_rate": 3.898823865670579e-05, "loss": 0.0114, "step": 2650 }, { "epoch": 3.9572411228852946, "grad_norm": 0.054235439747571945, "learning_rate": 3.794623931385062e-05, "loss": 0.0119, "step": 2660 }, { "epoch": 3.9721137757947576, "grad_norm": 0.05231969431042671, "learning_rate": 3.6916333657383024e-05, "loss": 0.0108, "step": 2670 }, { "epoch": 3.98698642870422, "grad_norm": 0.057500049471855164, "learning_rate": 3.5898632842483746e-05, "loss": 0.011, "step": 2680 }, { "epoch": 4.002230897936419, "grad_norm": 0.04203633964061737, "learning_rate": 3.489324670709494e-05, "loss": 0.0113, "step": 2690 }, { "epoch": 4.017103550845882, "grad_norm": 0.029648838564753532, "learning_rate": 3.390028376006589e-05, "loss": 0.0059, "step": 2700 }, { "epoch": 4.031976203755345, "grad_norm": 0.03779765963554382, "learning_rate": 3.2919851169441625e-05, "loss": 0.006, "step": 2710 }, { "epoch": 4.046848856664807, "grad_norm": 0.040116600692272186, "learning_rate": 3.195205475089667e-05, "loss": 0.0058, "step": 2720 }, { "epoch": 4.06172150957427, "grad_norm": 0.030058899894356728, "learning_rate": 3.099699895631474e-05, "loss": 0.0056, "step": 2730 }, { "epoch": 4.076594162483733, "grad_norm": 0.03675166517496109, "learning_rate": 3.0054786862515257e-05, "loss": 0.0058, "step": 2740 }, { "epoch": 4.091466815393196, "grad_norm": 0.03470413759350777, "learning_rate": 2.912552016012879e-05, "loss": 0.0057, "step": 2750 }, { "epoch": 4.106339468302658, "grad_norm": 0.03222460299730301, "learning_rate": 2.8209299142621522e-05, "loss": 0.0057, "step": 2760 }, { "epoch": 4.121212121212121, "grad_norm": 0.036458127200603485, "learning_rate": 2.7306222695471173e-05, "loss": 0.0056, "step": 2770 }, { "epoch": 4.136084774121584, "grad_norm": 0.035760316997766495, "learning_rate": 2.641638828549425e-05, "loss": 0.0055, "step": 2780 }, { "epoch": 4.150957427031047, "grad_norm": 0.04281270503997803, "learning_rate": 2.5539891950326875e-05, "loss": 0.0056, "step": 2790 }, { "epoch": 4.165830079940509, "grad_norm": 0.030339548364281654, "learning_rate": 2.4676828288059558e-05, "loss": 0.0057, "step": 2800 }, { "epoch": 4.180702732849972, "grad_norm": 0.03753247857093811, "learning_rate": 2.382729044702748e-05, "loss": 0.0058, "step": 2810 }, { "epoch": 4.195575385759435, "grad_norm": 0.035988811403512955, "learning_rate": 2.299137011575738e-05, "loss": 0.0055, "step": 2820 }, { "epoch": 4.210448038668898, "grad_norm": 0.0344134196639061, "learning_rate": 2.2169157513071566e-05, "loss": 0.0057, "step": 2830 }, { "epoch": 4.22532069157836, "grad_norm": 0.03696177527308464, "learning_rate": 2.136074137835107e-05, "loss": 0.0056, "step": 2840 }, { "epoch": 4.240193344487823, "grad_norm": 0.03733756020665169, "learning_rate": 2.056620896195804e-05, "loss": 0.0057, "step": 2850 }, { "epoch": 4.255065997397286, "grad_norm": 0.03630942478775978, "learning_rate": 1.978564601581919e-05, "loss": 0.0056, "step": 2860 }, { "epoch": 4.269938650306749, "grad_norm": 0.03577449545264244, "learning_rate": 1.9019136784170635e-05, "loss": 0.0055, "step": 2870 }, { "epoch": 4.284811303216211, "grad_norm": 0.03209745138883591, "learning_rate": 1.82667639944657e-05, "loss": 0.0054, "step": 2880 }, { "epoch": 4.299683956125674, "grad_norm": 0.03668665885925293, "learning_rate": 1.752860884844646e-05, "loss": 0.0055, "step": 2890 }, { "epoch": 4.314556609035137, "grad_norm": 0.03498975560069084, "learning_rate": 1.680475101337959e-05, "loss": 0.0055, "step": 2900 }, { "epoch": 4.3294292619445995, "grad_norm": 0.04088146984577179, "learning_rate": 1.60952686134583e-05, "loss": 0.0055, "step": 2910 }, { "epoch": 4.3443019148540625, "grad_norm": 0.035557616502046585, "learning_rate": 1.5400238221370413e-05, "loss": 0.0056, "step": 2920 }, { "epoch": 4.3591745677635245, "grad_norm": 0.03443196415901184, "learning_rate": 1.4719734850034277e-05, "loss": 0.0056, "step": 2930 }, { "epoch": 4.3740472206729875, "grad_norm": 0.03481742739677429, "learning_rate": 1.4053831944502508e-05, "loss": 0.0057, "step": 2940 }, { "epoch": 4.38891987358245, "grad_norm": 0.03648516163229942, "learning_rate": 1.340260137403557e-05, "loss": 0.0053, "step": 2950 }, { "epoch": 4.403792526491913, "grad_norm": 0.03400832787156105, "learning_rate": 1.2766113424344814e-05, "loss": 0.0055, "step": 2960 }, { "epoch": 4.418665179401375, "grad_norm": 0.03558880090713501, "learning_rate": 1.21444367900069e-05, "loss": 0.0055, "step": 2970 }, { "epoch": 4.433537832310838, "grad_norm": 0.035319775342941284, "learning_rate": 1.1537638567049729e-05, "loss": 0.0055, "step": 2980 }, { "epoch": 4.448410485220301, "grad_norm": 0.03432595729827881, "learning_rate": 1.0945784245710848e-05, "loss": 0.0054, "step": 2990 }, { "epoch": 4.463283138129764, "grad_norm": 0.03571225702762604, "learning_rate": 1.036893770336938e-05, "loss": 0.0055, "step": 3000 }, { "epoch": 4.463283138129764, "eval_loss": 0.03200867399573326, "eval_runtime": 212.5457, "eval_samples_per_second": 22.494, "eval_steps_per_second": 22.494, "step": 3000 }, { "epoch": 4.478155791039226, "grad_norm": 0.040391724556684494, "learning_rate": 9.807161197651742e-06, "loss": 0.0056, "step": 3010 }, { "epoch": 4.493028443948689, "grad_norm": 0.03410281240940094, "learning_rate": 9.260515359712517e-06, "loss": 0.0055, "step": 3020 }, { "epoch": 4.507901096858152, "grad_norm": 0.03447275608778, "learning_rate": 8.729059187690479e-06, "loss": 0.0054, "step": 3030 }, { "epoch": 4.522773749767615, "grad_norm": 0.032652657479047775, "learning_rate": 8.212850040341273e-06, "loss": 0.0055, "step": 3040 }, { "epoch": 4.537646402677078, "grad_norm": 0.035828616470098495, "learning_rate": 7.711943630846684e-06, "loss": 0.0053, "step": 3050 }, { "epoch": 4.55251905558654, "grad_norm": 0.03351854532957077, "learning_rate": 7.226394020801645e-06, "loss": 0.0054, "step": 3060 }, { "epoch": 4.567391708496003, "grad_norm": 0.03872072696685791, "learning_rate": 6.7562536143796254e-06, "loss": 0.0056, "step": 3070 }, { "epoch": 4.582264361405466, "grad_norm": 0.03518550843000412, "learning_rate": 6.301573152676664e-06, "loss": 0.0054, "step": 3080 }, { "epoch": 4.597137014314928, "grad_norm": 0.0351685993373394, "learning_rate": 5.862401708235076e-06, "loss": 0.0052, "step": 3090 }, { "epoch": 4.612009667224391, "grad_norm": 0.0348668210208416, "learning_rate": 5.438786679747081e-06, "loss": 0.0055, "step": 3100 }, { "epoch": 4.626882320133854, "grad_norm": 0.03660331293940544, "learning_rate": 5.030773786939319e-06, "loss": 0.0055, "step": 3110 }, { "epoch": 4.641754973043317, "grad_norm": 0.04046601429581642, "learning_rate": 4.638407065638322e-06, "loss": 0.0054, "step": 3120 }, { "epoch": 4.65662762595278, "grad_norm": 0.03230154886841774, "learning_rate": 4.261728863017827e-06, "loss": 0.0054, "step": 3130 }, { "epoch": 4.671500278862242, "grad_norm": 0.034297142177820206, "learning_rate": 3.900779833028472e-06, "loss": 0.0054, "step": 3140 }, { "epoch": 4.686372931771705, "grad_norm": 0.03240946680307388, "learning_rate": 3.5555989320099952e-06, "loss": 0.0053, "step": 3150 }, { "epoch": 4.701245584681168, "grad_norm": 0.04137023165822029, "learning_rate": 3.2262234144868116e-06, "loss": 0.0054, "step": 3160 }, { "epoch": 4.7161182375906305, "grad_norm": 0.030783316120505333, "learning_rate": 2.912688829147214e-06, "loss": 0.0052, "step": 3170 }, { "epoch": 4.730990890500093, "grad_norm": 0.03588159382343292, "learning_rate": 2.6150290150067588e-06, "loss": 0.0055, "step": 3180 }, { "epoch": 4.7458635434095555, "grad_norm": 0.03300805762410164, "learning_rate": 2.3332760977559873e-06, "loss": 0.0053, "step": 3190 }, { "epoch": 4.7607361963190185, "grad_norm": 0.03986676409840584, "learning_rate": 2.0674604862932654e-06, "loss": 0.0055, "step": 3200 }, { "epoch": 4.775608849228481, "grad_norm": 0.03252493590116501, "learning_rate": 1.8176108694427927e-06, "loss": 0.0052, "step": 3210 }, { "epoch": 4.790481502137943, "grad_norm": 0.03938417136669159, "learning_rate": 1.583754212858329e-06, "loss": 0.0054, "step": 3220 }, { "epoch": 4.805354155047406, "grad_norm": 0.03552339971065521, "learning_rate": 1.3659157561127732e-06, "loss": 0.0057, "step": 3230 }, { "epoch": 4.820226807956869, "grad_norm": 0.03480495885014534, "learning_rate": 1.1641190099741904e-06, "loss": 0.0053, "step": 3240 }, { "epoch": 4.835099460866332, "grad_norm": 0.03451026231050491, "learning_rate": 9.783857538683603e-07, "loss": 0.0053, "step": 3250 }, { "epoch": 4.849972113775795, "grad_norm": 0.033308371901512146, "learning_rate": 8.087360335281235e-07, "loss": 0.0055, "step": 3260 }, { "epoch": 4.864844766685257, "grad_norm": 0.035610370337963104, "learning_rate": 6.551881588299279e-07, "loss": 0.0054, "step": 3270 }, { "epoch": 4.87971741959472, "grad_norm": 0.030910024419426918, "learning_rate": 5.177587018176777e-07, "loss": 0.0054, "step": 3280 }, { "epoch": 4.894590072504183, "grad_norm": 0.034942276775836945, "learning_rate": 3.964624949141626e-07, "loss": 0.0054, "step": 3290 }, { "epoch": 4.909462725413646, "grad_norm": 0.03491232544183731, "learning_rate": 2.913126293202228e-07, "loss": 0.0053, "step": 3300 }, { "epoch": 4.924335378323108, "grad_norm": 0.0331818163394928, "learning_rate": 2.0232045360184523e-07, "loss": 0.0051, "step": 3310 }, { "epoch": 4.939208031232571, "grad_norm": 0.034393060952425, "learning_rate": 1.2949557246537678e-07, "loss": 0.0053, "step": 3320 }, { "epoch": 4.954080684142034, "grad_norm": 0.03940508887171745, "learning_rate": 7.284584572085361e-08, "loss": 0.0052, "step": 3330 }, { "epoch": 4.968953337051497, "grad_norm": 0.03125544637441635, "learning_rate": 3.237738743372964e-08, "loss": 0.0052, "step": 3340 }, { "epoch": 4.983825989960959, "grad_norm": 0.03558258339762688, "learning_rate": 8.094565265054365e-09, "loss": 0.0054, "step": 3350 }, { "epoch": 4.998698642870422, "grad_norm": 0.03360743075609207, "learning_rate": 0.0, "loss": 0.0054, "step": 3360 }, { "epoch": 4.998698642870422, "step": 3360, "total_flos": 5.14290499398402e+18, "train_loss": 0.19145491501161208, "train_runtime": 31931.328, "train_samples_per_second": 6.737, "train_steps_per_second": 0.105 } ], "logging_steps": 10, "max_steps": 3360, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 5.14290499398402e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }