{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 492, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.006097560975609756, "grad_norm": 1.763520359992981, "learning_rate": 0.0001, "loss": 1.1695, "step": 1 }, { "epoch": 0.012195121951219513, "grad_norm": 1.4942116737365723, "learning_rate": 0.0001, "loss": 1.0553, "step": 2 }, { "epoch": 0.018292682926829267, "grad_norm": 1.345508337020874, "learning_rate": 0.0001, "loss": 1.0218, "step": 3 }, { "epoch": 0.024390243902439025, "grad_norm": 1.123505711555481, "learning_rate": 0.0001, "loss": 0.9164, "step": 4 }, { "epoch": 0.03048780487804878, "grad_norm": 0.823020875453949, "learning_rate": 0.0001, "loss": 0.9038, "step": 5 }, { "epoch": 0.036585365853658534, "grad_norm": 0.763806939125061, "learning_rate": 0.0001, "loss": 0.9062, "step": 6 }, { "epoch": 0.042682926829268296, "grad_norm": 0.7193121314048767, "learning_rate": 0.0001, "loss": 0.8466, "step": 7 }, { "epoch": 0.04878048780487805, "grad_norm": 0.7080236077308655, "learning_rate": 0.0001, "loss": 0.82, "step": 8 }, { "epoch": 0.054878048780487805, "grad_norm": 0.6981502175331116, "learning_rate": 0.0001, "loss": 0.8546, "step": 9 }, { "epoch": 0.06097560975609756, "grad_norm": 0.7174396514892578, "learning_rate": 0.0001, "loss": 0.8438, "step": 10 }, { "epoch": 0.06707317073170732, "grad_norm": 0.6729642152786255, "learning_rate": 0.0001, "loss": 0.8285, "step": 11 }, { "epoch": 0.07317073170731707, "grad_norm": 0.6757375001907349, "learning_rate": 0.0001, "loss": 0.8068, "step": 12 }, { "epoch": 0.07926829268292683, "grad_norm": 0.6743811368942261, "learning_rate": 0.0001, "loss": 0.7853, "step": 13 }, { "epoch": 0.08536585365853659, "grad_norm": 0.6855434775352478, "learning_rate": 0.0001, "loss": 0.7305, "step": 14 }, { "epoch": 0.09146341463414634, "grad_norm": 0.7576789259910583, "learning_rate": 0.0001, "loss": 0.7894, "step": 15 }, { "epoch": 0.0975609756097561, "grad_norm": 0.6285218000411987, "learning_rate": 0.0001, "loss": 0.7821, "step": 16 }, { "epoch": 0.10365853658536585, "grad_norm": 0.6224460005760193, "learning_rate": 0.0001, "loss": 0.806, "step": 17 }, { "epoch": 0.10975609756097561, "grad_norm": 0.6526975631713867, "learning_rate": 0.0001, "loss": 0.7725, "step": 18 }, { "epoch": 0.11585365853658537, "grad_norm": 0.7162805795669556, "learning_rate": 0.0001, "loss": 0.7637, "step": 19 }, { "epoch": 0.12195121951219512, "grad_norm": 0.6594821214675903, "learning_rate": 0.0001, "loss": 0.7953, "step": 20 }, { "epoch": 0.12804878048780488, "grad_norm": 0.6285718679428101, "learning_rate": 0.0001, "loss": 0.7673, "step": 21 }, { "epoch": 0.13414634146341464, "grad_norm": 0.6275126338005066, "learning_rate": 0.0001, "loss": 0.7639, "step": 22 }, { "epoch": 0.1402439024390244, "grad_norm": 0.6683803200721741, "learning_rate": 0.0001, "loss": 0.7598, "step": 23 }, { "epoch": 0.14634146341463414, "grad_norm": 0.6154472231864929, "learning_rate": 0.0001, "loss": 0.7243, "step": 24 }, { "epoch": 0.1524390243902439, "grad_norm": 0.6709151864051819, "learning_rate": 0.0001, "loss": 0.7869, "step": 25 }, { "epoch": 0.15853658536585366, "grad_norm": 0.6176601648330688, "learning_rate": 0.0001, "loss": 0.7181, "step": 26 }, { "epoch": 0.16463414634146342, "grad_norm": 0.5965794324874878, "learning_rate": 0.0001, "loss": 0.6955, "step": 27 }, { "epoch": 0.17073170731707318, "grad_norm": 0.5950392484664917, "learning_rate": 0.0001, "loss": 0.6866, "step": 28 }, { "epoch": 0.17682926829268292, "grad_norm": 0.5902345180511475, "learning_rate": 0.0001, "loss": 0.7488, "step": 29 }, { "epoch": 0.18292682926829268, "grad_norm": 0.5966442227363586, "learning_rate": 0.0001, "loss": 0.766, "step": 30 }, { "epoch": 0.18902439024390244, "grad_norm": 0.6065996289253235, "learning_rate": 0.0001, "loss": 0.7602, "step": 31 }, { "epoch": 0.1951219512195122, "grad_norm": 0.6001562476158142, "learning_rate": 0.0001, "loss": 0.7307, "step": 32 }, { "epoch": 0.20121951219512196, "grad_norm": 0.5457689166069031, "learning_rate": 0.0001, "loss": 0.7117, "step": 33 }, { "epoch": 0.2073170731707317, "grad_norm": 0.5943721532821655, "learning_rate": 0.0001, "loss": 0.7419, "step": 34 }, { "epoch": 0.21341463414634146, "grad_norm": 0.5822892785072327, "learning_rate": 0.0001, "loss": 0.7199, "step": 35 }, { "epoch": 0.21951219512195122, "grad_norm": 0.5900689959526062, "learning_rate": 0.0001, "loss": 0.7001, "step": 36 }, { "epoch": 0.22560975609756098, "grad_norm": 0.6492246389389038, "learning_rate": 0.0001, "loss": 0.7024, "step": 37 }, { "epoch": 0.23170731707317074, "grad_norm": 0.5830572247505188, "learning_rate": 0.0001, "loss": 0.7311, "step": 38 }, { "epoch": 0.23780487804878048, "grad_norm": 0.6123290061950684, "learning_rate": 0.0001, "loss": 0.7593, "step": 39 }, { "epoch": 0.24390243902439024, "grad_norm": 0.6116678714752197, "learning_rate": 0.0001, "loss": 0.7079, "step": 40 }, { "epoch": 0.25, "grad_norm": 0.6134564876556396, "learning_rate": 0.0001, "loss": 0.7426, "step": 41 }, { "epoch": 0.25609756097560976, "grad_norm": 0.5981906652450562, "learning_rate": 0.0001, "loss": 0.7207, "step": 42 }, { "epoch": 0.2621951219512195, "grad_norm": 0.6197260022163391, "learning_rate": 0.0001, "loss": 0.743, "step": 43 }, { "epoch": 0.2682926829268293, "grad_norm": 0.5889937877655029, "learning_rate": 0.0001, "loss": 0.7111, "step": 44 }, { "epoch": 0.27439024390243905, "grad_norm": 0.5781967639923096, "learning_rate": 0.0001, "loss": 0.7088, "step": 45 }, { "epoch": 0.2804878048780488, "grad_norm": 0.5735342502593994, "learning_rate": 0.0001, "loss": 0.7264, "step": 46 }, { "epoch": 0.2865853658536585, "grad_norm": 0.6068210005760193, "learning_rate": 0.0001, "loss": 0.7217, "step": 47 }, { "epoch": 0.2926829268292683, "grad_norm": 0.584036648273468, "learning_rate": 0.0001, "loss": 0.6998, "step": 48 }, { "epoch": 0.29878048780487804, "grad_norm": 0.5738788843154907, "learning_rate": 0.0001, "loss": 0.6662, "step": 49 }, { "epoch": 0.3048780487804878, "grad_norm": 0.5746581554412842, "learning_rate": 0.0001, "loss": 0.6702, "step": 50 }, { "epoch": 0.31097560975609756, "grad_norm": 0.5572565793991089, "learning_rate": 0.0001, "loss": 0.6766, "step": 51 }, { "epoch": 0.3170731707317073, "grad_norm": 0.6274172067642212, "learning_rate": 0.0001, "loss": 0.7509, "step": 52 }, { "epoch": 0.3231707317073171, "grad_norm": 0.5426685810089111, "learning_rate": 0.0001, "loss": 0.7065, "step": 53 }, { "epoch": 0.32926829268292684, "grad_norm": 0.5456064343452454, "learning_rate": 0.0001, "loss": 0.6069, "step": 54 }, { "epoch": 0.3353658536585366, "grad_norm": 0.5641257762908936, "learning_rate": 0.0001, "loss": 0.6862, "step": 55 }, { "epoch": 0.34146341463414637, "grad_norm": 0.5878000259399414, "learning_rate": 0.0001, "loss": 0.6827, "step": 56 }, { "epoch": 0.3475609756097561, "grad_norm": 0.5976933240890503, "learning_rate": 0.0001, "loss": 0.6838, "step": 57 }, { "epoch": 0.35365853658536583, "grad_norm": 0.5872485041618347, "learning_rate": 0.0001, "loss": 0.7017, "step": 58 }, { "epoch": 0.3597560975609756, "grad_norm": 0.5930238366127014, "learning_rate": 0.0001, "loss": 0.6735, "step": 59 }, { "epoch": 0.36585365853658536, "grad_norm": 0.5682117342948914, "learning_rate": 0.0001, "loss": 0.6686, "step": 60 }, { "epoch": 0.3719512195121951, "grad_norm": 0.5660499334335327, "learning_rate": 0.0001, "loss": 0.6579, "step": 61 }, { "epoch": 0.3780487804878049, "grad_norm": 0.5715780854225159, "learning_rate": 0.0001, "loss": 0.7127, "step": 62 }, { "epoch": 0.38414634146341464, "grad_norm": 0.5816344022750854, "learning_rate": 0.0001, "loss": 0.7158, "step": 63 }, { "epoch": 0.3902439024390244, "grad_norm": 0.5610223412513733, "learning_rate": 0.0001, "loss": 0.6527, "step": 64 }, { "epoch": 0.39634146341463417, "grad_norm": 0.5489451885223389, "learning_rate": 0.0001, "loss": 0.6902, "step": 65 }, { "epoch": 0.4024390243902439, "grad_norm": 0.5633963942527771, "learning_rate": 0.0001, "loss": 0.729, "step": 66 }, { "epoch": 0.40853658536585363, "grad_norm": 0.5687914490699768, "learning_rate": 0.0001, "loss": 0.6558, "step": 67 }, { "epoch": 0.4146341463414634, "grad_norm": 0.5886531472206116, "learning_rate": 0.0001, "loss": 0.7309, "step": 68 }, { "epoch": 0.42073170731707316, "grad_norm": 0.546073853969574, "learning_rate": 0.0001, "loss": 0.6625, "step": 69 }, { "epoch": 0.4268292682926829, "grad_norm": 0.5979751348495483, "learning_rate": 0.0001, "loss": 0.6942, "step": 70 }, { "epoch": 0.4329268292682927, "grad_norm": 0.5324491858482361, "learning_rate": 0.0001, "loss": 0.6274, "step": 71 }, { "epoch": 0.43902439024390244, "grad_norm": 0.6174746751785278, "learning_rate": 0.0001, "loss": 0.7073, "step": 72 }, { "epoch": 0.4451219512195122, "grad_norm": 0.5845648646354675, "learning_rate": 0.0001, "loss": 0.6871, "step": 73 }, { "epoch": 0.45121951219512196, "grad_norm": 0.5905411839485168, "learning_rate": 0.0001, "loss": 0.6846, "step": 74 }, { "epoch": 0.4573170731707317, "grad_norm": 0.5970960855484009, "learning_rate": 0.0001, "loss": 0.6588, "step": 75 }, { "epoch": 0.4634146341463415, "grad_norm": 0.5933733582496643, "learning_rate": 0.0001, "loss": 0.678, "step": 76 }, { "epoch": 0.4695121951219512, "grad_norm": 0.5747849941253662, "learning_rate": 0.0001, "loss": 0.683, "step": 77 }, { "epoch": 0.47560975609756095, "grad_norm": 0.5409815311431885, "learning_rate": 0.0001, "loss": 0.6287, "step": 78 }, { "epoch": 0.4817073170731707, "grad_norm": 0.6004408001899719, "learning_rate": 0.0001, "loss": 0.6356, "step": 79 }, { "epoch": 0.4878048780487805, "grad_norm": 0.5724059343338013, "learning_rate": 0.0001, "loss": 0.6388, "step": 80 }, { "epoch": 0.49390243902439024, "grad_norm": 0.6099798083305359, "learning_rate": 0.0001, "loss": 0.7115, "step": 81 }, { "epoch": 0.5, "grad_norm": 0.5958842039108276, "learning_rate": 0.0001, "loss": 0.655, "step": 82 }, { "epoch": 0.5060975609756098, "grad_norm": 0.6181111335754395, "learning_rate": 0.0001, "loss": 0.6391, "step": 83 }, { "epoch": 0.5121951219512195, "grad_norm": 0.5894577503204346, "learning_rate": 0.0001, "loss": 0.6791, "step": 84 }, { "epoch": 0.5182926829268293, "grad_norm": 0.5830883979797363, "learning_rate": 0.0001, "loss": 0.6582, "step": 85 }, { "epoch": 0.524390243902439, "grad_norm": 0.5686275362968445, "learning_rate": 0.0001, "loss": 0.678, "step": 86 }, { "epoch": 0.5304878048780488, "grad_norm": 0.6119154095649719, "learning_rate": 0.0001, "loss": 0.6714, "step": 87 }, { "epoch": 0.5365853658536586, "grad_norm": 0.5826413035392761, "learning_rate": 0.0001, "loss": 0.6746, "step": 88 }, { "epoch": 0.5426829268292683, "grad_norm": 0.6128208041191101, "learning_rate": 0.0001, "loss": 0.6851, "step": 89 }, { "epoch": 0.5487804878048781, "grad_norm": 0.575299859046936, "learning_rate": 0.0001, "loss": 0.6439, "step": 90 }, { "epoch": 0.5548780487804879, "grad_norm": 0.6011075377464294, "learning_rate": 0.0001, "loss": 0.689, "step": 91 }, { "epoch": 0.5609756097560976, "grad_norm": 0.5696834325790405, "learning_rate": 0.0001, "loss": 0.688, "step": 92 }, { "epoch": 0.5670731707317073, "grad_norm": 0.5776868462562561, "learning_rate": 0.0001, "loss": 0.6237, "step": 93 }, { "epoch": 0.573170731707317, "grad_norm": 0.5697721242904663, "learning_rate": 0.0001, "loss": 0.6551, "step": 94 }, { "epoch": 0.5792682926829268, "grad_norm": 0.5542324185371399, "learning_rate": 0.0001, "loss": 0.615, "step": 95 }, { "epoch": 0.5853658536585366, "grad_norm": 0.5746421217918396, "learning_rate": 0.0001, "loss": 0.6814, "step": 96 }, { "epoch": 0.5914634146341463, "grad_norm": 0.5714977383613586, "learning_rate": 0.0001, "loss": 0.6698, "step": 97 }, { "epoch": 0.5975609756097561, "grad_norm": 0.5868296027183533, "learning_rate": 0.0001, "loss": 0.6482, "step": 98 }, { "epoch": 0.6036585365853658, "grad_norm": 0.5577363967895508, "learning_rate": 0.0001, "loss": 0.663, "step": 99 }, { "epoch": 0.6097560975609756, "grad_norm": 0.51622474193573, "learning_rate": 0.0001, "loss": 0.5813, "step": 100 }, { "epoch": 0.6158536585365854, "grad_norm": 0.5596529245376587, "learning_rate": 0.0001, "loss": 0.6157, "step": 101 }, { "epoch": 0.6219512195121951, "grad_norm": 0.585007905960083, "learning_rate": 0.0001, "loss": 0.6734, "step": 102 }, { "epoch": 0.6280487804878049, "grad_norm": 0.5682265758514404, "learning_rate": 0.0001, "loss": 0.6231, "step": 103 }, { "epoch": 0.6341463414634146, "grad_norm": 0.6157271265983582, "learning_rate": 0.0001, "loss": 0.6679, "step": 104 }, { "epoch": 0.6402439024390244, "grad_norm": 0.5796582698822021, "learning_rate": 0.0001, "loss": 0.6091, "step": 105 }, { "epoch": 0.6463414634146342, "grad_norm": 0.5919722318649292, "learning_rate": 0.0001, "loss": 0.6744, "step": 106 }, { "epoch": 0.6524390243902439, "grad_norm": 0.5803415775299072, "learning_rate": 0.0001, "loss": 0.6316, "step": 107 }, { "epoch": 0.6585365853658537, "grad_norm": 0.5573592782020569, "learning_rate": 0.0001, "loss": 0.6028, "step": 108 }, { "epoch": 0.6646341463414634, "grad_norm": 0.5864866375923157, "learning_rate": 0.0001, "loss": 0.6442, "step": 109 }, { "epoch": 0.6707317073170732, "grad_norm": 0.5456053018569946, "learning_rate": 0.0001, "loss": 0.6233, "step": 110 }, { "epoch": 0.676829268292683, "grad_norm": 0.575710654258728, "learning_rate": 0.0001, "loss": 0.6303, "step": 111 }, { "epoch": 0.6829268292682927, "grad_norm": 0.6122698783874512, "learning_rate": 0.0001, "loss": 0.6676, "step": 112 }, { "epoch": 0.6890243902439024, "grad_norm": 0.5976404547691345, "learning_rate": 0.0001, "loss": 0.6533, "step": 113 }, { "epoch": 0.6951219512195121, "grad_norm": 0.6462607979774475, "learning_rate": 0.0001, "loss": 0.7024, "step": 114 }, { "epoch": 0.7012195121951219, "grad_norm": 0.5650457143783569, "learning_rate": 0.0001, "loss": 0.6667, "step": 115 }, { "epoch": 0.7073170731707317, "grad_norm": 0.5858912467956543, "learning_rate": 0.0001, "loss": 0.6492, "step": 116 }, { "epoch": 0.7134146341463414, "grad_norm": 0.5636318325996399, "learning_rate": 0.0001, "loss": 0.6112, "step": 117 }, { "epoch": 0.7195121951219512, "grad_norm": 0.5599079728126526, "learning_rate": 0.0001, "loss": 0.6817, "step": 118 }, { "epoch": 0.725609756097561, "grad_norm": 0.551928699016571, "learning_rate": 0.0001, "loss": 0.6534, "step": 119 }, { "epoch": 0.7317073170731707, "grad_norm": 0.5585001707077026, "learning_rate": 0.0001, "loss": 0.6517, "step": 120 }, { "epoch": 0.7378048780487805, "grad_norm": 0.5939499139785767, "learning_rate": 0.0001, "loss": 0.637, "step": 121 }, { "epoch": 0.7439024390243902, "grad_norm": 0.6028351187705994, "learning_rate": 0.0001, "loss": 0.6497, "step": 122 }, { "epoch": 0.75, "grad_norm": 0.6053422689437866, "learning_rate": 0.0001, "loss": 0.6606, "step": 123 }, { "epoch": 0.7560975609756098, "grad_norm": 0.5626771450042725, "learning_rate": 0.0001, "loss": 0.6475, "step": 124 }, { "epoch": 0.7621951219512195, "grad_norm": 0.5561665892601013, "learning_rate": 0.0001, "loss": 0.6126, "step": 125 }, { "epoch": 0.7682926829268293, "grad_norm": 0.5361859202384949, "learning_rate": 0.0001, "loss": 0.6737, "step": 126 }, { "epoch": 0.774390243902439, "grad_norm": 0.5999827980995178, "learning_rate": 0.0001, "loss": 0.627, "step": 127 }, { "epoch": 0.7804878048780488, "grad_norm": 0.5717467665672302, "learning_rate": 0.0001, "loss": 0.7242, "step": 128 }, { "epoch": 0.7865853658536586, "grad_norm": 0.5655209422111511, "learning_rate": 0.0001, "loss": 0.6072, "step": 129 }, { "epoch": 0.7926829268292683, "grad_norm": 0.5843133926391602, "learning_rate": 0.0001, "loss": 0.6727, "step": 130 }, { "epoch": 0.7987804878048781, "grad_norm": 0.5787593722343445, "learning_rate": 0.0001, "loss": 0.6394, "step": 131 }, { "epoch": 0.8048780487804879, "grad_norm": 0.5661312341690063, "learning_rate": 0.0001, "loss": 0.6122, "step": 132 }, { "epoch": 0.8109756097560976, "grad_norm": 0.602393388748169, "learning_rate": 0.0001, "loss": 0.6193, "step": 133 }, { "epoch": 0.8170731707317073, "grad_norm": 0.630905032157898, "learning_rate": 0.0001, "loss": 0.6427, "step": 134 }, { "epoch": 0.823170731707317, "grad_norm": 0.6203592419624329, "learning_rate": 0.0001, "loss": 0.6491, "step": 135 }, { "epoch": 0.8292682926829268, "grad_norm": 0.5753608345985413, "learning_rate": 0.0001, "loss": 0.6295, "step": 136 }, { "epoch": 0.8353658536585366, "grad_norm": 0.5919385552406311, "learning_rate": 0.0001, "loss": 0.6262, "step": 137 }, { "epoch": 0.8414634146341463, "grad_norm": 0.564659833908081, "learning_rate": 0.0001, "loss": 0.6437, "step": 138 }, { "epoch": 0.8475609756097561, "grad_norm": 0.5595895648002625, "learning_rate": 0.0001, "loss": 0.628, "step": 139 }, { "epoch": 0.8536585365853658, "grad_norm": 0.5651856064796448, "learning_rate": 0.0001, "loss": 0.622, "step": 140 }, { "epoch": 0.8597560975609756, "grad_norm": 0.5735089778900146, "learning_rate": 0.0001, "loss": 0.6313, "step": 141 }, { "epoch": 0.8658536585365854, "grad_norm": 0.6084374189376831, "learning_rate": 0.0001, "loss": 0.6528, "step": 142 }, { "epoch": 0.8719512195121951, "grad_norm": 0.5673129558563232, "learning_rate": 0.0001, "loss": 0.6163, "step": 143 }, { "epoch": 0.8780487804878049, "grad_norm": 0.5617730021476746, "learning_rate": 0.0001, "loss": 0.6397, "step": 144 }, { "epoch": 0.8841463414634146, "grad_norm": 0.5928285121917725, "learning_rate": 0.0001, "loss": 0.64, "step": 145 }, { "epoch": 0.8902439024390244, "grad_norm": 0.5878246426582336, "learning_rate": 0.0001, "loss": 0.6691, "step": 146 }, { "epoch": 0.8963414634146342, "grad_norm": 0.5934311747550964, "learning_rate": 0.0001, "loss": 0.6325, "step": 147 }, { "epoch": 0.9024390243902439, "grad_norm": 0.5465561151504517, "learning_rate": 0.0001, "loss": 0.663, "step": 148 }, { "epoch": 0.9085365853658537, "grad_norm": 0.5870200991630554, "learning_rate": 0.0001, "loss": 0.6104, "step": 149 }, { "epoch": 0.9146341463414634, "grad_norm": 0.6161399483680725, "learning_rate": 0.0001, "loss": 0.6553, "step": 150 }, { "epoch": 0.9207317073170732, "grad_norm": 0.5733305811882019, "learning_rate": 0.0001, "loss": 0.6167, "step": 151 }, { "epoch": 0.926829268292683, "grad_norm": 0.595331072807312, "learning_rate": 0.0001, "loss": 0.6594, "step": 152 }, { "epoch": 0.9329268292682927, "grad_norm": 0.5634722709655762, "learning_rate": 0.0001, "loss": 0.6435, "step": 153 }, { "epoch": 0.9390243902439024, "grad_norm": 0.5649352073669434, "learning_rate": 0.0001, "loss": 0.6338, "step": 154 }, { "epoch": 0.9451219512195121, "grad_norm": 0.5804089903831482, "learning_rate": 0.0001, "loss": 0.6151, "step": 155 }, { "epoch": 0.9512195121951219, "grad_norm": 0.5910571217536926, "learning_rate": 0.0001, "loss": 0.6083, "step": 156 }, { "epoch": 0.9573170731707317, "grad_norm": 0.6512947082519531, "learning_rate": 0.0001, "loss": 0.652, "step": 157 }, { "epoch": 0.9634146341463414, "grad_norm": 0.6277866363525391, "learning_rate": 0.0001, "loss": 0.6363, "step": 158 }, { "epoch": 0.9695121951219512, "grad_norm": 0.5870842933654785, "learning_rate": 0.0001, "loss": 0.6417, "step": 159 }, { "epoch": 0.975609756097561, "grad_norm": 0.546256422996521, "learning_rate": 0.0001, "loss": 0.5957, "step": 160 }, { "epoch": 0.9817073170731707, "grad_norm": 0.5940456390380859, "learning_rate": 0.0001, "loss": 0.5774, "step": 161 }, { "epoch": 0.9878048780487805, "grad_norm": 0.5390895009040833, "learning_rate": 0.0001, "loss": 0.6131, "step": 162 }, { "epoch": 0.9939024390243902, "grad_norm": 0.5646426677703857, "learning_rate": 0.0001, "loss": 0.6247, "step": 163 }, { "epoch": 1.0, "grad_norm": 0.5933319330215454, "learning_rate": 0.0001, "loss": 0.6107, "step": 164 }, { "epoch": 1.0060975609756098, "grad_norm": 0.5555415749549866, "learning_rate": 0.0001, "loss": 0.5038, "step": 165 }, { "epoch": 1.0121951219512195, "grad_norm": 0.5714491605758667, "learning_rate": 0.0001, "loss": 0.5403, "step": 166 }, { "epoch": 1.0182926829268293, "grad_norm": 0.6099926829338074, "learning_rate": 0.0001, "loss": 0.4943, "step": 167 }, { "epoch": 1.024390243902439, "grad_norm": 0.7038013339042664, "learning_rate": 0.0001, "loss": 0.4801, "step": 168 }, { "epoch": 1.0304878048780488, "grad_norm": 0.6525987982749939, "learning_rate": 0.0001, "loss": 0.499, "step": 169 }, { "epoch": 1.0365853658536586, "grad_norm": 0.5772536396980286, "learning_rate": 0.0001, "loss": 0.4899, "step": 170 }, { "epoch": 1.0426829268292683, "grad_norm": 0.5953510999679565, "learning_rate": 0.0001, "loss": 0.5343, "step": 171 }, { "epoch": 1.048780487804878, "grad_norm": 0.579450786113739, "learning_rate": 0.0001, "loss": 0.5222, "step": 172 }, { "epoch": 1.0548780487804879, "grad_norm": 0.5960140228271484, "learning_rate": 0.0001, "loss": 0.4936, "step": 173 }, { "epoch": 1.0609756097560976, "grad_norm": 0.5782721042633057, "learning_rate": 0.0001, "loss": 0.487, "step": 174 }, { "epoch": 1.0670731707317074, "grad_norm": 0.6194652318954468, "learning_rate": 0.0001, "loss": 0.5045, "step": 175 }, { "epoch": 1.0731707317073171, "grad_norm": 0.7137989401817322, "learning_rate": 0.0001, "loss": 0.5206, "step": 176 }, { "epoch": 1.079268292682927, "grad_norm": 0.6591524481773376, "learning_rate": 0.0001, "loss": 0.5203, "step": 177 }, { "epoch": 1.0853658536585367, "grad_norm": 0.5615283846855164, "learning_rate": 0.0001, "loss": 0.4845, "step": 178 }, { "epoch": 1.0914634146341464, "grad_norm": 0.5729933381080627, "learning_rate": 0.0001, "loss": 0.5166, "step": 179 }, { "epoch": 1.0975609756097562, "grad_norm": 0.5670926570892334, "learning_rate": 0.0001, "loss": 0.5311, "step": 180 }, { "epoch": 1.103658536585366, "grad_norm": 0.5750375390052795, "learning_rate": 0.0001, "loss": 0.4739, "step": 181 }, { "epoch": 1.1097560975609757, "grad_norm": 0.5616285800933838, "learning_rate": 0.0001, "loss": 0.513, "step": 182 }, { "epoch": 1.1158536585365855, "grad_norm": 0.6150811910629272, "learning_rate": 0.0001, "loss": 0.53, "step": 183 }, { "epoch": 1.1219512195121952, "grad_norm": 0.6283072233200073, "learning_rate": 0.0001, "loss": 0.5099, "step": 184 }, { "epoch": 1.1280487804878048, "grad_norm": 0.5622886419296265, "learning_rate": 0.0001, "loss": 0.4663, "step": 185 }, { "epoch": 1.1341463414634148, "grad_norm": 0.6202870607376099, "learning_rate": 0.0001, "loss": 0.5113, "step": 186 }, { "epoch": 1.1402439024390243, "grad_norm": 0.5678901672363281, "learning_rate": 0.0001, "loss": 0.4595, "step": 187 }, { "epoch": 1.146341463414634, "grad_norm": 0.6146119832992554, "learning_rate": 0.0001, "loss": 0.5248, "step": 188 }, { "epoch": 1.1524390243902438, "grad_norm": 0.5726969838142395, "learning_rate": 0.0001, "loss": 0.5016, "step": 189 }, { "epoch": 1.1585365853658536, "grad_norm": 0.5848289132118225, "learning_rate": 0.0001, "loss": 0.5236, "step": 190 }, { "epoch": 1.1646341463414633, "grad_norm": 0.598795473575592, "learning_rate": 0.0001, "loss": 0.5444, "step": 191 }, { "epoch": 1.170731707317073, "grad_norm": 0.5984260439872742, "learning_rate": 0.0001, "loss": 0.5291, "step": 192 }, { "epoch": 1.1768292682926829, "grad_norm": 0.5640114545822144, "learning_rate": 0.0001, "loss": 0.5366, "step": 193 }, { "epoch": 1.1829268292682926, "grad_norm": 0.5771395564079285, "learning_rate": 0.0001, "loss": 0.519, "step": 194 }, { "epoch": 1.1890243902439024, "grad_norm": 0.5926110744476318, "learning_rate": 0.0001, "loss": 0.4945, "step": 195 }, { "epoch": 1.1951219512195121, "grad_norm": 0.6406283974647522, "learning_rate": 0.0001, "loss": 0.5313, "step": 196 }, { "epoch": 1.201219512195122, "grad_norm": 0.5671162009239197, "learning_rate": 0.0001, "loss": 0.4971, "step": 197 }, { "epoch": 1.2073170731707317, "grad_norm": 0.5952590703964233, "learning_rate": 0.0001, "loss": 0.4886, "step": 198 }, { "epoch": 1.2134146341463414, "grad_norm": 0.6368497014045715, "learning_rate": 0.0001, "loss": 0.4984, "step": 199 }, { "epoch": 1.2195121951219512, "grad_norm": 0.6427241563796997, "learning_rate": 0.0001, "loss": 0.5201, "step": 200 }, { "epoch": 1.225609756097561, "grad_norm": 0.5814225673675537, "learning_rate": 0.0001, "loss": 0.5021, "step": 201 }, { "epoch": 1.2317073170731707, "grad_norm": 0.5985032916069031, "learning_rate": 0.0001, "loss": 0.4969, "step": 202 }, { "epoch": 1.2378048780487805, "grad_norm": 0.5723533630371094, "learning_rate": 0.0001, "loss": 0.485, "step": 203 }, { "epoch": 1.2439024390243902, "grad_norm": 0.598479688167572, "learning_rate": 0.0001, "loss": 0.496, "step": 204 }, { "epoch": 1.25, "grad_norm": 0.6005733013153076, "learning_rate": 0.0001, "loss": 0.4746, "step": 205 }, { "epoch": 1.2560975609756098, "grad_norm": 0.630957841873169, "learning_rate": 0.0001, "loss": 0.5069, "step": 206 }, { "epoch": 1.2621951219512195, "grad_norm": 0.6369969248771667, "learning_rate": 0.0001, "loss": 0.4869, "step": 207 }, { "epoch": 1.2682926829268293, "grad_norm": 0.6387524008750916, "learning_rate": 0.0001, "loss": 0.5133, "step": 208 }, { "epoch": 1.274390243902439, "grad_norm": 0.6263754367828369, "learning_rate": 0.0001, "loss": 0.5444, "step": 209 }, { "epoch": 1.2804878048780488, "grad_norm": 0.557532012462616, "learning_rate": 0.0001, "loss": 0.4726, "step": 210 }, { "epoch": 1.2865853658536586, "grad_norm": 0.576702892780304, "learning_rate": 0.0001, "loss": 0.5325, "step": 211 }, { "epoch": 1.2926829268292683, "grad_norm": 0.6313229203224182, "learning_rate": 0.0001, "loss": 0.5044, "step": 212 }, { "epoch": 1.298780487804878, "grad_norm": 0.625912070274353, "learning_rate": 0.0001, "loss": 0.5381, "step": 213 }, { "epoch": 1.3048780487804879, "grad_norm": 0.6148139238357544, "learning_rate": 0.0001, "loss": 0.4934, "step": 214 }, { "epoch": 1.3109756097560976, "grad_norm": 0.6258604526519775, "learning_rate": 0.0001, "loss": 0.5239, "step": 215 }, { "epoch": 1.3170731707317074, "grad_norm": 0.6130456924438477, "learning_rate": 0.0001, "loss": 0.5014, "step": 216 }, { "epoch": 1.3231707317073171, "grad_norm": 0.606001615524292, "learning_rate": 0.0001, "loss": 0.5201, "step": 217 }, { "epoch": 1.329268292682927, "grad_norm": 0.5635973215103149, "learning_rate": 0.0001, "loss": 0.4932, "step": 218 }, { "epoch": 1.3353658536585367, "grad_norm": 0.5979434251785278, "learning_rate": 0.0001, "loss": 0.5142, "step": 219 }, { "epoch": 1.3414634146341464, "grad_norm": 0.5663168430328369, "learning_rate": 0.0001, "loss": 0.524, "step": 220 }, { "epoch": 1.3475609756097562, "grad_norm": 0.6072438955307007, "learning_rate": 0.0001, "loss": 0.4997, "step": 221 }, { "epoch": 1.3536585365853657, "grad_norm": 0.601750373840332, "learning_rate": 0.0001, "loss": 0.4946, "step": 222 }, { "epoch": 1.3597560975609757, "grad_norm": 0.6556447744369507, "learning_rate": 0.0001, "loss": 0.5114, "step": 223 }, { "epoch": 1.3658536585365852, "grad_norm": 0.6329565048217773, "learning_rate": 0.0001, "loss": 0.512, "step": 224 }, { "epoch": 1.3719512195121952, "grad_norm": 0.6002699136734009, "learning_rate": 0.0001, "loss": 0.494, "step": 225 }, { "epoch": 1.3780487804878048, "grad_norm": 0.6447397470474243, "learning_rate": 0.0001, "loss": 0.548, "step": 226 }, { "epoch": 1.3841463414634148, "grad_norm": 0.5840697288513184, "learning_rate": 0.0001, "loss": 0.5177, "step": 227 }, { "epoch": 1.3902439024390243, "grad_norm": 0.5911181569099426, "learning_rate": 0.0001, "loss": 0.5183, "step": 228 }, { "epoch": 1.3963414634146343, "grad_norm": 0.6022722125053406, "learning_rate": 0.0001, "loss": 0.476, "step": 229 }, { "epoch": 1.4024390243902438, "grad_norm": 0.5788743495941162, "learning_rate": 0.0001, "loss": 0.5109, "step": 230 }, { "epoch": 1.4085365853658536, "grad_norm": 0.5945917963981628, "learning_rate": 0.0001, "loss": 0.4869, "step": 231 }, { "epoch": 1.4146341463414633, "grad_norm": 0.638956606388092, "learning_rate": 0.0001, "loss": 0.53, "step": 232 }, { "epoch": 1.420731707317073, "grad_norm": 0.6204885840415955, "learning_rate": 0.0001, "loss": 0.5205, "step": 233 }, { "epoch": 1.4268292682926829, "grad_norm": 0.5931024551391602, "learning_rate": 0.0001, "loss": 0.5167, "step": 234 }, { "epoch": 1.4329268292682926, "grad_norm": 0.5996592044830322, "learning_rate": 0.0001, "loss": 0.4935, "step": 235 }, { "epoch": 1.4390243902439024, "grad_norm": 0.6242860555648804, "learning_rate": 0.0001, "loss": 0.5047, "step": 236 }, { "epoch": 1.4451219512195121, "grad_norm": 0.5914901494979858, "learning_rate": 0.0001, "loss": 0.5092, "step": 237 }, { "epoch": 1.451219512195122, "grad_norm": 0.6710638999938965, "learning_rate": 0.0001, "loss": 0.5437, "step": 238 }, { "epoch": 1.4573170731707317, "grad_norm": 0.6554276347160339, "learning_rate": 0.0001, "loss": 0.4906, "step": 239 }, { "epoch": 1.4634146341463414, "grad_norm": 0.6532212495803833, "learning_rate": 0.0001, "loss": 0.5508, "step": 240 }, { "epoch": 1.4695121951219512, "grad_norm": 0.5957479476928711, "learning_rate": 0.0001, "loss": 0.4902, "step": 241 }, { "epoch": 1.475609756097561, "grad_norm": 0.5946776270866394, "learning_rate": 0.0001, "loss": 0.5085, "step": 242 }, { "epoch": 1.4817073170731707, "grad_norm": 0.5819572806358337, "learning_rate": 0.0001, "loss": 0.4819, "step": 243 }, { "epoch": 1.4878048780487805, "grad_norm": 0.6151570081710815, "learning_rate": 0.0001, "loss": 0.5058, "step": 244 }, { "epoch": 1.4939024390243902, "grad_norm": 0.6580333709716797, "learning_rate": 0.0001, "loss": 0.506, "step": 245 }, { "epoch": 1.5, "grad_norm": 0.6214548945426941, "learning_rate": 0.0001, "loss": 0.4739, "step": 246 }, { "epoch": 1.5060975609756098, "grad_norm": 0.6240037083625793, "learning_rate": 0.0001, "loss": 0.4898, "step": 247 }, { "epoch": 1.5121951219512195, "grad_norm": 0.6115790605545044, "learning_rate": 0.0001, "loss": 0.5143, "step": 248 }, { "epoch": 1.5182926829268293, "grad_norm": 0.5654324293136597, "learning_rate": 0.0001, "loss": 0.4409, "step": 249 }, { "epoch": 1.524390243902439, "grad_norm": 0.5737196207046509, "learning_rate": 0.0001, "loss": 0.4936, "step": 250 }, { "epoch": 1.5304878048780488, "grad_norm": 0.6084273457527161, "learning_rate": 0.0001, "loss": 0.5182, "step": 251 }, { "epoch": 1.5365853658536586, "grad_norm": 0.5695486664772034, "learning_rate": 0.0001, "loss": 0.4857, "step": 252 }, { "epoch": 1.5426829268292683, "grad_norm": 0.5693416595458984, "learning_rate": 0.0001, "loss": 0.5028, "step": 253 }, { "epoch": 1.548780487804878, "grad_norm": 0.5976539850234985, "learning_rate": 0.0001, "loss": 0.492, "step": 254 }, { "epoch": 1.5548780487804879, "grad_norm": 0.6122463941574097, "learning_rate": 0.0001, "loss": 0.5412, "step": 255 }, { "epoch": 1.5609756097560976, "grad_norm": 0.5977299213409424, "learning_rate": 0.0001, "loss": 0.5173, "step": 256 }, { "epoch": 1.5670731707317072, "grad_norm": 0.5926475524902344, "learning_rate": 0.0001, "loss": 0.5037, "step": 257 }, { "epoch": 1.5731707317073171, "grad_norm": 0.5920047163963318, "learning_rate": 0.0001, "loss": 0.4779, "step": 258 }, { "epoch": 1.5792682926829267, "grad_norm": 0.5987219214439392, "learning_rate": 0.0001, "loss": 0.5132, "step": 259 }, { "epoch": 1.5853658536585367, "grad_norm": 0.5943930149078369, "learning_rate": 0.0001, "loss": 0.4938, "step": 260 }, { "epoch": 1.5914634146341462, "grad_norm": 0.6259720921516418, "learning_rate": 0.0001, "loss": 0.5295, "step": 261 }, { "epoch": 1.5975609756097562, "grad_norm": 0.6168601512908936, "learning_rate": 0.0001, "loss": 0.4633, "step": 262 }, { "epoch": 1.6036585365853657, "grad_norm": 0.6057328581809998, "learning_rate": 0.0001, "loss": 0.5074, "step": 263 }, { "epoch": 1.6097560975609757, "grad_norm": 0.607790470123291, "learning_rate": 0.0001, "loss": 0.5068, "step": 264 }, { "epoch": 1.6158536585365852, "grad_norm": 0.5669077634811401, "learning_rate": 0.0001, "loss": 0.4578, "step": 265 }, { "epoch": 1.6219512195121952, "grad_norm": 0.58953458070755, "learning_rate": 0.0001, "loss": 0.512, "step": 266 }, { "epoch": 1.6280487804878048, "grad_norm": 0.6138054728507996, "learning_rate": 0.0001, "loss": 0.5035, "step": 267 }, { "epoch": 1.6341463414634148, "grad_norm": 0.6316951513290405, "learning_rate": 0.0001, "loss": 0.5374, "step": 268 }, { "epoch": 1.6402439024390243, "grad_norm": 0.5779020190238953, "learning_rate": 0.0001, "loss": 0.4934, "step": 269 }, { "epoch": 1.6463414634146343, "grad_norm": 0.6008270978927612, "learning_rate": 0.0001, "loss": 0.4628, "step": 270 }, { "epoch": 1.6524390243902438, "grad_norm": 0.5894110202789307, "learning_rate": 0.0001, "loss": 0.5109, "step": 271 }, { "epoch": 1.6585365853658538, "grad_norm": 0.5894849896430969, "learning_rate": 0.0001, "loss": 0.4861, "step": 272 }, { "epoch": 1.6646341463414633, "grad_norm": 0.6085466146469116, "learning_rate": 0.0001, "loss": 0.5101, "step": 273 }, { "epoch": 1.6707317073170733, "grad_norm": 0.6503622531890869, "learning_rate": 0.0001, "loss": 0.5508, "step": 274 }, { "epoch": 1.6768292682926829, "grad_norm": 0.6089245676994324, "learning_rate": 0.0001, "loss": 0.4911, "step": 275 }, { "epoch": 1.6829268292682928, "grad_norm": 0.6388260126113892, "learning_rate": 0.0001, "loss": 0.5165, "step": 276 }, { "epoch": 1.6890243902439024, "grad_norm": 0.6048246622085571, "learning_rate": 0.0001, "loss": 0.5405, "step": 277 }, { "epoch": 1.6951219512195121, "grad_norm": 0.5887222290039062, "learning_rate": 0.0001, "loss": 0.5205, "step": 278 }, { "epoch": 1.701219512195122, "grad_norm": 0.6097093820571899, "learning_rate": 0.0001, "loss": 0.5139, "step": 279 }, { "epoch": 1.7073170731707317, "grad_norm": 0.5547489523887634, "learning_rate": 0.0001, "loss": 0.4915, "step": 280 }, { "epoch": 1.7134146341463414, "grad_norm": 0.6122882962226868, "learning_rate": 0.0001, "loss": 0.493, "step": 281 }, { "epoch": 1.7195121951219512, "grad_norm": 0.6592060923576355, "learning_rate": 0.0001, "loss": 0.5314, "step": 282 }, { "epoch": 1.725609756097561, "grad_norm": 0.6154331564903259, "learning_rate": 0.0001, "loss": 0.5025, "step": 283 }, { "epoch": 1.7317073170731707, "grad_norm": 0.5997411608695984, "learning_rate": 0.0001, "loss": 0.5057, "step": 284 }, { "epoch": 1.7378048780487805, "grad_norm": 0.615349292755127, "learning_rate": 0.0001, "loss": 0.5195, "step": 285 }, { "epoch": 1.7439024390243902, "grad_norm": 0.6155688762664795, "learning_rate": 0.0001, "loss": 0.5194, "step": 286 }, { "epoch": 1.75, "grad_norm": 0.5677372217178345, "learning_rate": 0.0001, "loss": 0.5433, "step": 287 }, { "epoch": 1.7560975609756098, "grad_norm": 0.5937820672988892, "learning_rate": 0.0001, "loss": 0.5269, "step": 288 }, { "epoch": 1.7621951219512195, "grad_norm": 0.5868131518363953, "learning_rate": 0.0001, "loss": 0.535, "step": 289 }, { "epoch": 1.7682926829268293, "grad_norm": 0.6256383061408997, "learning_rate": 0.0001, "loss": 0.5196, "step": 290 }, { "epoch": 1.774390243902439, "grad_norm": 0.6187792420387268, "learning_rate": 0.0001, "loss": 0.5027, "step": 291 }, { "epoch": 1.7804878048780488, "grad_norm": 0.6260528564453125, "learning_rate": 0.0001, "loss": 0.5437, "step": 292 }, { "epoch": 1.7865853658536586, "grad_norm": 0.5868582129478455, "learning_rate": 0.0001, "loss": 0.5133, "step": 293 }, { "epoch": 1.7926829268292683, "grad_norm": 0.6079871654510498, "learning_rate": 0.0001, "loss": 0.5102, "step": 294 }, { "epoch": 1.798780487804878, "grad_norm": 0.5693763494491577, "learning_rate": 0.0001, "loss": 0.4933, "step": 295 }, { "epoch": 1.8048780487804879, "grad_norm": 0.6394689679145813, "learning_rate": 0.0001, "loss": 0.5452, "step": 296 }, { "epoch": 1.8109756097560976, "grad_norm": 0.6318659782409668, "learning_rate": 0.0001, "loss": 0.5391, "step": 297 }, { "epoch": 1.8170731707317072, "grad_norm": 0.5786278247833252, "learning_rate": 0.0001, "loss": 0.5129, "step": 298 }, { "epoch": 1.8231707317073171, "grad_norm": 0.6378489136695862, "learning_rate": 0.0001, "loss": 0.4935, "step": 299 }, { "epoch": 1.8292682926829267, "grad_norm": 0.637844979763031, "learning_rate": 0.0001, "loss": 0.5057, "step": 300 }, { "epoch": 1.8353658536585367, "grad_norm": 0.6403583288192749, "learning_rate": 0.0001, "loss": 0.542, "step": 301 }, { "epoch": 1.8414634146341462, "grad_norm": 0.6149348616600037, "learning_rate": 0.0001, "loss": 0.5108, "step": 302 }, { "epoch": 1.8475609756097562, "grad_norm": 0.5945342779159546, "learning_rate": 0.0001, "loss": 0.496, "step": 303 }, { "epoch": 1.8536585365853657, "grad_norm": 0.6346225142478943, "learning_rate": 0.0001, "loss": 0.5463, "step": 304 }, { "epoch": 1.8597560975609757, "grad_norm": 0.590212881565094, "learning_rate": 0.0001, "loss": 0.5126, "step": 305 }, { "epoch": 1.8658536585365852, "grad_norm": 0.5924628973007202, "learning_rate": 0.0001, "loss": 0.5096, "step": 306 }, { "epoch": 1.8719512195121952, "grad_norm": 0.6342692375183105, "learning_rate": 0.0001, "loss": 0.5063, "step": 307 }, { "epoch": 1.8780487804878048, "grad_norm": 0.6688621640205383, "learning_rate": 0.0001, "loss": 0.5534, "step": 308 }, { "epoch": 1.8841463414634148, "grad_norm": 0.628839910030365, "learning_rate": 0.0001, "loss": 0.4975, "step": 309 }, { "epoch": 1.8902439024390243, "grad_norm": 0.6141210794448853, "learning_rate": 0.0001, "loss": 0.4777, "step": 310 }, { "epoch": 1.8963414634146343, "grad_norm": 0.6270496845245361, "learning_rate": 0.0001, "loss": 0.5019, "step": 311 }, { "epoch": 1.9024390243902438, "grad_norm": 0.5861090421676636, "learning_rate": 0.0001, "loss": 0.5066, "step": 312 }, { "epoch": 1.9085365853658538, "grad_norm": 0.5715667009353638, "learning_rate": 0.0001, "loss": 0.4766, "step": 313 }, { "epoch": 1.9146341463414633, "grad_norm": 0.6288326978683472, "learning_rate": 0.0001, "loss": 0.5152, "step": 314 }, { "epoch": 1.9207317073170733, "grad_norm": 0.5759385228157043, "learning_rate": 0.0001, "loss": 0.51, "step": 315 }, { "epoch": 1.9268292682926829, "grad_norm": 0.6145620346069336, "learning_rate": 0.0001, "loss": 0.5104, "step": 316 }, { "epoch": 1.9329268292682928, "grad_norm": 0.6138148903846741, "learning_rate": 0.0001, "loss": 0.4967, "step": 317 }, { "epoch": 1.9390243902439024, "grad_norm": 0.6269311308860779, "learning_rate": 0.0001, "loss": 0.5479, "step": 318 }, { "epoch": 1.9451219512195121, "grad_norm": 0.6406437754631042, "learning_rate": 0.0001, "loss": 0.5199, "step": 319 }, { "epoch": 1.951219512195122, "grad_norm": 0.5639004707336426, "learning_rate": 0.0001, "loss": 0.4852, "step": 320 }, { "epoch": 1.9573170731707317, "grad_norm": 0.5929526090621948, "learning_rate": 0.0001, "loss": 0.5253, "step": 321 }, { "epoch": 1.9634146341463414, "grad_norm": 0.59356689453125, "learning_rate": 0.0001, "loss": 0.5094, "step": 322 }, { "epoch": 1.9695121951219512, "grad_norm": 0.6183592677116394, "learning_rate": 0.0001, "loss": 0.495, "step": 323 }, { "epoch": 1.975609756097561, "grad_norm": 0.5988680720329285, "learning_rate": 0.0001, "loss": 0.5017, "step": 324 }, { "epoch": 1.9817073170731707, "grad_norm": 0.6253383159637451, "learning_rate": 0.0001, "loss": 0.5101, "step": 325 }, { "epoch": 1.9878048780487805, "grad_norm": 0.6147765517234802, "learning_rate": 0.0001, "loss": 0.4952, "step": 326 }, { "epoch": 1.9939024390243902, "grad_norm": 0.6041817665100098, "learning_rate": 0.0001, "loss": 0.5042, "step": 327 }, { "epoch": 2.0, "grad_norm": 0.5927252769470215, "learning_rate": 0.0001, "loss": 0.4996, "step": 328 }, { "epoch": 2.0060975609756095, "grad_norm": 0.6218935251235962, "learning_rate": 0.0001, "loss": 0.4171, "step": 329 }, { "epoch": 2.0121951219512195, "grad_norm": 0.5569261312484741, "learning_rate": 0.0001, "loss": 0.3905, "step": 330 }, { "epoch": 2.018292682926829, "grad_norm": 0.5948651432991028, "learning_rate": 0.0001, "loss": 0.3704, "step": 331 }, { "epoch": 2.024390243902439, "grad_norm": 0.6893870830535889, "learning_rate": 0.0001, "loss": 0.3446, "step": 332 }, { "epoch": 2.0304878048780486, "grad_norm": 0.6298575401306152, "learning_rate": 0.0001, "loss": 0.3657, "step": 333 }, { "epoch": 2.0365853658536586, "grad_norm": 0.6463242173194885, "learning_rate": 0.0001, "loss": 0.3752, "step": 334 }, { "epoch": 2.042682926829268, "grad_norm": 0.6220399141311646, "learning_rate": 0.0001, "loss": 0.4133, "step": 335 }, { "epoch": 2.048780487804878, "grad_norm": 0.6175084710121155, "learning_rate": 0.0001, "loss": 0.3856, "step": 336 }, { "epoch": 2.0548780487804876, "grad_norm": 0.5709812641143799, "learning_rate": 0.0001, "loss": 0.3791, "step": 337 }, { "epoch": 2.0609756097560976, "grad_norm": 0.5842687487602234, "learning_rate": 0.0001, "loss": 0.3981, "step": 338 }, { "epoch": 2.067073170731707, "grad_norm": 0.5711541771888733, "learning_rate": 0.0001, "loss": 0.3463, "step": 339 }, { "epoch": 2.073170731707317, "grad_norm": 0.6160522103309631, "learning_rate": 0.0001, "loss": 0.3579, "step": 340 }, { "epoch": 2.0792682926829267, "grad_norm": 0.6163449287414551, "learning_rate": 0.0001, "loss": 0.3651, "step": 341 }, { "epoch": 2.0853658536585367, "grad_norm": 0.6386067271232605, "learning_rate": 0.0001, "loss": 0.4165, "step": 342 }, { "epoch": 2.091463414634146, "grad_norm": 0.6074360609054565, "learning_rate": 0.0001, "loss": 0.383, "step": 343 }, { "epoch": 2.097560975609756, "grad_norm": 0.5862374305725098, "learning_rate": 0.0001, "loss": 0.3658, "step": 344 }, { "epoch": 2.1036585365853657, "grad_norm": 0.5639402270317078, "learning_rate": 0.0001, "loss": 0.3708, "step": 345 }, { "epoch": 2.1097560975609757, "grad_norm": 0.5674434304237366, "learning_rate": 0.0001, "loss": 0.376, "step": 346 }, { "epoch": 2.1158536585365852, "grad_norm": 0.641013503074646, "learning_rate": 0.0001, "loss": 0.3898, "step": 347 }, { "epoch": 2.1219512195121952, "grad_norm": 0.6373003125190735, "learning_rate": 0.0001, "loss": 0.3998, "step": 348 }, { "epoch": 2.1280487804878048, "grad_norm": 0.6026149392127991, "learning_rate": 0.0001, "loss": 0.3419, "step": 349 }, { "epoch": 2.1341463414634148, "grad_norm": 0.5974167585372925, "learning_rate": 0.0001, "loss": 0.3501, "step": 350 }, { "epoch": 2.1402439024390243, "grad_norm": 0.5709217190742493, "learning_rate": 0.0001, "loss": 0.4023, "step": 351 }, { "epoch": 2.1463414634146343, "grad_norm": 0.6201815605163574, "learning_rate": 0.0001, "loss": 0.3801, "step": 352 }, { "epoch": 2.152439024390244, "grad_norm": 0.5644124150276184, "learning_rate": 0.0001, "loss": 0.3536, "step": 353 }, { "epoch": 2.158536585365854, "grad_norm": 0.5843915343284607, "learning_rate": 0.0001, "loss": 0.367, "step": 354 }, { "epoch": 2.1646341463414633, "grad_norm": 0.6504707336425781, "learning_rate": 0.0001, "loss": 0.41, "step": 355 }, { "epoch": 2.1707317073170733, "grad_norm": 0.6272132396697998, "learning_rate": 0.0001, "loss": 0.3642, "step": 356 }, { "epoch": 2.176829268292683, "grad_norm": 0.6171401143074036, "learning_rate": 0.0001, "loss": 0.3709, "step": 357 }, { "epoch": 2.182926829268293, "grad_norm": 0.5451359748840332, "learning_rate": 0.0001, "loss": 0.3699, "step": 358 }, { "epoch": 2.1890243902439024, "grad_norm": 0.5557040572166443, "learning_rate": 0.0001, "loss": 0.3889, "step": 359 }, { "epoch": 2.1951219512195124, "grad_norm": 0.5514318943023682, "learning_rate": 0.0001, "loss": 0.3595, "step": 360 }, { "epoch": 2.201219512195122, "grad_norm": 0.6279582381248474, "learning_rate": 0.0001, "loss": 0.365, "step": 361 }, { "epoch": 2.207317073170732, "grad_norm": 0.6362396478652954, "learning_rate": 0.0001, "loss": 0.3676, "step": 362 }, { "epoch": 2.2134146341463414, "grad_norm": 0.6167373061180115, "learning_rate": 0.0001, "loss": 0.4047, "step": 363 }, { "epoch": 2.2195121951219514, "grad_norm": 0.5988054871559143, "learning_rate": 0.0001, "loss": 0.3866, "step": 364 }, { "epoch": 2.225609756097561, "grad_norm": 0.6260228753089905, "learning_rate": 0.0001, "loss": 0.3969, "step": 365 }, { "epoch": 2.231707317073171, "grad_norm": 0.5669357180595398, "learning_rate": 0.0001, "loss": 0.3624, "step": 366 }, { "epoch": 2.2378048780487805, "grad_norm": 0.5572336316108704, "learning_rate": 0.0001, "loss": 0.3802, "step": 367 }, { "epoch": 2.2439024390243905, "grad_norm": 0.577407956123352, "learning_rate": 0.0001, "loss": 0.3814, "step": 368 }, { "epoch": 2.25, "grad_norm": 0.5576046109199524, "learning_rate": 0.0001, "loss": 0.3529, "step": 369 }, { "epoch": 2.2560975609756095, "grad_norm": 0.5899252891540527, "learning_rate": 0.0001, "loss": 0.361, "step": 370 }, { "epoch": 2.2621951219512195, "grad_norm": 0.6026024222373962, "learning_rate": 0.0001, "loss": 0.3602, "step": 371 }, { "epoch": 2.2682926829268295, "grad_norm": 0.651066780090332, "learning_rate": 0.0001, "loss": 0.3646, "step": 372 }, { "epoch": 2.274390243902439, "grad_norm": 0.6255848407745361, "learning_rate": 0.0001, "loss": 0.3468, "step": 373 }, { "epoch": 2.2804878048780486, "grad_norm": 0.6624294519424438, "learning_rate": 0.0001, "loss": 0.3928, "step": 374 }, { "epoch": 2.2865853658536586, "grad_norm": 0.5514746308326721, "learning_rate": 0.0001, "loss": 0.374, "step": 375 }, { "epoch": 2.292682926829268, "grad_norm": 0.5865519642829895, "learning_rate": 0.0001, "loss": 0.387, "step": 376 }, { "epoch": 2.298780487804878, "grad_norm": 0.5901021957397461, "learning_rate": 0.0001, "loss": 0.3922, "step": 377 }, { "epoch": 2.3048780487804876, "grad_norm": 0.5819031000137329, "learning_rate": 0.0001, "loss": 0.3825, "step": 378 }, { "epoch": 2.3109756097560976, "grad_norm": 0.5795203447341919, "learning_rate": 0.0001, "loss": 0.3983, "step": 379 }, { "epoch": 2.317073170731707, "grad_norm": 0.5817603468894958, "learning_rate": 0.0001, "loss": 0.3892, "step": 380 }, { "epoch": 2.323170731707317, "grad_norm": 0.5905787348747253, "learning_rate": 0.0001, "loss": 0.3662, "step": 381 }, { "epoch": 2.3292682926829267, "grad_norm": 0.6160801649093628, "learning_rate": 0.0001, "loss": 0.382, "step": 382 }, { "epoch": 2.3353658536585367, "grad_norm": 0.6367721557617188, "learning_rate": 0.0001, "loss": 0.3684, "step": 383 }, { "epoch": 2.341463414634146, "grad_norm": 0.6236375570297241, "learning_rate": 0.0001, "loss": 0.3671, "step": 384 }, { "epoch": 2.347560975609756, "grad_norm": 0.5669872164726257, "learning_rate": 0.0001, "loss": 0.3634, "step": 385 }, { "epoch": 2.3536585365853657, "grad_norm": 0.5991116166114807, "learning_rate": 0.0001, "loss": 0.3628, "step": 386 }, { "epoch": 2.3597560975609757, "grad_norm": 0.5670086145401001, "learning_rate": 0.0001, "loss": 0.3635, "step": 387 }, { "epoch": 2.3658536585365852, "grad_norm": 0.629401683807373, "learning_rate": 0.0001, "loss": 0.3925, "step": 388 }, { "epoch": 2.3719512195121952, "grad_norm": 0.6248301267623901, "learning_rate": 0.0001, "loss": 0.3825, "step": 389 }, { "epoch": 2.3780487804878048, "grad_norm": 0.5823646187782288, "learning_rate": 0.0001, "loss": 0.3775, "step": 390 }, { "epoch": 2.3841463414634148, "grad_norm": 0.6670135855674744, "learning_rate": 0.0001, "loss": 0.3927, "step": 391 }, { "epoch": 2.3902439024390243, "grad_norm": 0.6390913128852844, "learning_rate": 0.0001, "loss": 0.4057, "step": 392 }, { "epoch": 2.3963414634146343, "grad_norm": 0.5848169922828674, "learning_rate": 0.0001, "loss": 0.3712, "step": 393 }, { "epoch": 2.402439024390244, "grad_norm": 0.5966094732284546, "learning_rate": 0.0001, "loss": 0.3713, "step": 394 }, { "epoch": 2.408536585365854, "grad_norm": 0.6144512891769409, "learning_rate": 0.0001, "loss": 0.3698, "step": 395 }, { "epoch": 2.4146341463414633, "grad_norm": 0.5988245010375977, "learning_rate": 0.0001, "loss": 0.3686, "step": 396 }, { "epoch": 2.4207317073170733, "grad_norm": 0.6109009981155396, "learning_rate": 0.0001, "loss": 0.3921, "step": 397 }, { "epoch": 2.426829268292683, "grad_norm": 0.6432120203971863, "learning_rate": 0.0001, "loss": 0.4231, "step": 398 }, { "epoch": 2.432926829268293, "grad_norm": 0.5902109742164612, "learning_rate": 0.0001, "loss": 0.3699, "step": 399 }, { "epoch": 2.4390243902439024, "grad_norm": 0.6081752777099609, "learning_rate": 0.0001, "loss": 0.3836, "step": 400 }, { "epoch": 2.4451219512195124, "grad_norm": 0.6146216988563538, "learning_rate": 0.0001, "loss": 0.3785, "step": 401 }, { "epoch": 2.451219512195122, "grad_norm": 0.6472842693328857, "learning_rate": 0.0001, "loss": 0.373, "step": 402 }, { "epoch": 2.457317073170732, "grad_norm": 0.60771644115448, "learning_rate": 0.0001, "loss": 0.3685, "step": 403 }, { "epoch": 2.4634146341463414, "grad_norm": 0.6457931995391846, "learning_rate": 0.0001, "loss": 0.3746, "step": 404 }, { "epoch": 2.4695121951219514, "grad_norm": 0.5895772576332092, "learning_rate": 0.0001, "loss": 0.3758, "step": 405 }, { "epoch": 2.475609756097561, "grad_norm": 0.6693524718284607, "learning_rate": 0.0001, "loss": 0.3904, "step": 406 }, { "epoch": 2.4817073170731705, "grad_norm": 0.6366068124771118, "learning_rate": 0.0001, "loss": 0.3923, "step": 407 }, { "epoch": 2.4878048780487805, "grad_norm": 0.6241960525512695, "learning_rate": 0.0001, "loss": 0.3559, "step": 408 }, { "epoch": 2.4939024390243905, "grad_norm": 0.6247851252555847, "learning_rate": 0.0001, "loss": 0.3881, "step": 409 }, { "epoch": 2.5, "grad_norm": 0.6421067714691162, "learning_rate": 0.0001, "loss": 0.4021, "step": 410 }, { "epoch": 2.5060975609756095, "grad_norm": 0.7222415804862976, "learning_rate": 0.0001, "loss": 0.391, "step": 411 }, { "epoch": 2.5121951219512195, "grad_norm": 0.6274811625480652, "learning_rate": 0.0001, "loss": 0.3817, "step": 412 }, { "epoch": 2.5182926829268295, "grad_norm": 0.5927621126174927, "learning_rate": 0.0001, "loss": 0.3595, "step": 413 }, { "epoch": 2.524390243902439, "grad_norm": 0.5889265537261963, "learning_rate": 0.0001, "loss": 0.3684, "step": 414 }, { "epoch": 2.5304878048780486, "grad_norm": 0.6477332711219788, "learning_rate": 0.0001, "loss": 0.4308, "step": 415 }, { "epoch": 2.5365853658536586, "grad_norm": 0.6162149906158447, "learning_rate": 0.0001, "loss": 0.4087, "step": 416 }, { "epoch": 2.5426829268292686, "grad_norm": 0.6609845757484436, "learning_rate": 0.0001, "loss": 0.4028, "step": 417 }, { "epoch": 2.548780487804878, "grad_norm": 0.6425780057907104, "learning_rate": 0.0001, "loss": 0.3832, "step": 418 }, { "epoch": 2.5548780487804876, "grad_norm": 0.6117408275604248, "learning_rate": 0.0001, "loss": 0.368, "step": 419 }, { "epoch": 2.5609756097560976, "grad_norm": 0.6596407890319824, "learning_rate": 0.0001, "loss": 0.3848, "step": 420 }, { "epoch": 2.567073170731707, "grad_norm": 0.6080613136291504, "learning_rate": 0.0001, "loss": 0.3862, "step": 421 }, { "epoch": 2.573170731707317, "grad_norm": 0.6160922646522522, "learning_rate": 0.0001, "loss": 0.3797, "step": 422 }, { "epoch": 2.5792682926829267, "grad_norm": 0.6346991658210754, "learning_rate": 0.0001, "loss": 0.3702, "step": 423 }, { "epoch": 2.5853658536585367, "grad_norm": 0.6169600486755371, "learning_rate": 0.0001, "loss": 0.3931, "step": 424 }, { "epoch": 2.591463414634146, "grad_norm": 0.6396271586418152, "learning_rate": 0.0001, "loss": 0.4133, "step": 425 }, { "epoch": 2.597560975609756, "grad_norm": 0.5953004360198975, "learning_rate": 0.0001, "loss": 0.3732, "step": 426 }, { "epoch": 2.6036585365853657, "grad_norm": 0.6704226732254028, "learning_rate": 0.0001, "loss": 0.3924, "step": 427 }, { "epoch": 2.6097560975609757, "grad_norm": 0.6755167245864868, "learning_rate": 0.0001, "loss": 0.3891, "step": 428 }, { "epoch": 2.6158536585365852, "grad_norm": 0.6189351677894592, "learning_rate": 0.0001, "loss": 0.4072, "step": 429 }, { "epoch": 2.6219512195121952, "grad_norm": 0.6409624218940735, "learning_rate": 0.0001, "loss": 0.382, "step": 430 }, { "epoch": 2.6280487804878048, "grad_norm": 0.629356324672699, "learning_rate": 0.0001, "loss": 0.3783, "step": 431 }, { "epoch": 2.6341463414634148, "grad_norm": 0.6259102821350098, "learning_rate": 0.0001, "loss": 0.3837, "step": 432 }, { "epoch": 2.6402439024390243, "grad_norm": 0.6589633822441101, "learning_rate": 0.0001, "loss": 0.3958, "step": 433 }, { "epoch": 2.6463414634146343, "grad_norm": 0.6646971702575684, "learning_rate": 0.0001, "loss": 0.3948, "step": 434 }, { "epoch": 2.652439024390244, "grad_norm": 0.6579565405845642, "learning_rate": 0.0001, "loss": 0.3749, "step": 435 }, { "epoch": 2.658536585365854, "grad_norm": 0.6253348588943481, "learning_rate": 0.0001, "loss": 0.3737, "step": 436 }, { "epoch": 2.6646341463414633, "grad_norm": 0.6139116287231445, "learning_rate": 0.0001, "loss": 0.4165, "step": 437 }, { "epoch": 2.6707317073170733, "grad_norm": 0.6256686449050903, "learning_rate": 0.0001, "loss": 0.3838, "step": 438 }, { "epoch": 2.676829268292683, "grad_norm": 0.6139652729034424, "learning_rate": 0.0001, "loss": 0.3751, "step": 439 }, { "epoch": 2.682926829268293, "grad_norm": 0.6227155923843384, "learning_rate": 0.0001, "loss": 0.3752, "step": 440 }, { "epoch": 2.6890243902439024, "grad_norm": 0.590382993221283, "learning_rate": 0.0001, "loss": 0.3896, "step": 441 }, { "epoch": 2.6951219512195124, "grad_norm": 0.6084756255149841, "learning_rate": 0.0001, "loss": 0.3725, "step": 442 }, { "epoch": 2.701219512195122, "grad_norm": 0.6576021909713745, "learning_rate": 0.0001, "loss": 0.4095, "step": 443 }, { "epoch": 2.7073170731707314, "grad_norm": 0.6265486478805542, "learning_rate": 0.0001, "loss": 0.3868, "step": 444 }, { "epoch": 2.7134146341463414, "grad_norm": 0.651096761226654, "learning_rate": 0.0001, "loss": 0.4042, "step": 445 }, { "epoch": 2.7195121951219514, "grad_norm": 0.6373317241668701, "learning_rate": 0.0001, "loss": 0.4209, "step": 446 }, { "epoch": 2.725609756097561, "grad_norm": 0.6040897965431213, "learning_rate": 0.0001, "loss": 0.4084, "step": 447 }, { "epoch": 2.7317073170731705, "grad_norm": 0.6254827976226807, "learning_rate": 0.0001, "loss": 0.3646, "step": 448 }, { "epoch": 2.7378048780487805, "grad_norm": 0.6285514831542969, "learning_rate": 0.0001, "loss": 0.3711, "step": 449 }, { "epoch": 2.7439024390243905, "grad_norm": 0.675573468208313, "learning_rate": 0.0001, "loss": 0.4191, "step": 450 }, { "epoch": 2.75, "grad_norm": 0.6126376390457153, "learning_rate": 0.0001, "loss": 0.3782, "step": 451 }, { "epoch": 2.7560975609756095, "grad_norm": 0.6281729340553284, "learning_rate": 0.0001, "loss": 0.3778, "step": 452 }, { "epoch": 2.7621951219512195, "grad_norm": 0.5908406376838684, "learning_rate": 0.0001, "loss": 0.3927, "step": 453 }, { "epoch": 2.7682926829268295, "grad_norm": 0.6050170660018921, "learning_rate": 0.0001, "loss": 0.431, "step": 454 }, { "epoch": 2.774390243902439, "grad_norm": 0.624231219291687, "learning_rate": 0.0001, "loss": 0.3774, "step": 455 }, { "epoch": 2.7804878048780486, "grad_norm": 0.6320463418960571, "learning_rate": 0.0001, "loss": 0.4062, "step": 456 }, { "epoch": 2.7865853658536586, "grad_norm": 0.6329071521759033, "learning_rate": 0.0001, "loss": 0.3962, "step": 457 }, { "epoch": 2.7926829268292686, "grad_norm": 0.6450055241584778, "learning_rate": 0.0001, "loss": 0.4096, "step": 458 }, { "epoch": 2.798780487804878, "grad_norm": 0.6559624671936035, "learning_rate": 0.0001, "loss": 0.4015, "step": 459 }, { "epoch": 2.8048780487804876, "grad_norm": 0.5944327116012573, "learning_rate": 0.0001, "loss": 0.3864, "step": 460 }, { "epoch": 2.8109756097560976, "grad_norm": 0.6524405479431152, "learning_rate": 0.0001, "loss": 0.4245, "step": 461 }, { "epoch": 2.817073170731707, "grad_norm": 0.6659778952598572, "learning_rate": 0.0001, "loss": 0.419, "step": 462 }, { "epoch": 2.823170731707317, "grad_norm": 0.6520142555236816, "learning_rate": 0.0001, "loss": 0.4163, "step": 463 }, { "epoch": 2.8292682926829267, "grad_norm": 0.6226247549057007, "learning_rate": 0.0001, "loss": 0.3898, "step": 464 }, { "epoch": 2.8353658536585367, "grad_norm": 0.6132051348686218, "learning_rate": 0.0001, "loss": 0.3854, "step": 465 }, { "epoch": 2.841463414634146, "grad_norm": 0.6409340500831604, "learning_rate": 0.0001, "loss": 0.3663, "step": 466 }, { "epoch": 2.847560975609756, "grad_norm": 0.638858437538147, "learning_rate": 0.0001, "loss": 0.381, "step": 467 }, { "epoch": 2.8536585365853657, "grad_norm": 0.6682012677192688, "learning_rate": 0.0001, "loss": 0.4027, "step": 468 }, { "epoch": 2.8597560975609757, "grad_norm": 0.6829751133918762, "learning_rate": 0.0001, "loss": 0.4232, "step": 469 }, { "epoch": 2.8658536585365852, "grad_norm": 0.6196625232696533, "learning_rate": 0.0001, "loss": 0.3629, "step": 470 }, { "epoch": 2.8719512195121952, "grad_norm": 0.6654703617095947, "learning_rate": 0.0001, "loss": 0.4071, "step": 471 }, { "epoch": 2.8780487804878048, "grad_norm": 0.6258810758590698, "learning_rate": 0.0001, "loss": 0.3893, "step": 472 }, { "epoch": 2.8841463414634148, "grad_norm": 0.6281041502952576, "learning_rate": 0.0001, "loss": 0.3978, "step": 473 }, { "epoch": 2.8902439024390243, "grad_norm": 0.6136834621429443, "learning_rate": 0.0001, "loss": 0.4258, "step": 474 }, { "epoch": 2.8963414634146343, "grad_norm": 0.6135198473930359, "learning_rate": 0.0001, "loss": 0.3793, "step": 475 }, { "epoch": 2.902439024390244, "grad_norm": 0.6039949059486389, "learning_rate": 0.0001, "loss": 0.4034, "step": 476 }, { "epoch": 2.908536585365854, "grad_norm": 0.6059561967849731, "learning_rate": 0.0001, "loss": 0.3997, "step": 477 }, { "epoch": 2.9146341463414633, "grad_norm": 0.6142321825027466, "learning_rate": 0.0001, "loss": 0.3778, "step": 478 }, { "epoch": 2.9207317073170733, "grad_norm": 0.6661014556884766, "learning_rate": 0.0001, "loss": 0.4241, "step": 479 }, { "epoch": 2.926829268292683, "grad_norm": 0.6781815886497498, "learning_rate": 0.0001, "loss": 0.3969, "step": 480 }, { "epoch": 2.932926829268293, "grad_norm": 0.6294031739234924, "learning_rate": 0.0001, "loss": 0.3768, "step": 481 }, { "epoch": 2.9390243902439024, "grad_norm": 0.6458147764205933, "learning_rate": 0.0001, "loss": 0.393, "step": 482 }, { "epoch": 2.9451219512195124, "grad_norm": 0.5952702760696411, "learning_rate": 0.0001, "loss": 0.3844, "step": 483 }, { "epoch": 2.951219512195122, "grad_norm": 0.5768480896949768, "learning_rate": 0.0001, "loss": 0.3893, "step": 484 }, { "epoch": 2.9573170731707314, "grad_norm": 0.6429164409637451, "learning_rate": 0.0001, "loss": 0.4078, "step": 485 }, { "epoch": 2.9634146341463414, "grad_norm": 0.5966724753379822, "learning_rate": 0.0001, "loss": 0.3689, "step": 486 }, { "epoch": 2.9695121951219514, "grad_norm": 0.6305826306343079, "learning_rate": 0.0001, "loss": 0.3982, "step": 487 }, { "epoch": 2.975609756097561, "grad_norm": 0.6368945240974426, "learning_rate": 0.0001, "loss": 0.4033, "step": 488 }, { "epoch": 2.9817073170731705, "grad_norm": 0.6413828730583191, "learning_rate": 0.0001, "loss": 0.392, "step": 489 }, { "epoch": 2.9878048780487805, "grad_norm": 0.626516580581665, "learning_rate": 0.0001, "loss": 0.3908, "step": 490 }, { "epoch": 2.9939024390243905, "grad_norm": 0.6416463255882263, "learning_rate": 0.0001, "loss": 0.397, "step": 491 }, { "epoch": 3.0, "grad_norm": 0.6507825255393982, "learning_rate": 0.0001, "loss": 0.3931, "step": 492 } ], "logging_steps": 1, "max_steps": 492, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.340951084078203e+17, "train_batch_size": 10, "trial_name": null, "trial_params": null }