{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999245909056632, "eval_steps": 500, "global_step": 6630, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00015081818867355404, "grad_norm": 0.6922811636045936, "learning_rate": 4.999999747404737e-07, "loss": 1.6389, "step": 1 }, { "epoch": 0.0003016363773471081, "grad_norm": 0.7140881455188725, "learning_rate": 4.999998989619002e-07, "loss": 1.7317, "step": 2 }, { "epoch": 0.0004524545660206621, "grad_norm": 0.7901119831427035, "learning_rate": 4.999997726642967e-07, "loss": 1.6932, "step": 3 }, { "epoch": 0.0006032727546942162, "grad_norm": 0.779821728799067, "learning_rate": 4.999995958476916e-07, "loss": 1.6998, "step": 4 }, { "epoch": 0.0007540909433677702, "grad_norm": 0.7460901373887201, "learning_rate": 4.999993685121245e-07, "loss": 1.6461, "step": 5 }, { "epoch": 0.0009049091320413242, "grad_norm": 0.7488137173948498, "learning_rate": 4.999990906576464e-07, "loss": 1.7045, "step": 6 }, { "epoch": 0.0010557273207148783, "grad_norm": 0.8105539489064422, "learning_rate": 4.999987622843198e-07, "loss": 1.6732, "step": 7 }, { "epoch": 0.0012065455093884323, "grad_norm": 0.6994546178393476, "learning_rate": 4.999983833922184e-07, "loss": 1.6795, "step": 8 }, { "epoch": 0.0013573636980619864, "grad_norm": 0.7937671031757709, "learning_rate": 4.999979539814273e-07, "loss": 1.7008, "step": 9 }, { "epoch": 0.0015081818867355404, "grad_norm": 0.6602329299835072, "learning_rate": 4.999974740520428e-07, "loss": 1.6682, "step": 10 }, { "epoch": 0.0016590000754090944, "grad_norm": 0.6821099703015343, "learning_rate": 4.999969436041726e-07, "loss": 1.678, "step": 11 }, { "epoch": 0.0018098182640826485, "grad_norm": 0.7189347675969909, "learning_rate": 4.999963626379361e-07, "loss": 1.6921, "step": 12 }, { "epoch": 0.0019606364527562023, "grad_norm": 0.6454737446123784, "learning_rate": 4.999957311534635e-07, "loss": 1.6031, "step": 13 }, { "epoch": 0.0021114546414297566, "grad_norm": 0.6932215740279462, "learning_rate": 4.999950491508967e-07, "loss": 1.6368, "step": 14 }, { "epoch": 0.0022622728301033104, "grad_norm": 0.6697749479364712, "learning_rate": 4.999943166303888e-07, "loss": 1.7087, "step": 15 }, { "epoch": 0.0024130910187768646, "grad_norm": 0.6414871607410737, "learning_rate": 4.999935335921042e-07, "loss": 1.5736, "step": 16 }, { "epoch": 0.0025639092074504185, "grad_norm": 0.6524390383806831, "learning_rate": 4.999927000362188e-07, "loss": 1.6607, "step": 17 }, { "epoch": 0.0027147273961239727, "grad_norm": 0.5498201184163394, "learning_rate": 4.999918159629197e-07, "loss": 1.6549, "step": 18 }, { "epoch": 0.0028655455847975265, "grad_norm": 0.5906190966505179, "learning_rate": 4.999908813724054e-07, "loss": 1.6223, "step": 19 }, { "epoch": 0.003016363773471081, "grad_norm": 0.5409603294442903, "learning_rate": 4.999898962648858e-07, "loss": 1.646, "step": 20 }, { "epoch": 0.0031671819621446346, "grad_norm": 0.47639936391820453, "learning_rate": 4.999888606405821e-07, "loss": 1.6288, "step": 21 }, { "epoch": 0.003318000150818189, "grad_norm": 0.49001145145256797, "learning_rate": 4.999877744997268e-07, "loss": 1.6078, "step": 22 }, { "epoch": 0.0034688183394917427, "grad_norm": 0.4480835545958283, "learning_rate": 4.999866378425637e-07, "loss": 1.6913, "step": 23 }, { "epoch": 0.003619636528165297, "grad_norm": 0.47975660258284775, "learning_rate": 4.999854506693481e-07, "loss": 1.6562, "step": 24 }, { "epoch": 0.0037704547168388508, "grad_norm": 0.45739459732441357, "learning_rate": 4.999842129803465e-07, "loss": 1.5977, "step": 25 }, { "epoch": 0.003921272905512405, "grad_norm": 0.47345874678684624, "learning_rate": 4.999829247758368e-07, "loss": 1.6145, "step": 26 }, { "epoch": 0.004072091094185958, "grad_norm": 0.5277188876747779, "learning_rate": 4.999815860561083e-07, "loss": 1.6744, "step": 27 }, { "epoch": 0.004222909282859513, "grad_norm": 0.49781748452982383, "learning_rate": 4.999801968214616e-07, "loss": 1.6795, "step": 28 }, { "epoch": 0.004373727471533067, "grad_norm": 0.4058450594970341, "learning_rate": 4.999787570722085e-07, "loss": 1.5995, "step": 29 }, { "epoch": 0.004524545660206621, "grad_norm": 0.4688159734665827, "learning_rate": 4.999772668086723e-07, "loss": 1.6533, "step": 30 }, { "epoch": 0.004675363848880175, "grad_norm": 0.5382144698866896, "learning_rate": 4.999757260311877e-07, "loss": 1.6687, "step": 31 }, { "epoch": 0.004826182037553729, "grad_norm": 0.43364920622549574, "learning_rate": 4.999741347401006e-07, "loss": 1.6237, "step": 32 }, { "epoch": 0.004977000226227283, "grad_norm": 0.4417491883275204, "learning_rate": 4.999724929357683e-07, "loss": 1.6064, "step": 33 }, { "epoch": 0.005127818414900837, "grad_norm": 0.3798147688605585, "learning_rate": 4.999708006185593e-07, "loss": 1.5982, "step": 34 }, { "epoch": 0.005278636603574391, "grad_norm": 0.3722151540923329, "learning_rate": 4.999690577888537e-07, "loss": 1.5851, "step": 35 }, { "epoch": 0.005429454792247945, "grad_norm": 0.3670996345126827, "learning_rate": 4.999672644470428e-07, "loss": 1.6649, "step": 36 }, { "epoch": 0.005580272980921499, "grad_norm": 0.32863895232492335, "learning_rate": 4.999654205935294e-07, "loss": 1.6039, "step": 37 }, { "epoch": 0.005731091169595053, "grad_norm": 0.33479731199962554, "learning_rate": 4.999635262287272e-07, "loss": 1.6022, "step": 38 }, { "epoch": 0.005881909358268607, "grad_norm": 0.3340694893124953, "learning_rate": 4.999615813530618e-07, "loss": 1.7122, "step": 39 }, { "epoch": 0.006032727546942162, "grad_norm": 0.34129372778475764, "learning_rate": 4.999595859669698e-07, "loss": 1.7592, "step": 40 }, { "epoch": 0.006183545735615715, "grad_norm": 0.3737447888089227, "learning_rate": 4.999575400708991e-07, "loss": 1.675, "step": 41 }, { "epoch": 0.006334363924289269, "grad_norm": 0.31806492846143203, "learning_rate": 4.999554436653093e-07, "loss": 1.6369, "step": 42 }, { "epoch": 0.006485182112962823, "grad_norm": 4.465999509985267, "learning_rate": 4.999532967506708e-07, "loss": 1.768, "step": 43 }, { "epoch": 0.006636000301636378, "grad_norm": 0.38534249346532656, "learning_rate": 4.999510993274659e-07, "loss": 1.6387, "step": 44 }, { "epoch": 0.0067868184903099316, "grad_norm": 0.31913751532411, "learning_rate": 4.99948851396188e-07, "loss": 1.6166, "step": 45 }, { "epoch": 0.006937636678983485, "grad_norm": 2.426973413694192, "learning_rate": 4.999465529573416e-07, "loss": 1.6164, "step": 46 }, { "epoch": 0.007088454867657039, "grad_norm": 0.3265648082672616, "learning_rate": 4.999442040114429e-07, "loss": 1.6604, "step": 47 }, { "epoch": 0.007239273056330594, "grad_norm": 0.31660772435225665, "learning_rate": 4.999418045590194e-07, "loss": 1.5524, "step": 48 }, { "epoch": 0.007390091245004148, "grad_norm": 0.31800527693674274, "learning_rate": 4.999393546006097e-07, "loss": 1.6414, "step": 49 }, { "epoch": 0.0075409094336777015, "grad_norm": 0.3152640122154442, "learning_rate": 4.999368541367639e-07, "loss": 1.6063, "step": 50 }, { "epoch": 0.007691727622351255, "grad_norm": 0.32043620783105936, "learning_rate": 4.999343031680434e-07, "loss": 1.6677, "step": 51 }, { "epoch": 0.00784254581102481, "grad_norm": 0.3018244592626452, "learning_rate": 4.999317016950211e-07, "loss": 1.6265, "step": 52 }, { "epoch": 0.007993363999698364, "grad_norm": 0.3424930702846109, "learning_rate": 4.99929049718281e-07, "loss": 1.6404, "step": 53 }, { "epoch": 0.008144182188371917, "grad_norm": 0.30974610592242463, "learning_rate": 4.999263472384186e-07, "loss": 1.5421, "step": 54 }, { "epoch": 0.008295000377045472, "grad_norm": 0.30658278771053415, "learning_rate": 4.999235942560407e-07, "loss": 1.6901, "step": 55 }, { "epoch": 0.008445818565719026, "grad_norm": 0.3174958401672945, "learning_rate": 4.999207907717654e-07, "loss": 1.5751, "step": 56 }, { "epoch": 0.00859663675439258, "grad_norm": 0.38057071951040056, "learning_rate": 4.999179367862221e-07, "loss": 1.5924, "step": 57 }, { "epoch": 0.008747454943066134, "grad_norm": 0.3042205867066707, "learning_rate": 4.999150323000517e-07, "loss": 1.6212, "step": 58 }, { "epoch": 0.008898273131739689, "grad_norm": 0.29037474131564006, "learning_rate": 4.999120773139062e-07, "loss": 1.6054, "step": 59 }, { "epoch": 0.009049091320413242, "grad_norm": 0.33743493712479644, "learning_rate": 4.999090718284493e-07, "loss": 1.6234, "step": 60 }, { "epoch": 0.009199909509086796, "grad_norm": 0.3190295567941831, "learning_rate": 4.999060158443557e-07, "loss": 1.6061, "step": 61 }, { "epoch": 0.00935072769776035, "grad_norm": 0.3060496053543246, "learning_rate": 4.999029093623115e-07, "loss": 1.53, "step": 62 }, { "epoch": 0.009501545886433904, "grad_norm": 0.330007452265362, "learning_rate": 4.998997523830142e-07, "loss": 1.5874, "step": 63 }, { "epoch": 0.009652364075107459, "grad_norm": 0.2913872540073151, "learning_rate": 4.998965449071727e-07, "loss": 1.6215, "step": 64 }, { "epoch": 0.009803182263781011, "grad_norm": 0.3861447338808342, "learning_rate": 4.998932869355073e-07, "loss": 1.618, "step": 65 }, { "epoch": 0.009954000452454566, "grad_norm": 0.27958822059870586, "learning_rate": 4.998899784687492e-07, "loss": 1.6103, "step": 66 }, { "epoch": 0.01010481864112812, "grad_norm": 0.5931853797312046, "learning_rate": 4.998866195076416e-07, "loss": 1.5916, "step": 67 }, { "epoch": 0.010255636829801674, "grad_norm": 0.30425612593635215, "learning_rate": 4.998832100529383e-07, "loss": 1.6481, "step": 68 }, { "epoch": 0.010406455018475229, "grad_norm": 0.28173707892876365, "learning_rate": 4.998797501054052e-07, "loss": 1.6412, "step": 69 }, { "epoch": 0.010557273207148781, "grad_norm": 0.35484062928114324, "learning_rate": 4.998762396658189e-07, "loss": 1.6716, "step": 70 }, { "epoch": 0.010708091395822336, "grad_norm": 0.30696711292842577, "learning_rate": 4.998726787349676e-07, "loss": 1.6161, "step": 71 }, { "epoch": 0.01085890958449589, "grad_norm": 0.29830361910091235, "learning_rate": 4.99869067313651e-07, "loss": 1.5406, "step": 72 }, { "epoch": 0.011009727773169444, "grad_norm": 0.3257150413463108, "learning_rate": 4.998654054026798e-07, "loss": 1.6029, "step": 73 }, { "epoch": 0.011160545961842998, "grad_norm": 0.27608611779319664, "learning_rate": 4.998616930028763e-07, "loss": 1.5345, "step": 74 }, { "epoch": 0.011311364150516551, "grad_norm": 0.2801184141085234, "learning_rate": 4.998579301150741e-07, "loss": 1.6298, "step": 75 }, { "epoch": 0.011462182339190106, "grad_norm": 0.2877514992208577, "learning_rate": 4.998541167401179e-07, "loss": 1.5923, "step": 76 }, { "epoch": 0.01161300052786366, "grad_norm": 0.4974526858441575, "learning_rate": 4.998502528788641e-07, "loss": 1.6215, "step": 77 }, { "epoch": 0.011763818716537214, "grad_norm": 0.3049310065556733, "learning_rate": 4.998463385321801e-07, "loss": 1.6285, "step": 78 }, { "epoch": 0.011914636905210768, "grad_norm": 0.32712493893767625, "learning_rate": 4.99842373700945e-07, "loss": 1.6601, "step": 79 }, { "epoch": 0.012065455093884323, "grad_norm": 0.4232235008613389, "learning_rate": 4.998383583860487e-07, "loss": 1.6311, "step": 80 }, { "epoch": 0.012216273282557876, "grad_norm": 0.27530166675598666, "learning_rate": 4.99834292588393e-07, "loss": 1.5821, "step": 81 }, { "epoch": 0.01236709147123143, "grad_norm": 0.3199412588866916, "learning_rate": 4.998301763088906e-07, "loss": 1.7004, "step": 82 }, { "epoch": 0.012517909659904984, "grad_norm": 0.3765606480051871, "learning_rate": 4.99826009548466e-07, "loss": 1.572, "step": 83 }, { "epoch": 0.012668727848578538, "grad_norm": 0.27388681299904893, "learning_rate": 4.998217923080545e-07, "loss": 1.5805, "step": 84 }, { "epoch": 0.012819546037252093, "grad_norm": 0.28497127222933466, "learning_rate": 4.998175245886032e-07, "loss": 1.6096, "step": 85 }, { "epoch": 0.012970364225925646, "grad_norm": 0.288999756333011, "learning_rate": 4.998132063910701e-07, "loss": 1.629, "step": 86 }, { "epoch": 0.0131211824145992, "grad_norm": 0.2829852204496664, "learning_rate": 4.99808837716425e-07, "loss": 1.6398, "step": 87 }, { "epoch": 0.013272000603272755, "grad_norm": 0.3163616256907245, "learning_rate": 4.998044185656485e-07, "loss": 1.6663, "step": 88 }, { "epoch": 0.013422818791946308, "grad_norm": 0.29291670767485606, "learning_rate": 4.997999489397332e-07, "loss": 1.6779, "step": 89 }, { "epoch": 0.013573636980619863, "grad_norm": 0.3574975540951008, "learning_rate": 4.997954288396823e-07, "loss": 1.6485, "step": 90 }, { "epoch": 0.013724455169293416, "grad_norm": 0.2945864109199964, "learning_rate": 4.997908582665109e-07, "loss": 1.6502, "step": 91 }, { "epoch": 0.01387527335796697, "grad_norm": 0.2753915566219201, "learning_rate": 4.997862372212452e-07, "loss": 1.5492, "step": 92 }, { "epoch": 0.014026091546640525, "grad_norm": 0.26560879180698244, "learning_rate": 4.997815657049228e-07, "loss": 1.6694, "step": 93 }, { "epoch": 0.014176909735314078, "grad_norm": 0.28303864778500293, "learning_rate": 4.997768437185925e-07, "loss": 1.5992, "step": 94 }, { "epoch": 0.014327727923987633, "grad_norm": 0.27651869982324084, "learning_rate": 4.997720712633146e-07, "loss": 1.6188, "step": 95 }, { "epoch": 0.014478546112661188, "grad_norm": 0.26326254508243496, "learning_rate": 4.997672483401605e-07, "loss": 1.5778, "step": 96 }, { "epoch": 0.01462936430133474, "grad_norm": 0.2826952345075853, "learning_rate": 4.997623749502134e-07, "loss": 1.6432, "step": 97 }, { "epoch": 0.014780182490008295, "grad_norm": 0.27953137964722935, "learning_rate": 4.997574510945671e-07, "loss": 1.5695, "step": 98 }, { "epoch": 0.014931000678681848, "grad_norm": 0.2688364949225841, "learning_rate": 4.997524767743275e-07, "loss": 1.6274, "step": 99 }, { "epoch": 0.015081818867355403, "grad_norm": 0.3209356041339321, "learning_rate": 4.997474519906113e-07, "loss": 1.6223, "step": 100 }, { "epoch": 0.015232637056028958, "grad_norm": 1.0329372307400904, "learning_rate": 4.997423767445468e-07, "loss": 1.626, "step": 101 }, { "epoch": 0.01538345524470251, "grad_norm": 0.2925370222104452, "learning_rate": 4.997372510372735e-07, "loss": 1.5832, "step": 102 }, { "epoch": 0.015534273433376065, "grad_norm": 0.827547882991668, "learning_rate": 4.997320748699422e-07, "loss": 1.6391, "step": 103 }, { "epoch": 0.01568509162204962, "grad_norm": 0.28584586927783384, "learning_rate": 4.997268482437152e-07, "loss": 1.6098, "step": 104 }, { "epoch": 0.015835909810723175, "grad_norm": 0.2772653083687217, "learning_rate": 4.99721571159766e-07, "loss": 1.5489, "step": 105 }, { "epoch": 0.015986727999396728, "grad_norm": 0.3027002593820936, "learning_rate": 4.997162436192795e-07, "loss": 1.6175, "step": 106 }, { "epoch": 0.01613754618807028, "grad_norm": 0.26151021422707005, "learning_rate": 4.997108656234518e-07, "loss": 1.5712, "step": 107 }, { "epoch": 0.016288364376743834, "grad_norm": 0.2760524205676756, "learning_rate": 4.997054371734905e-07, "loss": 1.6033, "step": 108 }, { "epoch": 0.01643918256541739, "grad_norm": 0.2652710973293421, "learning_rate": 4.996999582706143e-07, "loss": 1.5926, "step": 109 }, { "epoch": 0.016590000754090943, "grad_norm": 0.27448663116920097, "learning_rate": 4.996944289160536e-07, "loss": 1.5478, "step": 110 }, { "epoch": 0.016740818942764496, "grad_norm": 0.27373441719549557, "learning_rate": 4.996888491110498e-07, "loss": 1.6436, "step": 111 }, { "epoch": 0.016891637131438052, "grad_norm": 0.2823563689658009, "learning_rate": 4.996832188568556e-07, "loss": 1.6573, "step": 112 }, { "epoch": 0.017042455320111605, "grad_norm": 0.2905540170801979, "learning_rate": 4.996775381547355e-07, "loss": 1.6346, "step": 113 }, { "epoch": 0.01719327350878516, "grad_norm": 0.36710435208412284, "learning_rate": 4.996718070059646e-07, "loss": 1.6623, "step": 114 }, { "epoch": 0.017344091697458715, "grad_norm": 0.28150968628183515, "learning_rate": 4.996660254118298e-07, "loss": 1.621, "step": 115 }, { "epoch": 0.017494909886132268, "grad_norm": 0.2908537614777647, "learning_rate": 4.996601933736293e-07, "loss": 1.5806, "step": 116 }, { "epoch": 0.01764572807480582, "grad_norm": 4.782372308328432, "learning_rate": 4.996543108926726e-07, "loss": 1.6366, "step": 117 }, { "epoch": 0.017796546263479377, "grad_norm": 0.28382687643398863, "learning_rate": 4.996483779702805e-07, "loss": 1.6322, "step": 118 }, { "epoch": 0.01794736445215293, "grad_norm": 0.26019481261096256, "learning_rate": 4.99642394607785e-07, "loss": 1.6209, "step": 119 }, { "epoch": 0.018098182640826483, "grad_norm": 0.3154808626611058, "learning_rate": 4.996363608065297e-07, "loss": 1.6731, "step": 120 }, { "epoch": 0.01824900082950004, "grad_norm": 0.28724383174878004, "learning_rate": 4.996302765678691e-07, "loss": 1.6151, "step": 121 }, { "epoch": 0.018399819018173592, "grad_norm": 0.2673293043423508, "learning_rate": 4.996241418931695e-07, "loss": 1.6209, "step": 122 }, { "epoch": 0.018550637206847145, "grad_norm": 0.2714980441671276, "learning_rate": 4.996179567838084e-07, "loss": 1.6281, "step": 123 }, { "epoch": 0.0187014553955207, "grad_norm": 0.27001328981908584, "learning_rate": 4.996117212411742e-07, "loss": 1.5936, "step": 124 }, { "epoch": 0.018852273584194255, "grad_norm": 7.240553966508071, "learning_rate": 4.996054352666673e-07, "loss": 1.6544, "step": 125 }, { "epoch": 0.019003091772867808, "grad_norm": 0.2909128295098797, "learning_rate": 4.99599098861699e-07, "loss": 1.5634, "step": 126 }, { "epoch": 0.01915390996154136, "grad_norm": 0.2564976792990476, "learning_rate": 4.995927120276918e-07, "loss": 1.5892, "step": 127 }, { "epoch": 0.019304728150214917, "grad_norm": 0.28813897636995855, "learning_rate": 4.995862747660799e-07, "loss": 1.5968, "step": 128 }, { "epoch": 0.01945554633888847, "grad_norm": 0.26825660858447004, "learning_rate": 4.995797870783087e-07, "loss": 1.6441, "step": 129 }, { "epoch": 0.019606364527562023, "grad_norm": 0.8593441639393585, "learning_rate": 4.995732489658349e-07, "loss": 1.6078, "step": 130 }, { "epoch": 0.01975718271623558, "grad_norm": 0.2744427040666157, "learning_rate": 4.995666604301263e-07, "loss": 1.5881, "step": 131 }, { "epoch": 0.019908000904909132, "grad_norm": 0.2935832854888586, "learning_rate": 4.995600214726624e-07, "loss": 1.6288, "step": 132 }, { "epoch": 0.020058819093582685, "grad_norm": 0.29119246468057447, "learning_rate": 4.995533320949337e-07, "loss": 1.6326, "step": 133 }, { "epoch": 0.02020963728225624, "grad_norm": 0.3205215551752403, "learning_rate": 4.995465922984423e-07, "loss": 1.6077, "step": 134 }, { "epoch": 0.020360455470929795, "grad_norm": 0.2689930184821362, "learning_rate": 4.995398020847013e-07, "loss": 1.6249, "step": 135 }, { "epoch": 0.020511273659603348, "grad_norm": 0.28616284464875474, "learning_rate": 4.995329614552355e-07, "loss": 1.5789, "step": 136 }, { "epoch": 0.0206620918482769, "grad_norm": 0.28300795023532677, "learning_rate": 4.995260704115808e-07, "loss": 1.6373, "step": 137 }, { "epoch": 0.020812910036950457, "grad_norm": 0.2625446722721147, "learning_rate": 4.995191289552842e-07, "loss": 1.6015, "step": 138 }, { "epoch": 0.02096372822562401, "grad_norm": 0.2847634969602461, "learning_rate": 4.995121370879045e-07, "loss": 1.5301, "step": 139 }, { "epoch": 0.021114546414297563, "grad_norm": 0.5003742319182984, "learning_rate": 4.995050948110115e-07, "loss": 1.5855, "step": 140 }, { "epoch": 0.02126536460297112, "grad_norm": 0.3212774484618759, "learning_rate": 4.994980021261864e-07, "loss": 1.6243, "step": 141 }, { "epoch": 0.021416182791644672, "grad_norm": 0.26244417253211844, "learning_rate": 4.994908590350218e-07, "loss": 1.5763, "step": 142 }, { "epoch": 0.021567000980318225, "grad_norm": 0.2657090011462843, "learning_rate": 4.994836655391214e-07, "loss": 1.6437, "step": 143 }, { "epoch": 0.02171781916899178, "grad_norm": 0.27420630718454775, "learning_rate": 4.994764216401004e-07, "loss": 1.5617, "step": 144 }, { "epoch": 0.021868637357665335, "grad_norm": 0.2884668926439784, "learning_rate": 4.994691273395852e-07, "loss": 1.6123, "step": 145 }, { "epoch": 0.022019455546338888, "grad_norm": 0.3301484708204669, "learning_rate": 4.994617826392137e-07, "loss": 1.6564, "step": 146 }, { "epoch": 0.022170273735012444, "grad_norm": 0.2591901169274755, "learning_rate": 4.994543875406349e-07, "loss": 1.6437, "step": 147 }, { "epoch": 0.022321091923685997, "grad_norm": 0.2592192947358982, "learning_rate": 4.994469420455093e-07, "loss": 1.6852, "step": 148 }, { "epoch": 0.02247191011235955, "grad_norm": 0.25884995709127223, "learning_rate": 4.994394461555087e-07, "loss": 1.5904, "step": 149 }, { "epoch": 0.022622728301033103, "grad_norm": 0.2632847561494921, "learning_rate": 4.99431899872316e-07, "loss": 1.604, "step": 150 }, { "epoch": 0.02277354648970666, "grad_norm": 0.27459575640410483, "learning_rate": 4.994243031976255e-07, "loss": 1.5783, "step": 151 }, { "epoch": 0.022924364678380212, "grad_norm": 0.263011002646199, "learning_rate": 4.99416656133143e-07, "loss": 1.612, "step": 152 }, { "epoch": 0.023075182867053765, "grad_norm": 0.452258821903041, "learning_rate": 4.994089586805855e-07, "loss": 1.5412, "step": 153 }, { "epoch": 0.02322600105572732, "grad_norm": 0.2636086865413926, "learning_rate": 4.994012108416812e-07, "loss": 1.5354, "step": 154 }, { "epoch": 0.023376819244400875, "grad_norm": 0.2698908588403687, "learning_rate": 4.993934126181698e-07, "loss": 1.5574, "step": 155 }, { "epoch": 0.023527637433074428, "grad_norm": 0.2620736602088264, "learning_rate": 4.993855640118023e-07, "loss": 1.5399, "step": 156 }, { "epoch": 0.023678455621747984, "grad_norm": 0.2662131144061968, "learning_rate": 4.993776650243408e-07, "loss": 1.5513, "step": 157 }, { "epoch": 0.023829273810421537, "grad_norm": 0.33202641823726214, "learning_rate": 4.993697156575589e-07, "loss": 1.5834, "step": 158 }, { "epoch": 0.02398009199909509, "grad_norm": 0.383715850377704, "learning_rate": 4.993617159132415e-07, "loss": 1.6506, "step": 159 }, { "epoch": 0.024130910187768646, "grad_norm": 0.2642653431126336, "learning_rate": 4.993536657931846e-07, "loss": 1.5637, "step": 160 }, { "epoch": 0.0242817283764422, "grad_norm": 0.292189030769218, "learning_rate": 4.993455652991959e-07, "loss": 1.6019, "step": 161 }, { "epoch": 0.024432546565115752, "grad_norm": 0.27074399593196924, "learning_rate": 4.993374144330942e-07, "loss": 1.6746, "step": 162 }, { "epoch": 0.02458336475378931, "grad_norm": 0.2460380594866087, "learning_rate": 4.993292131967094e-07, "loss": 1.6073, "step": 163 }, { "epoch": 0.02473418294246286, "grad_norm": 0.9355172258960992, "learning_rate": 4.993209615918832e-07, "loss": 1.6106, "step": 164 }, { "epoch": 0.024885001131136415, "grad_norm": 0.2595553364190686, "learning_rate": 4.99312659620468e-07, "loss": 1.5158, "step": 165 }, { "epoch": 0.025035819319809968, "grad_norm": 0.31310147215799994, "learning_rate": 4.993043072843282e-07, "loss": 1.5527, "step": 166 }, { "epoch": 0.025186637508483524, "grad_norm": 0.30063636617290196, "learning_rate": 4.992959045853388e-07, "loss": 1.6818, "step": 167 }, { "epoch": 0.025337455697157077, "grad_norm": 0.26319554722690824, "learning_rate": 4.992874515253867e-07, "loss": 1.6324, "step": 168 }, { "epoch": 0.02548827388583063, "grad_norm": 0.2821630706130611, "learning_rate": 4.992789481063698e-07, "loss": 1.6648, "step": 169 }, { "epoch": 0.025639092074504186, "grad_norm": 0.2642207888569884, "learning_rate": 4.992703943301973e-07, "loss": 1.6741, "step": 170 }, { "epoch": 0.02578991026317774, "grad_norm": 0.26449734982639606, "learning_rate": 4.992617901987897e-07, "loss": 1.578, "step": 171 }, { "epoch": 0.025940728451851292, "grad_norm": 0.26710134208771796, "learning_rate": 4.99253135714079e-07, "loss": 1.5813, "step": 172 }, { "epoch": 0.02609154664052485, "grad_norm": 0.2561809201414807, "learning_rate": 4.992444308780084e-07, "loss": 1.566, "step": 173 }, { "epoch": 0.0262423648291984, "grad_norm": 0.29393431439094037, "learning_rate": 4.992356756925323e-07, "loss": 1.629, "step": 174 }, { "epoch": 0.026393183017871955, "grad_norm": 0.25906234137363926, "learning_rate": 4.992268701596166e-07, "loss": 1.5759, "step": 175 }, { "epoch": 0.02654400120654551, "grad_norm": 0.26821181727998744, "learning_rate": 4.992180142812383e-07, "loss": 1.6519, "step": 176 }, { "epoch": 0.026694819395219064, "grad_norm": 0.3624980940371162, "learning_rate": 4.992091080593858e-07, "loss": 1.605, "step": 177 }, { "epoch": 0.026845637583892617, "grad_norm": 0.2623854348627949, "learning_rate": 4.99200151496059e-07, "loss": 1.6117, "step": 178 }, { "epoch": 0.02699645577256617, "grad_norm": 0.2627813158352471, "learning_rate": 4.991911445932686e-07, "loss": 1.5814, "step": 179 }, { "epoch": 0.027147273961239726, "grad_norm": 0.2594695889060515, "learning_rate": 4.991820873530371e-07, "loss": 1.52, "step": 180 }, { "epoch": 0.02729809214991328, "grad_norm": 0.30046272167288224, "learning_rate": 4.991729797773981e-07, "loss": 1.6095, "step": 181 }, { "epoch": 0.027448910338586832, "grad_norm": 0.254940160039734, "learning_rate": 4.991638218683966e-07, "loss": 1.5254, "step": 182 }, { "epoch": 0.02759972852726039, "grad_norm": 1.303177270729485, "learning_rate": 4.991546136280887e-07, "loss": 1.5905, "step": 183 }, { "epoch": 0.02775054671593394, "grad_norm": 0.31877478376641744, "learning_rate": 4.991453550585418e-07, "loss": 1.588, "step": 184 }, { "epoch": 0.027901364904607494, "grad_norm": 0.24838455111117674, "learning_rate": 4.99136046161835e-07, "loss": 1.5996, "step": 185 }, { "epoch": 0.02805218309328105, "grad_norm": 0.2848153559432735, "learning_rate": 4.991266869400582e-07, "loss": 1.5963, "step": 186 }, { "epoch": 0.028203001281954604, "grad_norm": 0.25828269152635797, "learning_rate": 4.991172773953129e-07, "loss": 1.5755, "step": 187 }, { "epoch": 0.028353819470628157, "grad_norm": 0.2653958489725172, "learning_rate": 4.991078175297117e-07, "loss": 1.6307, "step": 188 }, { "epoch": 0.028504637659301713, "grad_norm": 0.2690080760631909, "learning_rate": 4.990983073453789e-07, "loss": 1.6036, "step": 189 }, { "epoch": 0.028655455847975266, "grad_norm": 0.2618006127959753, "learning_rate": 4.990887468444496e-07, "loss": 1.615, "step": 190 }, { "epoch": 0.02880627403664882, "grad_norm": 0.268044151489942, "learning_rate": 4.990791360290704e-07, "loss": 1.5995, "step": 191 }, { "epoch": 0.028957092225322376, "grad_norm": 0.26809538061657656, "learning_rate": 4.990694749013993e-07, "loss": 1.6634, "step": 192 }, { "epoch": 0.02910791041399593, "grad_norm": 0.26956024396526895, "learning_rate": 4.990597634636054e-07, "loss": 1.6116, "step": 193 }, { "epoch": 0.02925872860266948, "grad_norm": 0.25746207958100037, "learning_rate": 4.990500017178693e-07, "loss": 1.6472, "step": 194 }, { "epoch": 0.029409546791343034, "grad_norm": 0.26012498294026976, "learning_rate": 4.990401896663827e-07, "loss": 1.6604, "step": 195 }, { "epoch": 0.02956036498001659, "grad_norm": 0.3002010469788411, "learning_rate": 4.990303273113489e-07, "loss": 1.6303, "step": 196 }, { "epoch": 0.029711183168690144, "grad_norm": 0.27514115715685794, "learning_rate": 4.99020414654982e-07, "loss": 1.6351, "step": 197 }, { "epoch": 0.029862001357363697, "grad_norm": 0.2503927456470816, "learning_rate": 4.990104516995079e-07, "loss": 1.5543, "step": 198 }, { "epoch": 0.030012819546037253, "grad_norm": 0.27626109103815416, "learning_rate": 4.990004384471633e-07, "loss": 1.6182, "step": 199 }, { "epoch": 0.030163637734710806, "grad_norm": 0.2781518165419307, "learning_rate": 4.989903749001969e-07, "loss": 1.6649, "step": 200 }, { "epoch": 0.03031445592338436, "grad_norm": 0.25231222205522835, "learning_rate": 4.989802610608679e-07, "loss": 1.5731, "step": 201 }, { "epoch": 0.030465274112057916, "grad_norm": 0.26378173793047377, "learning_rate": 4.989700969314472e-07, "loss": 1.5373, "step": 202 }, { "epoch": 0.03061609230073147, "grad_norm": 0.3340714072382629, "learning_rate": 4.989598825142171e-07, "loss": 1.5694, "step": 203 }, { "epoch": 0.03076691048940502, "grad_norm": 0.26463313883572454, "learning_rate": 4.989496178114709e-07, "loss": 1.6428, "step": 204 }, { "epoch": 0.030917728678078578, "grad_norm": 0.2832551149133356, "learning_rate": 4.989393028255134e-07, "loss": 1.5773, "step": 205 }, { "epoch": 0.03106854686675213, "grad_norm": 0.27831436524066777, "learning_rate": 4.989289375586605e-07, "loss": 1.6353, "step": 206 }, { "epoch": 0.031219365055425684, "grad_norm": 0.26392410421703627, "learning_rate": 4.989185220132397e-07, "loss": 1.6607, "step": 207 }, { "epoch": 0.03137018324409924, "grad_norm": 0.2593580146156735, "learning_rate": 4.989080561915895e-07, "loss": 1.6365, "step": 208 }, { "epoch": 0.03152100143277279, "grad_norm": 0.29962645492914997, "learning_rate": 4.988975400960596e-07, "loss": 1.6178, "step": 209 }, { "epoch": 0.03167181962144635, "grad_norm": 0.2761177620930967, "learning_rate": 4.988869737290115e-07, "loss": 1.587, "step": 210 }, { "epoch": 0.0318226378101199, "grad_norm": 0.27633099258498567, "learning_rate": 4.988763570928174e-07, "loss": 1.6454, "step": 211 }, { "epoch": 0.031973455998793456, "grad_norm": 0.2463702497362132, "learning_rate": 4.988656901898612e-07, "loss": 1.5788, "step": 212 }, { "epoch": 0.03212427418746701, "grad_norm": 0.25496481204896715, "learning_rate": 4.988549730225378e-07, "loss": 1.5861, "step": 213 }, { "epoch": 0.03227509237614056, "grad_norm": 0.2580606120283535, "learning_rate": 4.988442055932536e-07, "loss": 1.5762, "step": 214 }, { "epoch": 0.032425910564814114, "grad_norm": 0.2771695020338603, "learning_rate": 4.988333879044262e-07, "loss": 1.581, "step": 215 }, { "epoch": 0.03257672875348767, "grad_norm": 0.2869703933581396, "learning_rate": 4.988225199584844e-07, "loss": 1.6003, "step": 216 }, { "epoch": 0.03272754694216123, "grad_norm": 0.2564015242431221, "learning_rate": 4.988116017578685e-07, "loss": 1.6345, "step": 217 }, { "epoch": 0.03287836513083478, "grad_norm": 0.5442682369824418, "learning_rate": 4.988006333050298e-07, "loss": 1.6365, "step": 218 }, { "epoch": 0.03302918331950833, "grad_norm": 0.25371420603547495, "learning_rate": 4.987896146024312e-07, "loss": 1.5389, "step": 219 }, { "epoch": 0.033180001508181886, "grad_norm": 0.2589325925383411, "learning_rate": 4.987785456525468e-07, "loss": 1.6327, "step": 220 }, { "epoch": 0.03333081969685544, "grad_norm": 0.4127227412991189, "learning_rate": 4.987674264578614e-07, "loss": 1.6354, "step": 221 }, { "epoch": 0.03348163788552899, "grad_norm": 0.2616917271457671, "learning_rate": 4.987562570208722e-07, "loss": 1.5691, "step": 222 }, { "epoch": 0.03363245607420255, "grad_norm": 0.262500735261184, "learning_rate": 4.987450373440868e-07, "loss": 1.5843, "step": 223 }, { "epoch": 0.033783274262876105, "grad_norm": 2.907675552975996, "learning_rate": 4.987337674300242e-07, "loss": 1.6643, "step": 224 }, { "epoch": 0.03393409245154966, "grad_norm": 0.633109876707346, "learning_rate": 4.98722447281215e-07, "loss": 1.5513, "step": 225 }, { "epoch": 0.03408491064022321, "grad_norm": 0.31204975345051955, "learning_rate": 4.987110769002009e-07, "loss": 1.6166, "step": 226 }, { "epoch": 0.034235728828896764, "grad_norm": 0.2694667377758279, "learning_rate": 4.986996562895348e-07, "loss": 1.5886, "step": 227 }, { "epoch": 0.03438654701757032, "grad_norm": 0.3105182287125855, "learning_rate": 4.986881854517811e-07, "loss": 1.5918, "step": 228 }, { "epoch": 0.03453736520624387, "grad_norm": 0.25509511385760725, "learning_rate": 4.986766643895152e-07, "loss": 1.5409, "step": 229 }, { "epoch": 0.03468818339491743, "grad_norm": 6.222098257658109, "learning_rate": 4.98665093105324e-07, "loss": 1.6615, "step": 230 }, { "epoch": 0.03483900158359098, "grad_norm": 0.4035810998796959, "learning_rate": 4.986534716018054e-07, "loss": 1.591, "step": 231 }, { "epoch": 0.034989819772264535, "grad_norm": 0.25480384926234323, "learning_rate": 4.986417998815691e-07, "loss": 1.4973, "step": 232 }, { "epoch": 0.03514063796093809, "grad_norm": 0.26156856896321407, "learning_rate": 4.986300779472355e-07, "loss": 1.6332, "step": 233 }, { "epoch": 0.03529145614961164, "grad_norm": 0.2600770194351854, "learning_rate": 4.986183058014366e-07, "loss": 1.6398, "step": 234 }, { "epoch": 0.035442274338285194, "grad_norm": 0.25367766845188094, "learning_rate": 4.986064834468155e-07, "loss": 1.6245, "step": 235 }, { "epoch": 0.035593092526958754, "grad_norm": 0.26431562176833495, "learning_rate": 4.985946108860268e-07, "loss": 1.58, "step": 236 }, { "epoch": 0.03574391071563231, "grad_norm": 0.31760663960749624, "learning_rate": 4.985826881217361e-07, "loss": 1.572, "step": 237 }, { "epoch": 0.03589472890430586, "grad_norm": 0.24981232976175427, "learning_rate": 4.985707151566205e-07, "loss": 1.5037, "step": 238 }, { "epoch": 0.03604554709297941, "grad_norm": 0.2998208500980713, "learning_rate": 4.985586919933681e-07, "loss": 1.557, "step": 239 }, { "epoch": 0.036196365281652966, "grad_norm": 0.26446444256321233, "learning_rate": 4.985466186346787e-07, "loss": 1.5727, "step": 240 }, { "epoch": 0.03634718347032652, "grad_norm": 0.24675850916193817, "learning_rate": 4.985344950832631e-07, "loss": 1.5936, "step": 241 }, { "epoch": 0.03649800165900008, "grad_norm": 0.2766463983732949, "learning_rate": 4.985223213418433e-07, "loss": 1.6282, "step": 242 }, { "epoch": 0.03664881984767363, "grad_norm": 0.2557536990136019, "learning_rate": 4.985100974131526e-07, "loss": 1.6208, "step": 243 }, { "epoch": 0.036799638036347185, "grad_norm": 0.2599746525688643, "learning_rate": 4.984978232999357e-07, "loss": 1.6953, "step": 244 }, { "epoch": 0.03695045622502074, "grad_norm": 0.3175602608980726, "learning_rate": 4.984854990049486e-07, "loss": 1.5153, "step": 245 }, { "epoch": 0.03710127441369429, "grad_norm": 0.2677265443419408, "learning_rate": 4.984731245309581e-07, "loss": 1.5371, "step": 246 }, { "epoch": 0.037252092602367844, "grad_norm": 0.2641204549342982, "learning_rate": 4.984606998807431e-07, "loss": 1.6204, "step": 247 }, { "epoch": 0.0374029107910414, "grad_norm": 0.24941313609436178, "learning_rate": 4.984482250570931e-07, "loss": 1.6269, "step": 248 }, { "epoch": 0.037553728979714956, "grad_norm": 0.2638709649184524, "learning_rate": 4.98435700062809e-07, "loss": 1.6158, "step": 249 }, { "epoch": 0.03770454716838851, "grad_norm": 0.2656121521078653, "learning_rate": 4.984231249007031e-07, "loss": 1.611, "step": 250 }, { "epoch": 0.03785536535706206, "grad_norm": 0.28853552223521767, "learning_rate": 4.984104995735987e-07, "loss": 1.6577, "step": 251 }, { "epoch": 0.038006183545735615, "grad_norm": 0.2517118886233882, "learning_rate": 4.983978240843308e-07, "loss": 1.5289, "step": 252 }, { "epoch": 0.03815700173440917, "grad_norm": 3.1305018983774993, "learning_rate": 4.983850984357453e-07, "loss": 1.5685, "step": 253 }, { "epoch": 0.03830781992308272, "grad_norm": 0.2549610619095195, "learning_rate": 4.983723226306996e-07, "loss": 1.5981, "step": 254 }, { "epoch": 0.03845863811175628, "grad_norm": 0.26554576080157216, "learning_rate": 4.983594966720621e-07, "loss": 1.6547, "step": 255 }, { "epoch": 0.038609456300429834, "grad_norm": 0.3106860712826546, "learning_rate": 4.983466205627127e-07, "loss": 1.6248, "step": 256 }, { "epoch": 0.03876027448910339, "grad_norm": 0.26254029852902167, "learning_rate": 4.983336943055424e-07, "loss": 1.5561, "step": 257 }, { "epoch": 0.03891109267777694, "grad_norm": 0.27726026922899555, "learning_rate": 4.983207179034534e-07, "loss": 1.6218, "step": 258 }, { "epoch": 0.03906191086645049, "grad_norm": 0.29320880460173404, "learning_rate": 4.983076913593596e-07, "loss": 1.6269, "step": 259 }, { "epoch": 0.039212729055124046, "grad_norm": 0.29577469982501586, "learning_rate": 4.982946146761856e-07, "loss": 1.5869, "step": 260 }, { "epoch": 0.0393635472437976, "grad_norm": 0.2559213435574554, "learning_rate": 4.982814878568675e-07, "loss": 1.5463, "step": 261 }, { "epoch": 0.03951436543247116, "grad_norm": 0.25707483616437765, "learning_rate": 4.982683109043526e-07, "loss": 1.6543, "step": 262 }, { "epoch": 0.03966518362114471, "grad_norm": 0.41074618361188825, "learning_rate": 4.982550838215998e-07, "loss": 1.6011, "step": 263 }, { "epoch": 0.039816001809818265, "grad_norm": 0.25765331574805506, "learning_rate": 4.982418066115786e-07, "loss": 1.5752, "step": 264 }, { "epoch": 0.03996681999849182, "grad_norm": 0.256754926319508, "learning_rate": 4.982284792772704e-07, "loss": 1.5779, "step": 265 }, { "epoch": 0.04011763818716537, "grad_norm": 0.252978048670289, "learning_rate": 4.982151018216676e-07, "loss": 1.5641, "step": 266 }, { "epoch": 0.040268456375838924, "grad_norm": 0.2615264897773922, "learning_rate": 4.982016742477735e-07, "loss": 1.615, "step": 267 }, { "epoch": 0.04041927456451248, "grad_norm": 0.2470323509147562, "learning_rate": 4.981881965586032e-07, "loss": 1.6021, "step": 268 }, { "epoch": 0.040570092753186036, "grad_norm": 0.27623642277328214, "learning_rate": 4.981746687571828e-07, "loss": 1.5625, "step": 269 }, { "epoch": 0.04072091094185959, "grad_norm": 0.2681915393759785, "learning_rate": 4.981610908465498e-07, "loss": 1.6154, "step": 270 }, { "epoch": 0.04087172913053314, "grad_norm": 0.2545576182077647, "learning_rate": 4.981474628297527e-07, "loss": 1.5684, "step": 271 }, { "epoch": 0.041022547319206695, "grad_norm": 0.2751802445237094, "learning_rate": 4.981337847098515e-07, "loss": 1.598, "step": 272 }, { "epoch": 0.04117336550788025, "grad_norm": 0.3117722758748653, "learning_rate": 4.981200564899171e-07, "loss": 1.5139, "step": 273 }, { "epoch": 0.0413241836965538, "grad_norm": 0.6489078607208647, "learning_rate": 4.981062781730322e-07, "loss": 1.651, "step": 274 }, { "epoch": 0.04147500188522736, "grad_norm": 0.25981217410577917, "learning_rate": 4.980924497622901e-07, "loss": 1.5972, "step": 275 }, { "epoch": 0.041625820073900914, "grad_norm": 0.3774374807557159, "learning_rate": 4.98078571260796e-07, "loss": 1.6089, "step": 276 }, { "epoch": 0.04177663826257447, "grad_norm": 0.2563361962896586, "learning_rate": 4.980646426716658e-07, "loss": 1.6366, "step": 277 }, { "epoch": 0.04192745645124802, "grad_norm": 0.24794636056007546, "learning_rate": 4.98050663998027e-07, "loss": 1.541, "step": 278 }, { "epoch": 0.04207827463992157, "grad_norm": 0.2629941923704229, "learning_rate": 4.980366352430181e-07, "loss": 1.6291, "step": 279 }, { "epoch": 0.042229092828595126, "grad_norm": 0.2552870222410783, "learning_rate": 4.980225564097892e-07, "loss": 1.6124, "step": 280 }, { "epoch": 0.042379911017268686, "grad_norm": 0.3106822885855718, "learning_rate": 4.980084275015011e-07, "loss": 1.5888, "step": 281 }, { "epoch": 0.04253072920594224, "grad_norm": 0.25148761118484103, "learning_rate": 4.979942485213265e-07, "loss": 1.5989, "step": 282 }, { "epoch": 0.04268154739461579, "grad_norm": 0.25753173598937684, "learning_rate": 4.979800194724486e-07, "loss": 1.5735, "step": 283 }, { "epoch": 0.042832365583289345, "grad_norm": 2.4177527896744375, "learning_rate": 4.979657403580625e-07, "loss": 1.7179, "step": 284 }, { "epoch": 0.0429831837719629, "grad_norm": 0.2997357031955897, "learning_rate": 4.979514111813742e-07, "loss": 1.5998, "step": 285 }, { "epoch": 0.04313400196063645, "grad_norm": 0.25092531904846166, "learning_rate": 4.97937031945601e-07, "loss": 1.6163, "step": 286 }, { "epoch": 0.043284820149310003, "grad_norm": 0.2695635528659169, "learning_rate": 4.979226026539715e-07, "loss": 1.6604, "step": 287 }, { "epoch": 0.04343563833798356, "grad_norm": 0.2765857609818474, "learning_rate": 4.979081233097256e-07, "loss": 1.5919, "step": 288 }, { "epoch": 0.043586456526657116, "grad_norm": 0.32188937842884396, "learning_rate": 4.978935939161141e-07, "loss": 1.5846, "step": 289 }, { "epoch": 0.04373727471533067, "grad_norm": 0.2416812364013644, "learning_rate": 4.978790144763993e-07, "loss": 1.5054, "step": 290 }, { "epoch": 0.04388809290400422, "grad_norm": 0.26113644000145003, "learning_rate": 4.97864384993855e-07, "loss": 1.5911, "step": 291 }, { "epoch": 0.044038911092677775, "grad_norm": 0.2572965207604668, "learning_rate": 4.978497054717656e-07, "loss": 1.6129, "step": 292 }, { "epoch": 0.04418972928135133, "grad_norm": 2.8605673966482494, "learning_rate": 4.978349759134273e-07, "loss": 1.6077, "step": 293 }, { "epoch": 0.04434054747002489, "grad_norm": 0.27859193377435737, "learning_rate": 4.978201963221472e-07, "loss": 1.6179, "step": 294 }, { "epoch": 0.04449136565869844, "grad_norm": 0.26907059310024917, "learning_rate": 4.978053667012438e-07, "loss": 1.5576, "step": 295 }, { "epoch": 0.044642183847371994, "grad_norm": 0.29052532677945747, "learning_rate": 4.977904870540467e-07, "loss": 1.649, "step": 296 }, { "epoch": 0.04479300203604555, "grad_norm": 0.2472154773342736, "learning_rate": 4.977755573838969e-07, "loss": 1.5837, "step": 297 }, { "epoch": 0.0449438202247191, "grad_norm": 0.2559592266553768, "learning_rate": 4.977605776941466e-07, "loss": 1.5991, "step": 298 }, { "epoch": 0.04509463841339265, "grad_norm": 0.2580734058119581, "learning_rate": 4.977455479881591e-07, "loss": 1.6362, "step": 299 }, { "epoch": 0.045245456602066206, "grad_norm": 0.2509784344630137, "learning_rate": 4.977304682693089e-07, "loss": 1.5833, "step": 300 }, { "epoch": 0.045396274790739766, "grad_norm": 0.25643481159429315, "learning_rate": 4.97715338540982e-07, "loss": 1.602, "step": 301 }, { "epoch": 0.04554709297941332, "grad_norm": 0.25606930376257603, "learning_rate": 4.977001588065754e-07, "loss": 1.6303, "step": 302 }, { "epoch": 0.04569791116808687, "grad_norm": 0.25070608064964595, "learning_rate": 4.976849290694974e-07, "loss": 1.5829, "step": 303 }, { "epoch": 0.045848729356760425, "grad_norm": 0.25380401662714464, "learning_rate": 4.976696493331676e-07, "loss": 1.6079, "step": 304 }, { "epoch": 0.04599954754543398, "grad_norm": 0.3408665041576288, "learning_rate": 4.976543196010167e-07, "loss": 1.624, "step": 305 }, { "epoch": 0.04615036573410753, "grad_norm": 0.2659038642666762, "learning_rate": 4.976389398764865e-07, "loss": 1.5253, "step": 306 }, { "epoch": 0.04630118392278109, "grad_norm": 0.25436011693157684, "learning_rate": 4.976235101630303e-07, "loss": 1.5478, "step": 307 }, { "epoch": 0.04645200211145464, "grad_norm": 0.24834201891089555, "learning_rate": 4.976080304641127e-07, "loss": 1.6087, "step": 308 }, { "epoch": 0.046602820300128196, "grad_norm": 0.3203590066782244, "learning_rate": 4.975925007832091e-07, "loss": 1.5459, "step": 309 }, { "epoch": 0.04675363848880175, "grad_norm": 0.26593109134916937, "learning_rate": 4.975769211238065e-07, "loss": 1.6162, "step": 310 }, { "epoch": 0.0469044566774753, "grad_norm": 0.25633161565087326, "learning_rate": 4.97561291489403e-07, "loss": 1.5698, "step": 311 }, { "epoch": 0.047055274866148855, "grad_norm": 0.24959966756569993, "learning_rate": 4.975456118835078e-07, "loss": 1.6237, "step": 312 }, { "epoch": 0.047206093054822415, "grad_norm": 0.25286897448991674, "learning_rate": 4.975298823096416e-07, "loss": 1.6881, "step": 313 }, { "epoch": 0.04735691124349597, "grad_norm": 0.26036485809927373, "learning_rate": 4.975141027713359e-07, "loss": 1.6125, "step": 314 }, { "epoch": 0.04750772943216952, "grad_norm": 0.25618533844773894, "learning_rate": 4.974982732721338e-07, "loss": 1.55, "step": 315 }, { "epoch": 0.047658547620843074, "grad_norm": 0.33343355928017965, "learning_rate": 4.974823938155895e-07, "loss": 1.5757, "step": 316 }, { "epoch": 0.04780936580951663, "grad_norm": 0.27528927934531816, "learning_rate": 4.974664644052684e-07, "loss": 1.5858, "step": 317 }, { "epoch": 0.04796018399819018, "grad_norm": 0.25562091394247194, "learning_rate": 4.974504850447471e-07, "loss": 1.5687, "step": 318 }, { "epoch": 0.04811100218686373, "grad_norm": 0.24416275184111966, "learning_rate": 4.974344557376135e-07, "loss": 1.5786, "step": 319 }, { "epoch": 0.04826182037553729, "grad_norm": 0.24263644716720503, "learning_rate": 4.974183764874665e-07, "loss": 1.4961, "step": 320 }, { "epoch": 0.048412638564210846, "grad_norm": 0.3598724778136554, "learning_rate": 4.974022472979165e-07, "loss": 1.5871, "step": 321 }, { "epoch": 0.0485634567528844, "grad_norm": 0.25739161091804974, "learning_rate": 4.973860681725848e-07, "loss": 1.6068, "step": 322 }, { "epoch": 0.04871427494155795, "grad_norm": 0.258074132220957, "learning_rate": 4.973698391151043e-07, "loss": 1.5469, "step": 323 }, { "epoch": 0.048865093130231504, "grad_norm": 0.2486463490307011, "learning_rate": 4.973535601291188e-07, "loss": 1.5599, "step": 324 }, { "epoch": 0.04901591131890506, "grad_norm": 0.2614816055199508, "learning_rate": 4.973372312182834e-07, "loss": 1.5778, "step": 325 }, { "epoch": 0.04916672950757862, "grad_norm": 0.25989296790690797, "learning_rate": 4.973208523862643e-07, "loss": 1.6123, "step": 326 }, { "epoch": 0.04931754769625217, "grad_norm": 0.25399098043192375, "learning_rate": 4.973044236367393e-07, "loss": 1.5476, "step": 327 }, { "epoch": 0.04946836588492572, "grad_norm": 0.28911197979590025, "learning_rate": 4.972879449733968e-07, "loss": 1.5964, "step": 328 }, { "epoch": 0.049619184073599276, "grad_norm": 0.27658018770420756, "learning_rate": 4.97271416399937e-07, "loss": 1.5659, "step": 329 }, { "epoch": 0.04977000226227283, "grad_norm": 0.2557904118785333, "learning_rate": 4.97254837920071e-07, "loss": 1.5808, "step": 330 }, { "epoch": 0.04992082045094638, "grad_norm": 0.25726258372686667, "learning_rate": 4.972382095375211e-07, "loss": 1.5627, "step": 331 }, { "epoch": 0.050071638639619935, "grad_norm": 0.26655152839938784, "learning_rate": 4.972215312560208e-07, "loss": 1.5887, "step": 332 }, { "epoch": 0.050222456828293495, "grad_norm": 0.23932086734756317, "learning_rate": 4.97204803079315e-07, "loss": 1.5403, "step": 333 }, { "epoch": 0.05037327501696705, "grad_norm": 0.26431994526165736, "learning_rate": 4.971880250111596e-07, "loss": 1.5744, "step": 334 }, { "epoch": 0.0505240932056406, "grad_norm": 1.5201014117100498, "learning_rate": 4.971711970553216e-07, "loss": 1.6221, "step": 335 }, { "epoch": 0.050674911394314154, "grad_norm": 0.26096064703562155, "learning_rate": 4.971543192155797e-07, "loss": 1.5479, "step": 336 }, { "epoch": 0.05082572958298771, "grad_norm": 0.25940489482173235, "learning_rate": 4.971373914957233e-07, "loss": 1.5641, "step": 337 }, { "epoch": 0.05097654777166126, "grad_norm": 0.30473473344604063, "learning_rate": 4.97120413899553e-07, "loss": 1.6051, "step": 338 }, { "epoch": 0.05112736596033482, "grad_norm": 0.24196960703987094, "learning_rate": 4.97103386430881e-07, "loss": 1.5729, "step": 339 }, { "epoch": 0.05127818414900837, "grad_norm": 0.24610108728619232, "learning_rate": 4.970863090935303e-07, "loss": 1.5358, "step": 340 }, { "epoch": 0.051429002337681926, "grad_norm": 0.2676476083686511, "learning_rate": 4.970691818913354e-07, "loss": 1.5793, "step": 341 }, { "epoch": 0.05157982052635548, "grad_norm": 0.27860897150629826, "learning_rate": 4.970520048281418e-07, "loss": 1.5466, "step": 342 }, { "epoch": 0.05173063871502903, "grad_norm": 0.2623220090940916, "learning_rate": 4.970347779078063e-07, "loss": 1.6198, "step": 343 }, { "epoch": 0.051881456903702584, "grad_norm": 0.25308668267137435, "learning_rate": 4.970175011341968e-07, "loss": 1.5745, "step": 344 }, { "epoch": 0.05203227509237614, "grad_norm": 0.24984219060622315, "learning_rate": 4.970001745111924e-07, "loss": 1.6151, "step": 345 }, { "epoch": 0.0521830932810497, "grad_norm": 0.24582469750896027, "learning_rate": 4.969827980426834e-07, "loss": 1.5943, "step": 346 }, { "epoch": 0.05233391146972325, "grad_norm": 0.7596166627025406, "learning_rate": 4.969653717325715e-07, "loss": 1.5662, "step": 347 }, { "epoch": 0.0524847296583968, "grad_norm": 0.2684913803684381, "learning_rate": 4.969478955847692e-07, "loss": 1.6064, "step": 348 }, { "epoch": 0.052635547847070356, "grad_norm": 0.26403543896139, "learning_rate": 4.969303696032005e-07, "loss": 1.5711, "step": 349 }, { "epoch": 0.05278636603574391, "grad_norm": 0.27319367968006386, "learning_rate": 4.969127937918006e-07, "loss": 1.6161, "step": 350 }, { "epoch": 0.05293718422441746, "grad_norm": 0.25929594252221, "learning_rate": 4.968951681545155e-07, "loss": 1.615, "step": 351 }, { "epoch": 0.05308800241309102, "grad_norm": 0.2566722943610633, "learning_rate": 4.96877492695303e-07, "loss": 1.5854, "step": 352 }, { "epoch": 0.053238820601764575, "grad_norm": 0.24811229538006918, "learning_rate": 4.968597674181316e-07, "loss": 1.6046, "step": 353 }, { "epoch": 0.05338963879043813, "grad_norm": 0.2856100330991902, "learning_rate": 4.96841992326981e-07, "loss": 1.6457, "step": 354 }, { "epoch": 0.05354045697911168, "grad_norm": 0.2748920596999085, "learning_rate": 4.968241674258426e-07, "loss": 1.5703, "step": 355 }, { "epoch": 0.053691275167785234, "grad_norm": 0.2622971771021598, "learning_rate": 4.968062927187183e-07, "loss": 1.5573, "step": 356 }, { "epoch": 0.05384209335645879, "grad_norm": 0.2560163824303438, "learning_rate": 4.967883682096214e-07, "loss": 1.5971, "step": 357 }, { "epoch": 0.05399291154513234, "grad_norm": 0.27367198030490836, "learning_rate": 4.967703939025768e-07, "loss": 1.5848, "step": 358 }, { "epoch": 0.0541437297338059, "grad_norm": 0.2664287206155345, "learning_rate": 4.967523698016201e-07, "loss": 1.5142, "step": 359 }, { "epoch": 0.05429454792247945, "grad_norm": 0.2812645453883054, "learning_rate": 4.967342959107982e-07, "loss": 1.6473, "step": 360 }, { "epoch": 0.054445366111153005, "grad_norm": 0.2639456931300287, "learning_rate": 4.967161722341692e-07, "loss": 1.6125, "step": 361 }, { "epoch": 0.05459618429982656, "grad_norm": 0.25849725873657714, "learning_rate": 4.966979987758025e-07, "loss": 1.5649, "step": 362 }, { "epoch": 0.05474700248850011, "grad_norm": 0.2391469770882484, "learning_rate": 4.966797755397785e-07, "loss": 1.5466, "step": 363 }, { "epoch": 0.054897820677173664, "grad_norm": 0.24853257602934095, "learning_rate": 4.96661502530189e-07, "loss": 1.5335, "step": 364 }, { "epoch": 0.055048638865847224, "grad_norm": 0.27168670853914295, "learning_rate": 4.966431797511366e-07, "loss": 1.5268, "step": 365 }, { "epoch": 0.05519945705452078, "grad_norm": 0.265670244111924, "learning_rate": 4.966248072067353e-07, "loss": 1.6445, "step": 366 }, { "epoch": 0.05535027524319433, "grad_norm": 0.2608115163988439, "learning_rate": 4.966063849011103e-07, "loss": 1.5765, "step": 367 }, { "epoch": 0.05550109343186788, "grad_norm": 0.2701215726834107, "learning_rate": 4.965879128383982e-07, "loss": 1.5365, "step": 368 }, { "epoch": 0.055651911620541436, "grad_norm": 0.25470002940222275, "learning_rate": 4.965693910227462e-07, "loss": 1.579, "step": 369 }, { "epoch": 0.05580272980921499, "grad_norm": 0.2548711844491008, "learning_rate": 4.965508194583132e-07, "loss": 1.6139, "step": 370 }, { "epoch": 0.05595354799788854, "grad_norm": 0.299974020575167, "learning_rate": 4.965321981492687e-07, "loss": 1.549, "step": 371 }, { "epoch": 0.0561043661865621, "grad_norm": 0.278607732945803, "learning_rate": 4.965135270997942e-07, "loss": 1.5023, "step": 372 }, { "epoch": 0.056255184375235655, "grad_norm": 0.25504464963999574, "learning_rate": 4.964948063140817e-07, "loss": 1.6278, "step": 373 }, { "epoch": 0.05640600256390921, "grad_norm": 0.26546345346554, "learning_rate": 4.964760357963345e-07, "loss": 1.5378, "step": 374 }, { "epoch": 0.05655682075258276, "grad_norm": 0.27984646045688566, "learning_rate": 4.964572155507671e-07, "loss": 1.6742, "step": 375 }, { "epoch": 0.056707638941256314, "grad_norm": 0.23650139431217956, "learning_rate": 4.964383455816053e-07, "loss": 1.6343, "step": 376 }, { "epoch": 0.05685845712992987, "grad_norm": 0.278604940494406, "learning_rate": 4.96419425893086e-07, "loss": 1.6187, "step": 377 }, { "epoch": 0.057009275318603427, "grad_norm": 0.256361842335032, "learning_rate": 4.96400456489457e-07, "loss": 1.5913, "step": 378 }, { "epoch": 0.05716009350727698, "grad_norm": 0.2545279810570703, "learning_rate": 4.963814373749777e-07, "loss": 1.6059, "step": 379 }, { "epoch": 0.05731091169595053, "grad_norm": 0.27221070074121395, "learning_rate": 4.963623685539184e-07, "loss": 1.59, "step": 380 }, { "epoch": 0.057461729884624085, "grad_norm": 0.26467844690156933, "learning_rate": 4.963432500305605e-07, "loss": 1.5966, "step": 381 }, { "epoch": 0.05761254807329764, "grad_norm": 0.24614201302924177, "learning_rate": 4.963240818091969e-07, "loss": 1.537, "step": 382 }, { "epoch": 0.05776336626197119, "grad_norm": 0.24763193545120962, "learning_rate": 4.96304863894131e-07, "loss": 1.5637, "step": 383 }, { "epoch": 0.05791418445064475, "grad_norm": 0.2666288900937848, "learning_rate": 4.962855962896782e-07, "loss": 1.6786, "step": 384 }, { "epoch": 0.058065002639318304, "grad_norm": 0.2705435333853324, "learning_rate": 4.962662790001645e-07, "loss": 1.5665, "step": 385 }, { "epoch": 0.05821582082799186, "grad_norm": 0.2607417613515552, "learning_rate": 4.962469120299272e-07, "loss": 1.6145, "step": 386 }, { "epoch": 0.05836663901666541, "grad_norm": 0.29842159732772466, "learning_rate": 4.962274953833147e-07, "loss": 1.5656, "step": 387 }, { "epoch": 0.05851745720533896, "grad_norm": 0.25621058334891145, "learning_rate": 4.962080290646865e-07, "loss": 1.5899, "step": 388 }, { "epoch": 0.058668275394012516, "grad_norm": 0.30606055120493514, "learning_rate": 4.961885130784137e-07, "loss": 1.5625, "step": 389 }, { "epoch": 0.05881909358268607, "grad_norm": 0.2524525980936105, "learning_rate": 4.961689474288779e-07, "loss": 1.6383, "step": 390 }, { "epoch": 0.05896991177135963, "grad_norm": 0.37840292284840893, "learning_rate": 4.961493321204722e-07, "loss": 1.5864, "step": 391 }, { "epoch": 0.05912072996003318, "grad_norm": 0.26299467187040465, "learning_rate": 4.961296671576009e-07, "loss": 1.6757, "step": 392 }, { "epoch": 0.059271548148706735, "grad_norm": 0.25575918441640594, "learning_rate": 4.961099525446793e-07, "loss": 1.6462, "step": 393 }, { "epoch": 0.05942236633738029, "grad_norm": 0.2536309275592947, "learning_rate": 4.96090188286134e-07, "loss": 1.4705, "step": 394 }, { "epoch": 0.05957318452605384, "grad_norm": 0.2467647941221563, "learning_rate": 4.960703743864025e-07, "loss": 1.5756, "step": 395 }, { "epoch": 0.059724002714727394, "grad_norm": 0.2542755135779441, "learning_rate": 4.960505108499337e-07, "loss": 1.5743, "step": 396 }, { "epoch": 0.059874820903400953, "grad_norm": 0.2454253182745587, "learning_rate": 4.960305976811874e-07, "loss": 1.6408, "step": 397 }, { "epoch": 0.060025639092074506, "grad_norm": 0.2878230831825292, "learning_rate": 4.960106348846349e-07, "loss": 1.5901, "step": 398 }, { "epoch": 0.06017645728074806, "grad_norm": 0.2396955029775844, "learning_rate": 4.959906224647584e-07, "loss": 1.6232, "step": 399 }, { "epoch": 0.06032727546942161, "grad_norm": 0.254504424632013, "learning_rate": 4.95970560426051e-07, "loss": 1.6626, "step": 400 }, { "epoch": 0.060478093658095165, "grad_norm": 0.24242887605105484, "learning_rate": 4.959504487730175e-07, "loss": 1.5979, "step": 401 }, { "epoch": 0.06062891184676872, "grad_norm": 0.2991997387118549, "learning_rate": 4.959302875101736e-07, "loss": 1.5772, "step": 402 }, { "epoch": 0.06077973003544227, "grad_norm": 0.25635844722182943, "learning_rate": 4.959100766420458e-07, "loss": 1.5976, "step": 403 }, { "epoch": 0.06093054822411583, "grad_norm": 0.25378689447575264, "learning_rate": 4.958898161731722e-07, "loss": 1.6138, "step": 404 }, { "epoch": 0.061081366412789384, "grad_norm": 0.2623760053921337, "learning_rate": 4.958695061081017e-07, "loss": 1.58, "step": 405 }, { "epoch": 0.06123218460146294, "grad_norm": 0.2454756265927881, "learning_rate": 4.958491464513949e-07, "loss": 1.5954, "step": 406 }, { "epoch": 0.06138300279013649, "grad_norm": 0.25948334826498115, "learning_rate": 4.958287372076228e-07, "loss": 1.663, "step": 407 }, { "epoch": 0.06153382097881004, "grad_norm": 0.25222923306553524, "learning_rate": 4.95808278381368e-07, "loss": 1.6045, "step": 408 }, { "epoch": 0.061684639167483596, "grad_norm": 0.25613180784161454, "learning_rate": 4.95787769977224e-07, "loss": 1.5402, "step": 409 }, { "epoch": 0.061835457356157156, "grad_norm": 0.2421104210544763, "learning_rate": 4.957672119997956e-07, "loss": 1.5712, "step": 410 }, { "epoch": 0.06198627554483071, "grad_norm": 0.29376753209022727, "learning_rate": 4.957466044536987e-07, "loss": 1.6257, "step": 411 }, { "epoch": 0.06213709373350426, "grad_norm": 0.24950584536301929, "learning_rate": 4.957259473435601e-07, "loss": 1.5806, "step": 412 }, { "epoch": 0.062287911922177815, "grad_norm": 0.29461741859092144, "learning_rate": 4.957052406740182e-07, "loss": 1.5995, "step": 413 }, { "epoch": 0.06243873011085137, "grad_norm": 0.2575517622877305, "learning_rate": 4.956844844497221e-07, "loss": 1.5807, "step": 414 }, { "epoch": 0.06258954829952493, "grad_norm": 0.2657087176303521, "learning_rate": 4.956636786753322e-07, "loss": 1.6092, "step": 415 }, { "epoch": 0.06274036648819847, "grad_norm": 0.25543517571389057, "learning_rate": 4.9564282335552e-07, "loss": 1.6333, "step": 416 }, { "epoch": 0.06289118467687203, "grad_norm": 0.2825468297311831, "learning_rate": 4.95621918494968e-07, "loss": 1.5348, "step": 417 }, { "epoch": 0.06304200286554558, "grad_norm": 0.44887284182032045, "learning_rate": 4.956009640983702e-07, "loss": 1.5886, "step": 418 }, { "epoch": 0.06319282105421914, "grad_norm": 0.2663367815663453, "learning_rate": 4.955799601704315e-07, "loss": 1.6115, "step": 419 }, { "epoch": 0.0633436392428927, "grad_norm": 0.5364400057464868, "learning_rate": 4.955589067158675e-07, "loss": 1.558, "step": 420 }, { "epoch": 0.06349445743156625, "grad_norm": 0.28752305934187716, "learning_rate": 4.955378037394056e-07, "loss": 1.5967, "step": 421 }, { "epoch": 0.0636452756202398, "grad_norm": 0.25938858933203635, "learning_rate": 4.955166512457841e-07, "loss": 1.5468, "step": 422 }, { "epoch": 0.06379609380891335, "grad_norm": 0.309542792113621, "learning_rate": 4.954954492397521e-07, "loss": 1.5073, "step": 423 }, { "epoch": 0.06394691199758691, "grad_norm": 0.2576084447428103, "learning_rate": 4.954741977260703e-07, "loss": 1.5529, "step": 424 }, { "epoch": 0.06409773018626046, "grad_norm": 0.2529029416333881, "learning_rate": 4.954528967095101e-07, "loss": 1.5312, "step": 425 }, { "epoch": 0.06424854837493402, "grad_norm": 0.25953587371630393, "learning_rate": 4.954315461948545e-07, "loss": 1.6635, "step": 426 }, { "epoch": 0.06439936656360758, "grad_norm": 0.26141101300702185, "learning_rate": 4.95410146186897e-07, "loss": 1.6324, "step": 427 }, { "epoch": 0.06455018475228112, "grad_norm": 0.2703321534764104, "learning_rate": 4.953886966904425e-07, "loss": 1.5481, "step": 428 }, { "epoch": 0.06470100294095468, "grad_norm": 0.2584943248632401, "learning_rate": 4.953671977103073e-07, "loss": 1.6144, "step": 429 }, { "epoch": 0.06485182112962823, "grad_norm": 1.222515281227258, "learning_rate": 4.953456492513184e-07, "loss": 1.601, "step": 430 }, { "epoch": 0.06500263931830179, "grad_norm": 0.2658086216081372, "learning_rate": 4.95324051318314e-07, "loss": 1.6449, "step": 431 }, { "epoch": 0.06515345750697533, "grad_norm": 0.38610334921098405, "learning_rate": 4.953024039161436e-07, "loss": 1.6104, "step": 432 }, { "epoch": 0.0653042756956489, "grad_norm": 0.2825054807195639, "learning_rate": 4.952807070496675e-07, "loss": 1.7244, "step": 433 }, { "epoch": 0.06545509388432245, "grad_norm": 0.30899364121719064, "learning_rate": 4.952589607237575e-07, "loss": 1.496, "step": 434 }, { "epoch": 0.065605912072996, "grad_norm": 0.2723169206900093, "learning_rate": 4.952371649432962e-07, "loss": 1.5453, "step": 435 }, { "epoch": 0.06575673026166956, "grad_norm": 7.523062534338598, "learning_rate": 4.952153197131773e-07, "loss": 1.5674, "step": 436 }, { "epoch": 0.0659075484503431, "grad_norm": 0.48768300544830984, "learning_rate": 4.951934250383057e-07, "loss": 1.5901, "step": 437 }, { "epoch": 0.06605836663901667, "grad_norm": 0.2599806063338143, "learning_rate": 4.951714809235974e-07, "loss": 1.5693, "step": 438 }, { "epoch": 0.06620918482769023, "grad_norm": 0.380773588575501, "learning_rate": 4.951494873739796e-07, "loss": 1.5482, "step": 439 }, { "epoch": 0.06636000301636377, "grad_norm": 0.24912068069172005, "learning_rate": 4.951274443943905e-07, "loss": 1.5877, "step": 440 }, { "epoch": 0.06651082120503733, "grad_norm": 2.064434986001262, "learning_rate": 4.951053519897792e-07, "loss": 1.6384, "step": 441 }, { "epoch": 0.06666163939371088, "grad_norm": 0.2536004094108816, "learning_rate": 4.950832101651062e-07, "loss": 1.5778, "step": 442 }, { "epoch": 0.06681245758238444, "grad_norm": 0.24866657130017011, "learning_rate": 4.950610189253431e-07, "loss": 1.5818, "step": 443 }, { "epoch": 0.06696327577105798, "grad_norm": 0.2621890595250137, "learning_rate": 4.950387782754722e-07, "loss": 1.6346, "step": 444 }, { "epoch": 0.06711409395973154, "grad_norm": 0.2510737223460099, "learning_rate": 4.950164882204875e-07, "loss": 1.5448, "step": 445 }, { "epoch": 0.0672649121484051, "grad_norm": 0.2476520678315182, "learning_rate": 4.949941487653936e-07, "loss": 1.6327, "step": 446 }, { "epoch": 0.06741573033707865, "grad_norm": 0.26233079006484494, "learning_rate": 4.949717599152063e-07, "loss": 1.6294, "step": 447 }, { "epoch": 0.06756654852575221, "grad_norm": 0.24716726351344304, "learning_rate": 4.949493216749525e-07, "loss": 1.5614, "step": 448 }, { "epoch": 0.06771736671442576, "grad_norm": 0.26748648356688093, "learning_rate": 4.949268340496706e-07, "loss": 1.5706, "step": 449 }, { "epoch": 0.06786818490309932, "grad_norm": 0.251706135191505, "learning_rate": 4.949042970444092e-07, "loss": 1.6084, "step": 450 }, { "epoch": 0.06801900309177286, "grad_norm": 0.25831712897398074, "learning_rate": 4.94881710664229e-07, "loss": 1.5103, "step": 451 }, { "epoch": 0.06816982128044642, "grad_norm": 0.2518111435993509, "learning_rate": 4.948590749142011e-07, "loss": 1.5769, "step": 452 }, { "epoch": 0.06832063946911998, "grad_norm": 0.25077512227541265, "learning_rate": 4.948363897994078e-07, "loss": 1.5992, "step": 453 }, { "epoch": 0.06847145765779353, "grad_norm": 0.25227964846590806, "learning_rate": 4.948136553249426e-07, "loss": 1.574, "step": 454 }, { "epoch": 0.06862227584646709, "grad_norm": 0.2457294422180872, "learning_rate": 4.947908714959102e-07, "loss": 1.5769, "step": 455 }, { "epoch": 0.06877309403514063, "grad_norm": 0.2496239078167245, "learning_rate": 4.94768038317426e-07, "loss": 1.5436, "step": 456 }, { "epoch": 0.0689239122238142, "grad_norm": 0.29062933294218735, "learning_rate": 4.94745155794617e-07, "loss": 1.5727, "step": 457 }, { "epoch": 0.06907473041248774, "grad_norm": 0.25627733322126045, "learning_rate": 4.947222239326207e-07, "loss": 1.61, "step": 458 }, { "epoch": 0.0692255486011613, "grad_norm": 0.2532772075106132, "learning_rate": 4.946992427365862e-07, "loss": 1.6688, "step": 459 }, { "epoch": 0.06937636678983486, "grad_norm": 0.2584382481261571, "learning_rate": 4.946762122116733e-07, "loss": 1.6578, "step": 460 }, { "epoch": 0.0695271849785084, "grad_norm": 0.26526267071855675, "learning_rate": 4.946531323630531e-07, "loss": 1.6717, "step": 461 }, { "epoch": 0.06967800316718196, "grad_norm": 0.25305928479834555, "learning_rate": 4.946300031959077e-07, "loss": 1.6204, "step": 462 }, { "epoch": 0.06982882135585551, "grad_norm": 0.28864451322495427, "learning_rate": 4.946068247154303e-07, "loss": 1.5951, "step": 463 }, { "epoch": 0.06997963954452907, "grad_norm": 0.24532974198684246, "learning_rate": 4.94583596926825e-07, "loss": 1.5852, "step": 464 }, { "epoch": 0.07013045773320263, "grad_norm": 3.851715573924243, "learning_rate": 4.945603198353073e-07, "loss": 1.5603, "step": 465 }, { "epoch": 0.07028127592187618, "grad_norm": 0.27613690978194105, "learning_rate": 4.945369934461034e-07, "loss": 1.6249, "step": 466 }, { "epoch": 0.07043209411054974, "grad_norm": 0.2514935014317457, "learning_rate": 4.945136177644509e-07, "loss": 1.5245, "step": 467 }, { "epoch": 0.07058291229922328, "grad_norm": 0.8068622232940601, "learning_rate": 4.944901927955982e-07, "loss": 1.6069, "step": 468 }, { "epoch": 0.07073373048789684, "grad_norm": 0.24230146231946875, "learning_rate": 4.94466718544805e-07, "loss": 1.565, "step": 469 }, { "epoch": 0.07088454867657039, "grad_norm": 0.25797039870888167, "learning_rate": 4.944431950173419e-07, "loss": 1.5914, "step": 470 }, { "epoch": 0.07103536686524395, "grad_norm": 0.28046800099114216, "learning_rate": 4.944196222184907e-07, "loss": 1.5747, "step": 471 }, { "epoch": 0.07118618505391751, "grad_norm": 0.26447198542846684, "learning_rate": 4.94396000153544e-07, "loss": 1.607, "step": 472 }, { "epoch": 0.07133700324259105, "grad_norm": 0.25729676049336647, "learning_rate": 4.943723288278059e-07, "loss": 1.5715, "step": 473 }, { "epoch": 0.07148782143126461, "grad_norm": 0.25074184707404273, "learning_rate": 4.94348608246591e-07, "loss": 1.6102, "step": 474 }, { "epoch": 0.07163863961993816, "grad_norm": 0.28631414643129693, "learning_rate": 4.943248384152255e-07, "loss": 1.5604, "step": 475 }, { "epoch": 0.07178945780861172, "grad_norm": 0.2491117852003348, "learning_rate": 4.943010193390463e-07, "loss": 1.5763, "step": 476 }, { "epoch": 0.07194027599728527, "grad_norm": 0.6450301432730091, "learning_rate": 4.942771510234015e-07, "loss": 1.6482, "step": 477 }, { "epoch": 0.07209109418595883, "grad_norm": 0.25779434871207685, "learning_rate": 4.942532334736502e-07, "loss": 1.6369, "step": 478 }, { "epoch": 0.07224191237463239, "grad_norm": 0.255749488451868, "learning_rate": 4.942292666951626e-07, "loss": 1.5625, "step": 479 }, { "epoch": 0.07239273056330593, "grad_norm": 0.24329694843771454, "learning_rate": 4.942052506933201e-07, "loss": 1.545, "step": 480 }, { "epoch": 0.07254354875197949, "grad_norm": 0.2639578426050398, "learning_rate": 4.941811854735148e-07, "loss": 1.5715, "step": 481 }, { "epoch": 0.07269436694065304, "grad_norm": 0.2555493641454379, "learning_rate": 4.9415707104115e-07, "loss": 1.5908, "step": 482 }, { "epoch": 0.0728451851293266, "grad_norm": 0.24771916790007276, "learning_rate": 4.941329074016404e-07, "loss": 1.6032, "step": 483 }, { "epoch": 0.07299600331800016, "grad_norm": 0.25813027877620537, "learning_rate": 4.94108694560411e-07, "loss": 1.6065, "step": 484 }, { "epoch": 0.0731468215066737, "grad_norm": 0.2617232017934573, "learning_rate": 4.940844325228986e-07, "loss": 1.5383, "step": 485 }, { "epoch": 0.07329763969534726, "grad_norm": 0.9969708511757501, "learning_rate": 4.940601212945508e-07, "loss": 1.5394, "step": 486 }, { "epoch": 0.07344845788402081, "grad_norm": 0.26599817022694655, "learning_rate": 4.940357608808258e-07, "loss": 1.56, "step": 487 }, { "epoch": 0.07359927607269437, "grad_norm": 0.2882472701105029, "learning_rate": 4.940113512871937e-07, "loss": 1.5607, "step": 488 }, { "epoch": 0.07375009426136792, "grad_norm": 0.26344165366825933, "learning_rate": 4.939868925191347e-07, "loss": 1.5867, "step": 489 }, { "epoch": 0.07390091245004148, "grad_norm": 0.2559759462159106, "learning_rate": 4.939623845821408e-07, "loss": 1.661, "step": 490 }, { "epoch": 0.07405173063871504, "grad_norm": 0.2494797400109043, "learning_rate": 4.939378274817147e-07, "loss": 1.5513, "step": 491 }, { "epoch": 0.07420254882738858, "grad_norm": 0.2564563520992107, "learning_rate": 4.939132212233701e-07, "loss": 1.6087, "step": 492 }, { "epoch": 0.07435336701606214, "grad_norm": 0.2501704535942094, "learning_rate": 4.938885658126319e-07, "loss": 1.5295, "step": 493 }, { "epoch": 0.07450418520473569, "grad_norm": 0.2707003912399451, "learning_rate": 4.93863861255036e-07, "loss": 1.6212, "step": 494 }, { "epoch": 0.07465500339340925, "grad_norm": 0.2610328885695742, "learning_rate": 4.938391075561292e-07, "loss": 1.6279, "step": 495 }, { "epoch": 0.0748058215820828, "grad_norm": 0.24482384823431727, "learning_rate": 4.938143047214695e-07, "loss": 1.6057, "step": 496 }, { "epoch": 0.07495663977075635, "grad_norm": 0.2662320490545685, "learning_rate": 4.937894527566257e-07, "loss": 1.5947, "step": 497 }, { "epoch": 0.07510745795942991, "grad_norm": 0.24651792237577014, "learning_rate": 4.93764551667178e-07, "loss": 1.6602, "step": 498 }, { "epoch": 0.07525827614810346, "grad_norm": 0.24301487876995637, "learning_rate": 4.937396014587173e-07, "loss": 1.5758, "step": 499 }, { "epoch": 0.07540909433677702, "grad_norm": 0.24840742131244414, "learning_rate": 4.937146021368456e-07, "loss": 1.562, "step": 500 }, { "epoch": 0.07555991252545056, "grad_norm": 0.3866236043766646, "learning_rate": 4.936895537071761e-07, "loss": 1.6029, "step": 501 }, { "epoch": 0.07571073071412412, "grad_norm": 0.26485826630020837, "learning_rate": 4.936644561753329e-07, "loss": 1.5556, "step": 502 }, { "epoch": 0.07586154890279767, "grad_norm": 0.2489058113043106, "learning_rate": 4.93639309546951e-07, "loss": 1.5792, "step": 503 }, { "epoch": 0.07601236709147123, "grad_norm": 0.26597721973837385, "learning_rate": 4.936141138276766e-07, "loss": 1.5991, "step": 504 }, { "epoch": 0.07616318528014479, "grad_norm": 0.2607421928543872, "learning_rate": 4.93588869023167e-07, "loss": 1.5498, "step": 505 }, { "epoch": 0.07631400346881834, "grad_norm": 0.2895465689620655, "learning_rate": 4.935635751390903e-07, "loss": 1.5785, "step": 506 }, { "epoch": 0.0764648216574919, "grad_norm": 0.26139600362308274, "learning_rate": 4.935382321811255e-07, "loss": 1.5805, "step": 507 }, { "epoch": 0.07661563984616544, "grad_norm": 0.2638969582987421, "learning_rate": 4.935128401549633e-07, "loss": 1.625, "step": 508 }, { "epoch": 0.076766458034839, "grad_norm": 0.4096579467303753, "learning_rate": 4.934873990663046e-07, "loss": 1.6052, "step": 509 }, { "epoch": 0.07691727622351256, "grad_norm": 0.2651690016150467, "learning_rate": 4.934619089208617e-07, "loss": 1.5982, "step": 510 }, { "epoch": 0.07706809441218611, "grad_norm": 0.2608025596360598, "learning_rate": 4.93436369724358e-07, "loss": 1.5265, "step": 511 }, { "epoch": 0.07721891260085967, "grad_norm": 0.24775914712884162, "learning_rate": 4.934107814825277e-07, "loss": 1.6348, "step": 512 }, { "epoch": 0.07736973078953321, "grad_norm": 0.23982029055690735, "learning_rate": 4.93385144201116e-07, "loss": 1.5263, "step": 513 }, { "epoch": 0.07752054897820677, "grad_norm": 0.24440445690472806, "learning_rate": 4.933594578858796e-07, "loss": 1.6087, "step": 514 }, { "epoch": 0.07767136716688032, "grad_norm": 0.29016356740639526, "learning_rate": 4.933337225425854e-07, "loss": 1.582, "step": 515 }, { "epoch": 0.07782218535555388, "grad_norm": 4.683371732453072, "learning_rate": 4.93307938177012e-07, "loss": 1.6012, "step": 516 }, { "epoch": 0.07797300354422744, "grad_norm": 0.32311562025321017, "learning_rate": 4.932821047949486e-07, "loss": 1.6044, "step": 517 }, { "epoch": 0.07812382173290099, "grad_norm": 0.2557945762461551, "learning_rate": 4.932562224021955e-07, "loss": 1.5517, "step": 518 }, { "epoch": 0.07827463992157455, "grad_norm": 0.3261593598067262, "learning_rate": 4.932302910045642e-07, "loss": 1.5681, "step": 519 }, { "epoch": 0.07842545811024809, "grad_norm": 0.3704288545771335, "learning_rate": 4.932043106078772e-07, "loss": 1.5855, "step": 520 }, { "epoch": 0.07857627629892165, "grad_norm": 0.27059697971029173, "learning_rate": 4.931782812179674e-07, "loss": 1.5812, "step": 521 }, { "epoch": 0.0787270944875952, "grad_norm": 0.256274062882238, "learning_rate": 4.931522028406795e-07, "loss": 1.5326, "step": 522 }, { "epoch": 0.07887791267626876, "grad_norm": 0.3795916976539053, "learning_rate": 4.931260754818688e-07, "loss": 1.5828, "step": 523 }, { "epoch": 0.07902873086494232, "grad_norm": 0.2872991368841093, "learning_rate": 4.930998991474017e-07, "loss": 1.5398, "step": 524 }, { "epoch": 0.07917954905361586, "grad_norm": 0.24318254003701104, "learning_rate": 4.930736738431553e-07, "loss": 1.5622, "step": 525 }, { "epoch": 0.07933036724228942, "grad_norm": 0.25638241361288794, "learning_rate": 4.930473995750181e-07, "loss": 1.5693, "step": 526 }, { "epoch": 0.07948118543096297, "grad_norm": 0.25050071504745147, "learning_rate": 4.930210763488896e-07, "loss": 1.5284, "step": 527 }, { "epoch": 0.07963200361963653, "grad_norm": 0.24898287643190337, "learning_rate": 4.929947041706799e-07, "loss": 1.5946, "step": 528 }, { "epoch": 0.07978282180831008, "grad_norm": 0.2506867625821907, "learning_rate": 4.929682830463105e-07, "loss": 1.5109, "step": 529 }, { "epoch": 0.07993363999698364, "grad_norm": 0.24183988226515585, "learning_rate": 4.929418129817135e-07, "loss": 1.5645, "step": 530 }, { "epoch": 0.0800844581856572, "grad_norm": 0.2543083634039026, "learning_rate": 4.929152939828324e-07, "loss": 1.5981, "step": 531 }, { "epoch": 0.08023527637433074, "grad_norm": 0.2590778218402505, "learning_rate": 4.928887260556214e-07, "loss": 1.6484, "step": 532 }, { "epoch": 0.0803860945630043, "grad_norm": 0.254819692911704, "learning_rate": 4.928621092060457e-07, "loss": 1.56, "step": 533 }, { "epoch": 0.08053691275167785, "grad_norm": 0.2452102835738855, "learning_rate": 4.928354434400817e-07, "loss": 1.5742, "step": 534 }, { "epoch": 0.08068773094035141, "grad_norm": 0.2525657819458198, "learning_rate": 4.928087287637166e-07, "loss": 1.5722, "step": 535 }, { "epoch": 0.08083854912902497, "grad_norm": 0.2509074090624925, "learning_rate": 4.927819651829486e-07, "loss": 1.5607, "step": 536 }, { "epoch": 0.08098936731769851, "grad_norm": 0.25115918674482207, "learning_rate": 4.927551527037868e-07, "loss": 1.5402, "step": 537 }, { "epoch": 0.08114018550637207, "grad_norm": 0.24342202294545484, "learning_rate": 4.927282913322516e-07, "loss": 1.6124, "step": 538 }, { "epoch": 0.08129100369504562, "grad_norm": 0.2832028861128905, "learning_rate": 4.927013810743739e-07, "loss": 1.6304, "step": 539 }, { "epoch": 0.08144182188371918, "grad_norm": 0.2508552678273836, "learning_rate": 4.926744219361962e-07, "loss": 1.5912, "step": 540 }, { "epoch": 0.08159264007239272, "grad_norm": 0.2437905171553333, "learning_rate": 4.926474139237713e-07, "loss": 1.4971, "step": 541 }, { "epoch": 0.08174345826106628, "grad_norm": 0.24457479507873567, "learning_rate": 4.926203570431633e-07, "loss": 1.6038, "step": 542 }, { "epoch": 0.08189427644973984, "grad_norm": 0.2513609050923867, "learning_rate": 4.925932513004475e-07, "loss": 1.5137, "step": 543 }, { "epoch": 0.08204509463841339, "grad_norm": 0.2539532399251499, "learning_rate": 4.925660967017097e-07, "loss": 1.5587, "step": 544 }, { "epoch": 0.08219591282708695, "grad_norm": 0.2637711952751605, "learning_rate": 4.925388932530469e-07, "loss": 1.5986, "step": 545 }, { "epoch": 0.0823467310157605, "grad_norm": 0.25790794585338084, "learning_rate": 4.925116409605672e-07, "loss": 1.5677, "step": 546 }, { "epoch": 0.08249754920443406, "grad_norm": 0.3729386590116179, "learning_rate": 4.924843398303895e-07, "loss": 1.5004, "step": 547 }, { "epoch": 0.0826483673931076, "grad_norm": 0.2503856209292245, "learning_rate": 4.924569898686435e-07, "loss": 1.5944, "step": 548 }, { "epoch": 0.08279918558178116, "grad_norm": 0.24346750355850505, "learning_rate": 4.924295910814703e-07, "loss": 1.5523, "step": 549 }, { "epoch": 0.08295000377045472, "grad_norm": 0.24624694608717898, "learning_rate": 4.924021434750218e-07, "loss": 1.6148, "step": 550 }, { "epoch": 0.08310082195912827, "grad_norm": 0.2535930007858961, "learning_rate": 4.923746470554604e-07, "loss": 1.5205, "step": 551 }, { "epoch": 0.08325164014780183, "grad_norm": 0.24827404197581623, "learning_rate": 4.923471018289602e-07, "loss": 1.5817, "step": 552 }, { "epoch": 0.08340245833647537, "grad_norm": 0.2503802600818671, "learning_rate": 4.923195078017058e-07, "loss": 1.5844, "step": 553 }, { "epoch": 0.08355327652514893, "grad_norm": 0.27055106614329255, "learning_rate": 4.922918649798929e-07, "loss": 1.5673, "step": 554 }, { "epoch": 0.0837040947138225, "grad_norm": 0.25205217660052004, "learning_rate": 4.922641733697281e-07, "loss": 1.5818, "step": 555 }, { "epoch": 0.08385491290249604, "grad_norm": 0.25579080228321643, "learning_rate": 4.922364329774287e-07, "loss": 1.5709, "step": 556 }, { "epoch": 0.0840057310911696, "grad_norm": 0.249415396867393, "learning_rate": 4.922086438092238e-07, "loss": 1.5273, "step": 557 }, { "epoch": 0.08415654927984315, "grad_norm": 0.28056615620061776, "learning_rate": 4.921808058713524e-07, "loss": 1.5693, "step": 558 }, { "epoch": 0.0843073674685167, "grad_norm": 0.2844995932486672, "learning_rate": 4.92152919170065e-07, "loss": 1.5175, "step": 559 }, { "epoch": 0.08445818565719025, "grad_norm": 0.2608428394441073, "learning_rate": 4.921249837116231e-07, "loss": 1.6353, "step": 560 }, { "epoch": 0.08460900384586381, "grad_norm": 0.24713048021592293, "learning_rate": 4.920969995022992e-07, "loss": 1.5715, "step": 561 }, { "epoch": 0.08475982203453737, "grad_norm": 0.46568785557221054, "learning_rate": 4.92068966548376e-07, "loss": 1.612, "step": 562 }, { "epoch": 0.08491064022321092, "grad_norm": 0.2434062672346236, "learning_rate": 4.920408848561483e-07, "loss": 1.5673, "step": 563 }, { "epoch": 0.08506145841188448, "grad_norm": 0.28399027202323046, "learning_rate": 4.920127544319208e-07, "loss": 1.5214, "step": 564 }, { "epoch": 0.08521227660055802, "grad_norm": 0.2504036265338775, "learning_rate": 4.9198457528201e-07, "loss": 1.6365, "step": 565 }, { "epoch": 0.08536309478923158, "grad_norm": 0.2559157777595285, "learning_rate": 4.919563474127428e-07, "loss": 1.5891, "step": 566 }, { "epoch": 0.08551391297790513, "grad_norm": 0.29339247140505426, "learning_rate": 4.91928070830457e-07, "loss": 1.6133, "step": 567 }, { "epoch": 0.08566473116657869, "grad_norm": 0.2373443895115084, "learning_rate": 4.918997455415017e-07, "loss": 1.6091, "step": 568 }, { "epoch": 0.08581554935525225, "grad_norm": 0.2551840980360237, "learning_rate": 4.918713715522367e-07, "loss": 1.5759, "step": 569 }, { "epoch": 0.0859663675439258, "grad_norm": 0.2874474579323459, "learning_rate": 4.918429488690329e-07, "loss": 1.5687, "step": 570 }, { "epoch": 0.08611718573259936, "grad_norm": 0.24438925052557178, "learning_rate": 4.918144774982718e-07, "loss": 1.6112, "step": 571 }, { "epoch": 0.0862680039212729, "grad_norm": 0.24410876540004492, "learning_rate": 4.917859574463461e-07, "loss": 1.5914, "step": 572 }, { "epoch": 0.08641882210994646, "grad_norm": 0.27929429191497995, "learning_rate": 4.917573887196596e-07, "loss": 1.6207, "step": 573 }, { "epoch": 0.08656964029862001, "grad_norm": 0.2521870396625732, "learning_rate": 4.917287713246266e-07, "loss": 1.5708, "step": 574 }, { "epoch": 0.08672045848729357, "grad_norm": 0.2689103195119159, "learning_rate": 4.917001052676726e-07, "loss": 1.6346, "step": 575 }, { "epoch": 0.08687127667596713, "grad_norm": 0.24086423482406905, "learning_rate": 4.916713905552341e-07, "loss": 1.6522, "step": 576 }, { "epoch": 0.08702209486464067, "grad_norm": 0.2627511498959766, "learning_rate": 4.91642627193758e-07, "loss": 1.5834, "step": 577 }, { "epoch": 0.08717291305331423, "grad_norm": 0.256738420406437, "learning_rate": 4.916138151897029e-07, "loss": 1.5748, "step": 578 }, { "epoch": 0.08732373124198778, "grad_norm": 0.24545574836575051, "learning_rate": 4.915849545495378e-07, "loss": 1.5448, "step": 579 }, { "epoch": 0.08747454943066134, "grad_norm": 0.31967227244550045, "learning_rate": 4.915560452797427e-07, "loss": 1.532, "step": 580 }, { "epoch": 0.0876253676193349, "grad_norm": 0.2968673763523125, "learning_rate": 4.915270873868087e-07, "loss": 1.6339, "step": 581 }, { "epoch": 0.08777618580800844, "grad_norm": 0.28728768833492335, "learning_rate": 4.914980808772375e-07, "loss": 1.5853, "step": 582 }, { "epoch": 0.087927003996682, "grad_norm": 0.2462113832568665, "learning_rate": 4.914690257575422e-07, "loss": 1.5725, "step": 583 }, { "epoch": 0.08807782218535555, "grad_norm": 0.23829183228187617, "learning_rate": 4.914399220342461e-07, "loss": 1.5606, "step": 584 }, { "epoch": 0.08822864037402911, "grad_norm": 0.3152762690498193, "learning_rate": 4.914107697138843e-07, "loss": 1.5535, "step": 585 }, { "epoch": 0.08837945856270266, "grad_norm": 0.24961819664817447, "learning_rate": 4.91381568803002e-07, "loss": 1.5568, "step": 586 }, { "epoch": 0.08853027675137622, "grad_norm": 0.2676329327281572, "learning_rate": 4.913523193081558e-07, "loss": 1.6027, "step": 587 }, { "epoch": 0.08868109494004978, "grad_norm": 0.2441440934420083, "learning_rate": 4.913230212359131e-07, "loss": 1.5824, "step": 588 }, { "epoch": 0.08883191312872332, "grad_norm": 0.2538836446076873, "learning_rate": 4.91293674592852e-07, "loss": 1.5529, "step": 589 }, { "epoch": 0.08898273131739688, "grad_norm": 0.41769465370401243, "learning_rate": 4.912642793855619e-07, "loss": 1.5893, "step": 590 }, { "epoch": 0.08913354950607043, "grad_norm": 0.278803470964098, "learning_rate": 4.912348356206426e-07, "loss": 1.5982, "step": 591 }, { "epoch": 0.08928436769474399, "grad_norm": 0.5837272165753525, "learning_rate": 4.912053433047054e-07, "loss": 1.5819, "step": 592 }, { "epoch": 0.08943518588341753, "grad_norm": 0.25424687096423615, "learning_rate": 4.911758024443721e-07, "loss": 1.6297, "step": 593 }, { "epoch": 0.0895860040720911, "grad_norm": 0.2601339721261789, "learning_rate": 4.911462130462752e-07, "loss": 1.5876, "step": 594 }, { "epoch": 0.08973682226076465, "grad_norm": 0.3393900866903272, "learning_rate": 4.911165751170587e-07, "loss": 1.5898, "step": 595 }, { "epoch": 0.0898876404494382, "grad_norm": 0.7180948377283978, "learning_rate": 4.910868886633771e-07, "loss": 1.6494, "step": 596 }, { "epoch": 0.09003845863811176, "grad_norm": 0.2653601165598602, "learning_rate": 4.910571536918958e-07, "loss": 1.6252, "step": 597 }, { "epoch": 0.0901892768267853, "grad_norm": 0.24954419790830248, "learning_rate": 4.910273702092913e-07, "loss": 1.5972, "step": 598 }, { "epoch": 0.09034009501545887, "grad_norm": 0.2677338162505918, "learning_rate": 4.909975382222508e-07, "loss": 1.5955, "step": 599 }, { "epoch": 0.09049091320413241, "grad_norm": 0.25788426217463334, "learning_rate": 4.909676577374722e-07, "loss": 1.6125, "step": 600 }, { "epoch": 0.09064173139280597, "grad_norm": 0.25188480941816443, "learning_rate": 4.909377287616649e-07, "loss": 1.5452, "step": 601 }, { "epoch": 0.09079254958147953, "grad_norm": 0.27570661850707645, "learning_rate": 4.909077513015488e-07, "loss": 1.6192, "step": 602 }, { "epoch": 0.09094336777015308, "grad_norm": 0.24156826274527765, "learning_rate": 4.908777253638545e-07, "loss": 1.554, "step": 603 }, { "epoch": 0.09109418595882664, "grad_norm": 0.25224612585820494, "learning_rate": 4.908476509553239e-07, "loss": 1.5563, "step": 604 }, { "epoch": 0.09124500414750018, "grad_norm": 0.306557401649218, "learning_rate": 4.908175280827094e-07, "loss": 1.6025, "step": 605 }, { "epoch": 0.09139582233617374, "grad_norm": 0.23952494340105374, "learning_rate": 4.907873567527744e-07, "loss": 1.5947, "step": 606 }, { "epoch": 0.0915466405248473, "grad_norm": 0.24856125896105166, "learning_rate": 4.907571369722936e-07, "loss": 1.6013, "step": 607 }, { "epoch": 0.09169745871352085, "grad_norm": 0.2508258193786405, "learning_rate": 4.907268687480518e-07, "loss": 1.5779, "step": 608 }, { "epoch": 0.09184827690219441, "grad_norm": 0.25781742553896553, "learning_rate": 4.906965520868456e-07, "loss": 1.6299, "step": 609 }, { "epoch": 0.09199909509086796, "grad_norm": 0.2550011985174638, "learning_rate": 4.906661869954815e-07, "loss": 1.6228, "step": 610 }, { "epoch": 0.09214991327954151, "grad_norm": 0.2470914926519195, "learning_rate": 4.906357734807775e-07, "loss": 1.6089, "step": 611 }, { "epoch": 0.09230073146821506, "grad_norm": 0.2610545421881108, "learning_rate": 4.906053115495624e-07, "loss": 1.6606, "step": 612 }, { "epoch": 0.09245154965688862, "grad_norm": 0.26931972604669374, "learning_rate": 4.905748012086756e-07, "loss": 1.6334, "step": 613 }, { "epoch": 0.09260236784556218, "grad_norm": 0.36010264957923965, "learning_rate": 4.905442424649679e-07, "loss": 1.5844, "step": 614 }, { "epoch": 0.09275318603423573, "grad_norm": 0.2562554805903904, "learning_rate": 4.905136353253003e-07, "loss": 1.5654, "step": 615 }, { "epoch": 0.09290400422290929, "grad_norm": 0.2648496245111277, "learning_rate": 4.904829797965452e-07, "loss": 1.6259, "step": 616 }, { "epoch": 0.09305482241158283, "grad_norm": 0.28004205716450364, "learning_rate": 4.904522758855855e-07, "loss": 1.6303, "step": 617 }, { "epoch": 0.09320564060025639, "grad_norm": 0.2543298564080976, "learning_rate": 4.904215235993152e-07, "loss": 1.5825, "step": 618 }, { "epoch": 0.09335645878892994, "grad_norm": 0.2708365964823325, "learning_rate": 4.903907229446392e-07, "loss": 1.6123, "step": 619 }, { "epoch": 0.0935072769776035, "grad_norm": 0.2434521472967361, "learning_rate": 4.90359873928473e-07, "loss": 1.6018, "step": 620 }, { "epoch": 0.09365809516627706, "grad_norm": 0.33511641774018264, "learning_rate": 4.90328976557743e-07, "loss": 1.5607, "step": 621 }, { "epoch": 0.0938089133549506, "grad_norm": 0.2958147063552225, "learning_rate": 4.902980308393868e-07, "loss": 1.556, "step": 622 }, { "epoch": 0.09395973154362416, "grad_norm": 0.28854360372385784, "learning_rate": 4.902670367803526e-07, "loss": 1.6016, "step": 623 }, { "epoch": 0.09411054973229771, "grad_norm": 0.2508804168130778, "learning_rate": 4.902359943875992e-07, "loss": 1.6528, "step": 624 }, { "epoch": 0.09426136792097127, "grad_norm": 0.24637422112807927, "learning_rate": 4.902049036680967e-07, "loss": 1.5707, "step": 625 }, { "epoch": 0.09441218610964483, "grad_norm": 0.27532911514444836, "learning_rate": 4.901737646288259e-07, "loss": 1.5714, "step": 626 }, { "epoch": 0.09456300429831838, "grad_norm": 0.2510146750886741, "learning_rate": 4.901425772767784e-07, "loss": 1.5771, "step": 627 }, { "epoch": 0.09471382248699194, "grad_norm": 0.3652566414579146, "learning_rate": 4.901113416189567e-07, "loss": 1.5934, "step": 628 }, { "epoch": 0.09486464067566548, "grad_norm": 0.264710597437996, "learning_rate": 4.90080057662374e-07, "loss": 1.5961, "step": 629 }, { "epoch": 0.09501545886433904, "grad_norm": 0.2722196620450524, "learning_rate": 4.900487254140546e-07, "loss": 1.5784, "step": 630 }, { "epoch": 0.09516627705301259, "grad_norm": 0.2494468175683074, "learning_rate": 4.900173448810334e-07, "loss": 1.5588, "step": 631 }, { "epoch": 0.09531709524168615, "grad_norm": 0.25564128850962825, "learning_rate": 4.899859160703563e-07, "loss": 1.5296, "step": 632 }, { "epoch": 0.09546791343035971, "grad_norm": 0.24464482213775124, "learning_rate": 4.899544389890798e-07, "loss": 1.5731, "step": 633 }, { "epoch": 0.09561873161903325, "grad_norm": 0.25286317801092784, "learning_rate": 4.899229136442717e-07, "loss": 1.6368, "step": 634 }, { "epoch": 0.09576954980770681, "grad_norm": 0.2616059791844806, "learning_rate": 4.898913400430103e-07, "loss": 1.5624, "step": 635 }, { "epoch": 0.09592036799638036, "grad_norm": 0.2543080959885784, "learning_rate": 4.898597181923847e-07, "loss": 1.612, "step": 636 }, { "epoch": 0.09607118618505392, "grad_norm": 0.26165761435102747, "learning_rate": 4.89828048099495e-07, "loss": 1.5321, "step": 637 }, { "epoch": 0.09622200437372747, "grad_norm": 0.25832811520315874, "learning_rate": 4.89796329771452e-07, "loss": 1.5823, "step": 638 }, { "epoch": 0.09637282256240103, "grad_norm": 0.24902143318525818, "learning_rate": 4.897645632153773e-07, "loss": 1.5841, "step": 639 }, { "epoch": 0.09652364075107459, "grad_norm": 2.155448583198823, "learning_rate": 4.897327484384037e-07, "loss": 1.5538, "step": 640 }, { "epoch": 0.09667445893974813, "grad_norm": 0.29087209938764197, "learning_rate": 4.897008854476741e-07, "loss": 1.5588, "step": 641 }, { "epoch": 0.09682527712842169, "grad_norm": 0.2653741871413294, "learning_rate": 4.896689742503432e-07, "loss": 1.5673, "step": 642 }, { "epoch": 0.09697609531709524, "grad_norm": 0.25968820863722764, "learning_rate": 4.896370148535756e-07, "loss": 1.5111, "step": 643 }, { "epoch": 0.0971269135057688, "grad_norm": 0.25714475293882755, "learning_rate": 4.896050072645471e-07, "loss": 1.5501, "step": 644 }, { "epoch": 0.09727773169444234, "grad_norm": 0.2567542152360163, "learning_rate": 4.895729514904447e-07, "loss": 1.582, "step": 645 }, { "epoch": 0.0974285498831159, "grad_norm": 0.2437153380303258, "learning_rate": 4.895408475384656e-07, "loss": 1.5133, "step": 646 }, { "epoch": 0.09757936807178946, "grad_norm": 0.298717931310982, "learning_rate": 4.895086954158181e-07, "loss": 1.5918, "step": 647 }, { "epoch": 0.09773018626046301, "grad_norm": 0.2545301551858995, "learning_rate": 4.894764951297213e-07, "loss": 1.5648, "step": 648 }, { "epoch": 0.09788100444913657, "grad_norm": 0.26439815919936716, "learning_rate": 4.894442466874052e-07, "loss": 1.6693, "step": 649 }, { "epoch": 0.09803182263781011, "grad_norm": 0.2567876593677127, "learning_rate": 4.894119500961103e-07, "loss": 1.6272, "step": 650 }, { "epoch": 0.09818264082648367, "grad_norm": 0.2703561761668927, "learning_rate": 4.893796053630881e-07, "loss": 1.6037, "step": 651 }, { "epoch": 0.09833345901515723, "grad_norm": 0.24462862373595118, "learning_rate": 4.893472124956013e-07, "loss": 1.6007, "step": 652 }, { "epoch": 0.09848427720383078, "grad_norm": 0.27868827457946876, "learning_rate": 4.893147715009226e-07, "loss": 1.5594, "step": 653 }, { "epoch": 0.09863509539250434, "grad_norm": 0.2694330648878804, "learning_rate": 4.892822823863363e-07, "loss": 1.5226, "step": 654 }, { "epoch": 0.09878591358117789, "grad_norm": 0.2508658391074199, "learning_rate": 4.89249745159137e-07, "loss": 1.5539, "step": 655 }, { "epoch": 0.09893673176985145, "grad_norm": 0.25138448088883353, "learning_rate": 4.892171598266304e-07, "loss": 1.6533, "step": 656 }, { "epoch": 0.09908754995852499, "grad_norm": 0.25195751337900707, "learning_rate": 4.891845263961325e-07, "loss": 1.5837, "step": 657 }, { "epoch": 0.09923836814719855, "grad_norm": 0.2665540846353779, "learning_rate": 4.891518448749708e-07, "loss": 1.5776, "step": 658 }, { "epoch": 0.09938918633587211, "grad_norm": 0.2593627553827404, "learning_rate": 4.891191152704831e-07, "loss": 1.5844, "step": 659 }, { "epoch": 0.09954000452454566, "grad_norm": 0.2649200832665812, "learning_rate": 4.890863375900182e-07, "loss": 1.5452, "step": 660 }, { "epoch": 0.09969082271321922, "grad_norm": 0.26580937811421534, "learning_rate": 4.890535118409355e-07, "loss": 1.6071, "step": 661 }, { "epoch": 0.09984164090189276, "grad_norm": 0.2951878007593459, "learning_rate": 4.890206380306056e-07, "loss": 1.5651, "step": 662 }, { "epoch": 0.09999245909056632, "grad_norm": 0.25869622571347445, "learning_rate": 4.889877161664096e-07, "loss": 1.5353, "step": 663 }, { "epoch": 0.10014327727923987, "grad_norm": 0.24044540900199096, "learning_rate": 4.889547462557391e-07, "loss": 1.5694, "step": 664 }, { "epoch": 0.10029409546791343, "grad_norm": 0.2565865665249769, "learning_rate": 4.889217283059971e-07, "loss": 1.6183, "step": 665 }, { "epoch": 0.10044491365658699, "grad_norm": 0.27025535522679817, "learning_rate": 4.888886623245969e-07, "loss": 1.6222, "step": 666 }, { "epoch": 0.10059573184526054, "grad_norm": 0.24652564398079863, "learning_rate": 4.888555483189632e-07, "loss": 1.5629, "step": 667 }, { "epoch": 0.1007465500339341, "grad_norm": 0.4814577649778317, "learning_rate": 4.888223862965304e-07, "loss": 1.589, "step": 668 }, { "epoch": 0.10089736822260764, "grad_norm": 0.23937179549886292, "learning_rate": 4.887891762647449e-07, "loss": 1.5531, "step": 669 }, { "epoch": 0.1010481864112812, "grad_norm": 0.2466135765774502, "learning_rate": 4.887559182310629e-07, "loss": 1.6277, "step": 670 }, { "epoch": 0.10119900459995475, "grad_norm": 0.24673261333382188, "learning_rate": 4.887226122029522e-07, "loss": 1.5716, "step": 671 }, { "epoch": 0.10134982278862831, "grad_norm": 0.35516279706374765, "learning_rate": 4.886892581878906e-07, "loss": 1.5149, "step": 672 }, { "epoch": 0.10150064097730187, "grad_norm": 0.24643752858072748, "learning_rate": 4.886558561933673e-07, "loss": 1.5287, "step": 673 }, { "epoch": 0.10165145916597541, "grad_norm": 0.25727848949346177, "learning_rate": 4.886224062268819e-07, "loss": 1.544, "step": 674 }, { "epoch": 0.10180227735464897, "grad_norm": 0.2672375663532377, "learning_rate": 4.88588908295945e-07, "loss": 1.6145, "step": 675 }, { "epoch": 0.10195309554332252, "grad_norm": 0.23783832868779106, "learning_rate": 4.885553624080777e-07, "loss": 1.5434, "step": 676 }, { "epoch": 0.10210391373199608, "grad_norm": 0.2628061693510539, "learning_rate": 4.885217685708123e-07, "loss": 1.6046, "step": 677 }, { "epoch": 0.10225473192066964, "grad_norm": 0.2498006629530914, "learning_rate": 4.884881267916913e-07, "loss": 1.5738, "step": 678 }, { "epoch": 0.10240555010934319, "grad_norm": 0.2670401903771944, "learning_rate": 4.884544370782682e-07, "loss": 1.6719, "step": 679 }, { "epoch": 0.10255636829801675, "grad_norm": 0.26786252982477077, "learning_rate": 4.884206994381078e-07, "loss": 1.5068, "step": 680 }, { "epoch": 0.10270718648669029, "grad_norm": 0.2424177526813027, "learning_rate": 4.883869138787846e-07, "loss": 1.6749, "step": 681 }, { "epoch": 0.10285800467536385, "grad_norm": 0.2513530982644478, "learning_rate": 4.883530804078849e-07, "loss": 1.5723, "step": 682 }, { "epoch": 0.1030088228640374, "grad_norm": 0.25744150006151734, "learning_rate": 4.883191990330051e-07, "loss": 1.5466, "step": 683 }, { "epoch": 0.10315964105271096, "grad_norm": 0.27196436155706344, "learning_rate": 4.882852697617525e-07, "loss": 1.594, "step": 684 }, { "epoch": 0.10331045924138452, "grad_norm": 0.2533812358300188, "learning_rate": 4.882512926017453e-07, "loss": 1.5704, "step": 685 }, { "epoch": 0.10346127743005806, "grad_norm": 0.2651528199593723, "learning_rate": 4.882172675606125e-07, "loss": 1.5445, "step": 686 }, { "epoch": 0.10361209561873162, "grad_norm": 0.26571572796536885, "learning_rate": 4.881831946459933e-07, "loss": 1.54, "step": 687 }, { "epoch": 0.10376291380740517, "grad_norm": 0.24786037357426793, "learning_rate": 4.881490738655386e-07, "loss": 1.5229, "step": 688 }, { "epoch": 0.10391373199607873, "grad_norm": 0.25004542288508147, "learning_rate": 4.88114905226909e-07, "loss": 1.5828, "step": 689 }, { "epoch": 0.10406455018475227, "grad_norm": 0.2473729216074345, "learning_rate": 4.880806887377768e-07, "loss": 1.5187, "step": 690 }, { "epoch": 0.10421536837342583, "grad_norm": 0.251002717251045, "learning_rate": 4.880464244058242e-07, "loss": 1.591, "step": 691 }, { "epoch": 0.1043661865620994, "grad_norm": 0.2625610469104862, "learning_rate": 4.880121122387447e-07, "loss": 1.6434, "step": 692 }, { "epoch": 0.10451700475077294, "grad_norm": 0.3151656817305251, "learning_rate": 4.879777522442425e-07, "loss": 1.5391, "step": 693 }, { "epoch": 0.1046678229394465, "grad_norm": 0.24445992489148152, "learning_rate": 4.879433444300323e-07, "loss": 1.5501, "step": 694 }, { "epoch": 0.10481864112812005, "grad_norm": 4.8427358903679405, "learning_rate": 4.879088888038397e-07, "loss": 1.6371, "step": 695 }, { "epoch": 0.1049694593167936, "grad_norm": 0.26416228036696915, "learning_rate": 4.87874385373401e-07, "loss": 1.6385, "step": 696 }, { "epoch": 0.10512027750546717, "grad_norm": 0.2496641311371161, "learning_rate": 4.878398341464631e-07, "loss": 1.5365, "step": 697 }, { "epoch": 0.10527109569414071, "grad_norm": 0.2545167000001841, "learning_rate": 4.87805235130784e-07, "loss": 1.5949, "step": 698 }, { "epoch": 0.10542191388281427, "grad_norm": 0.27524392309011514, "learning_rate": 4.877705883341319e-07, "loss": 1.6025, "step": 699 }, { "epoch": 0.10557273207148782, "grad_norm": 0.2531395600898579, "learning_rate": 4.877358937642863e-07, "loss": 1.5657, "step": 700 }, { "epoch": 0.10572355026016138, "grad_norm": 0.24654464345054394, "learning_rate": 4.877011514290369e-07, "loss": 1.5843, "step": 701 }, { "epoch": 0.10587436844883492, "grad_norm": 0.3149996418781548, "learning_rate": 4.876663613361844e-07, "loss": 1.5075, "step": 702 }, { "epoch": 0.10602518663750848, "grad_norm": 0.2589830127512137, "learning_rate": 4.876315234935403e-07, "loss": 1.6373, "step": 703 }, { "epoch": 0.10617600482618204, "grad_norm": 0.31577873026015885, "learning_rate": 4.875966379089267e-07, "loss": 1.5944, "step": 704 }, { "epoch": 0.10632682301485559, "grad_norm": 12.03237047152764, "learning_rate": 4.875617045901763e-07, "loss": 1.5977, "step": 705 }, { "epoch": 0.10647764120352915, "grad_norm": 0.41114235828480006, "learning_rate": 4.875267235451328e-07, "loss": 1.5744, "step": 706 }, { "epoch": 0.1066284593922027, "grad_norm": 0.25165928626641787, "learning_rate": 4.874916947816504e-07, "loss": 1.5568, "step": 707 }, { "epoch": 0.10677927758087626, "grad_norm": 0.2548859391150305, "learning_rate": 4.874566183075942e-07, "loss": 1.503, "step": 708 }, { "epoch": 0.1069300957695498, "grad_norm": 0.3172895827908948, "learning_rate": 4.874214941308395e-07, "loss": 1.6655, "step": 709 }, { "epoch": 0.10708091395822336, "grad_norm": 0.2738482410825109, "learning_rate": 4.873863222592732e-07, "loss": 1.5105, "step": 710 }, { "epoch": 0.10723173214689692, "grad_norm": 0.2551922177918959, "learning_rate": 4.873511027007921e-07, "loss": 1.5545, "step": 711 }, { "epoch": 0.10738255033557047, "grad_norm": 0.26786371214739546, "learning_rate": 4.873158354633041e-07, "loss": 1.5731, "step": 712 }, { "epoch": 0.10753336852424403, "grad_norm": 0.251925518723259, "learning_rate": 4.872805205547276e-07, "loss": 1.5932, "step": 713 }, { "epoch": 0.10768418671291757, "grad_norm": 0.2430974358880398, "learning_rate": 4.872451579829922e-07, "loss": 1.5401, "step": 714 }, { "epoch": 0.10783500490159113, "grad_norm": 0.25083348588807275, "learning_rate": 4.872097477560374e-07, "loss": 1.5348, "step": 715 }, { "epoch": 0.10798582309026468, "grad_norm": 0.24851262093970097, "learning_rate": 4.87174289881814e-07, "loss": 1.5336, "step": 716 }, { "epoch": 0.10813664127893824, "grad_norm": 0.2580987044490573, "learning_rate": 4.871387843682834e-07, "loss": 1.5337, "step": 717 }, { "epoch": 0.1082874594676118, "grad_norm": 0.24269265338452864, "learning_rate": 4.871032312234175e-07, "loss": 1.6074, "step": 718 }, { "epoch": 0.10843827765628535, "grad_norm": 0.262540479014235, "learning_rate": 4.870676304551991e-07, "loss": 1.5434, "step": 719 }, { "epoch": 0.1085890958449589, "grad_norm": 0.25733851943997205, "learning_rate": 4.870319820716215e-07, "loss": 1.5998, "step": 720 }, { "epoch": 0.10873991403363245, "grad_norm": 0.30099795570434335, "learning_rate": 4.869962860806889e-07, "loss": 1.5917, "step": 721 }, { "epoch": 0.10889073222230601, "grad_norm": 0.24529089302262863, "learning_rate": 4.869605424904161e-07, "loss": 1.6178, "step": 722 }, { "epoch": 0.10904155041097957, "grad_norm": 0.3216523435582841, "learning_rate": 4.869247513088285e-07, "loss": 1.6293, "step": 723 }, { "epoch": 0.10919236859965312, "grad_norm": 0.2537191894534603, "learning_rate": 4.868889125439622e-07, "loss": 1.5027, "step": 724 }, { "epoch": 0.10934318678832668, "grad_norm": 0.23406712357409545, "learning_rate": 4.868530262038642e-07, "loss": 1.5506, "step": 725 }, { "epoch": 0.10949400497700022, "grad_norm": 0.24424934141970756, "learning_rate": 4.868170922965921e-07, "loss": 1.62, "step": 726 }, { "epoch": 0.10964482316567378, "grad_norm": 0.2529106313948061, "learning_rate": 4.867811108302139e-07, "loss": 1.6838, "step": 727 }, { "epoch": 0.10979564135434733, "grad_norm": 0.24268045391162135, "learning_rate": 4.867450818128086e-07, "loss": 1.5843, "step": 728 }, { "epoch": 0.10994645954302089, "grad_norm": 0.2782845964267312, "learning_rate": 4.867090052524657e-07, "loss": 1.5276, "step": 729 }, { "epoch": 0.11009727773169445, "grad_norm": 0.2607417299475057, "learning_rate": 4.866728811572855e-07, "loss": 1.6077, "step": 730 }, { "epoch": 0.110248095920368, "grad_norm": 0.2995072103389239, "learning_rate": 4.866367095353788e-07, "loss": 1.531, "step": 731 }, { "epoch": 0.11039891410904155, "grad_norm": 0.26231666932903636, "learning_rate": 4.866004903948674e-07, "loss": 1.6113, "step": 732 }, { "epoch": 0.1105497322977151, "grad_norm": 0.255610027357904, "learning_rate": 4.865642237438832e-07, "loss": 1.5377, "step": 733 }, { "epoch": 0.11070055048638866, "grad_norm": 0.24761027967399435, "learning_rate": 4.865279095905696e-07, "loss": 1.5794, "step": 734 }, { "epoch": 0.1108513686750622, "grad_norm": 0.3683492891040185, "learning_rate": 4.864915479430799e-07, "loss": 1.6031, "step": 735 }, { "epoch": 0.11100218686373577, "grad_norm": 0.2984691028171984, "learning_rate": 4.864551388095782e-07, "loss": 1.5659, "step": 736 }, { "epoch": 0.11115300505240933, "grad_norm": 0.24189257268944578, "learning_rate": 4.864186821982397e-07, "loss": 1.5652, "step": 737 }, { "epoch": 0.11130382324108287, "grad_norm": 0.2370044032129546, "learning_rate": 4.863821781172499e-07, "loss": 1.571, "step": 738 }, { "epoch": 0.11145464142975643, "grad_norm": 0.2555003933014576, "learning_rate": 4.863456265748048e-07, "loss": 1.5888, "step": 739 }, { "epoch": 0.11160545961842998, "grad_norm": 0.543423385722154, "learning_rate": 4.863090275791116e-07, "loss": 1.5519, "step": 740 }, { "epoch": 0.11175627780710354, "grad_norm": 0.27599159260109296, "learning_rate": 4.862723811383877e-07, "loss": 1.602, "step": 741 }, { "epoch": 0.11190709599577708, "grad_norm": 0.2465362472728453, "learning_rate": 4.862356872608612e-07, "loss": 1.5474, "step": 742 }, { "epoch": 0.11205791418445064, "grad_norm": 0.2800510702511786, "learning_rate": 4.861989459547712e-07, "loss": 1.5227, "step": 743 }, { "epoch": 0.1122087323731242, "grad_norm": 0.32072850809900044, "learning_rate": 4.861621572283669e-07, "loss": 1.6118, "step": 744 }, { "epoch": 0.11235955056179775, "grad_norm": 0.2510399301195033, "learning_rate": 4.861253210899088e-07, "loss": 1.6143, "step": 745 }, { "epoch": 0.11251036875047131, "grad_norm": 0.2410159118441585, "learning_rate": 4.860884375476672e-07, "loss": 1.5808, "step": 746 }, { "epoch": 0.11266118693914486, "grad_norm": 0.2564249523301703, "learning_rate": 4.86051506609924e-07, "loss": 1.5642, "step": 747 }, { "epoch": 0.11281200512781842, "grad_norm": 0.2697763677983635, "learning_rate": 4.86014528284971e-07, "loss": 1.6284, "step": 748 }, { "epoch": 0.11296282331649198, "grad_norm": 0.24748116614837487, "learning_rate": 4.85977502581111e-07, "loss": 1.5848, "step": 749 }, { "epoch": 0.11311364150516552, "grad_norm": 0.24419802618009506, "learning_rate": 4.859404295066573e-07, "loss": 1.5726, "step": 750 }, { "epoch": 0.11326445969383908, "grad_norm": 0.2658317522730182, "learning_rate": 4.859033090699338e-07, "loss": 1.5658, "step": 751 }, { "epoch": 0.11341527788251263, "grad_norm": 0.2585887332026061, "learning_rate": 4.858661412792754e-07, "loss": 1.6313, "step": 752 }, { "epoch": 0.11356609607118619, "grad_norm": 0.265337834018552, "learning_rate": 4.858289261430271e-07, "loss": 1.5918, "step": 753 }, { "epoch": 0.11371691425985973, "grad_norm": 0.28811771044154877, "learning_rate": 4.857916636695448e-07, "loss": 1.7301, "step": 754 }, { "epoch": 0.1138677324485333, "grad_norm": 0.2775887071260114, "learning_rate": 4.857543538671952e-07, "loss": 1.6004, "step": 755 }, { "epoch": 0.11401855063720685, "grad_norm": 1.307067680039299, "learning_rate": 4.857169967443553e-07, "loss": 1.5489, "step": 756 }, { "epoch": 0.1141693688258804, "grad_norm": 0.25603868221014153, "learning_rate": 4.856795923094128e-07, "loss": 1.5895, "step": 757 }, { "epoch": 0.11432018701455396, "grad_norm": 0.26609703406081053, "learning_rate": 4.856421405707662e-07, "loss": 1.5814, "step": 758 }, { "epoch": 0.1144710052032275, "grad_norm": 0.24417096229180008, "learning_rate": 4.856046415368244e-07, "loss": 1.5387, "step": 759 }, { "epoch": 0.11462182339190106, "grad_norm": 0.25096964734102967, "learning_rate": 4.855670952160073e-07, "loss": 1.5615, "step": 760 }, { "epoch": 0.11477264158057461, "grad_norm": 0.28760577193425085, "learning_rate": 4.855295016167447e-07, "loss": 1.5391, "step": 761 }, { "epoch": 0.11492345976924817, "grad_norm": 0.2699115214561961, "learning_rate": 4.854918607474777e-07, "loss": 1.5244, "step": 762 }, { "epoch": 0.11507427795792173, "grad_norm": 0.25905586243470197, "learning_rate": 4.854541726166579e-07, "loss": 1.4722, "step": 763 }, { "epoch": 0.11522509614659528, "grad_norm": 0.275105267887951, "learning_rate": 4.854164372327471e-07, "loss": 1.5588, "step": 764 }, { "epoch": 0.11537591433526884, "grad_norm": 0.23764545222034308, "learning_rate": 4.853786546042184e-07, "loss": 1.5995, "step": 765 }, { "epoch": 0.11552673252394238, "grad_norm": 0.2509047647028819, "learning_rate": 4.853408247395545e-07, "loss": 1.5454, "step": 766 }, { "epoch": 0.11567755071261594, "grad_norm": 0.24100699069586992, "learning_rate": 4.853029476472499e-07, "loss": 1.6153, "step": 767 }, { "epoch": 0.1158283689012895, "grad_norm": 0.24582457784503464, "learning_rate": 4.852650233358088e-07, "loss": 1.5976, "step": 768 }, { "epoch": 0.11597918708996305, "grad_norm": 0.2515327610744451, "learning_rate": 4.852270518137463e-07, "loss": 1.5088, "step": 769 }, { "epoch": 0.11613000527863661, "grad_norm": 0.28108513747998987, "learning_rate": 4.851890330895882e-07, "loss": 1.5592, "step": 770 }, { "epoch": 0.11628082346731015, "grad_norm": 0.25724095740981906, "learning_rate": 4.851509671718708e-07, "loss": 1.5793, "step": 771 }, { "epoch": 0.11643164165598371, "grad_norm": 0.3379293339765616, "learning_rate": 4.851128540691409e-07, "loss": 1.5388, "step": 772 }, { "epoch": 0.11658245984465726, "grad_norm": 0.266271201005545, "learning_rate": 4.850746937899562e-07, "loss": 1.6516, "step": 773 }, { "epoch": 0.11673327803333082, "grad_norm": 0.27413359706065926, "learning_rate": 4.850364863428847e-07, "loss": 1.5685, "step": 774 }, { "epoch": 0.11688409622200438, "grad_norm": 0.2532201394692201, "learning_rate": 4.84998231736505e-07, "loss": 1.6207, "step": 775 }, { "epoch": 0.11703491441067793, "grad_norm": 0.24793843250269984, "learning_rate": 4.849599299794065e-07, "loss": 1.6016, "step": 776 }, { "epoch": 0.11718573259935149, "grad_norm": 2.5487673069833954, "learning_rate": 4.849215810801891e-07, "loss": 1.5861, "step": 777 }, { "epoch": 0.11733655078802503, "grad_norm": 0.2585038483321872, "learning_rate": 4.84883185047463e-07, "loss": 1.547, "step": 778 }, { "epoch": 0.11748736897669859, "grad_norm": 0.26306792481033825, "learning_rate": 4.848447418898495e-07, "loss": 1.5976, "step": 779 }, { "epoch": 0.11763818716537214, "grad_norm": 0.25036190245725615, "learning_rate": 4.8480625161598e-07, "loss": 1.6552, "step": 780 }, { "epoch": 0.1177890053540457, "grad_norm": 0.2788788641094789, "learning_rate": 4.847677142344968e-07, "loss": 1.5277, "step": 781 }, { "epoch": 0.11793982354271926, "grad_norm": 0.26398224032511813, "learning_rate": 4.847291297540528e-07, "loss": 1.6165, "step": 782 }, { "epoch": 0.1180906417313928, "grad_norm": 0.2923197817267704, "learning_rate": 4.84690498183311e-07, "loss": 1.6169, "step": 783 }, { "epoch": 0.11824145992006636, "grad_norm": 0.2663331591259635, "learning_rate": 4.846518195309456e-07, "loss": 1.5577, "step": 784 }, { "epoch": 0.11839227810873991, "grad_norm": 0.2535735629174223, "learning_rate": 4.846130938056408e-07, "loss": 1.6244, "step": 785 }, { "epoch": 0.11854309629741347, "grad_norm": 0.4299089038786934, "learning_rate": 4.84574321016092e-07, "loss": 1.5036, "step": 786 }, { "epoch": 0.11869391448608702, "grad_norm": 0.2611073794072947, "learning_rate": 4.845355011710046e-07, "loss": 1.6248, "step": 787 }, { "epoch": 0.11884473267476058, "grad_norm": 0.34084186540244277, "learning_rate": 4.844966342790949e-07, "loss": 1.5502, "step": 788 }, { "epoch": 0.11899555086343414, "grad_norm": 0.6743003737430772, "learning_rate": 4.844577203490895e-07, "loss": 1.4726, "step": 789 }, { "epoch": 0.11914636905210768, "grad_norm": 0.25381971085918803, "learning_rate": 4.844187593897258e-07, "loss": 1.5452, "step": 790 }, { "epoch": 0.11929718724078124, "grad_norm": 0.27775315318526334, "learning_rate": 4.843797514097518e-07, "loss": 1.5751, "step": 791 }, { "epoch": 0.11944800542945479, "grad_norm": 0.27009087648015045, "learning_rate": 4.843406964179256e-07, "loss": 1.4924, "step": 792 }, { "epoch": 0.11959882361812835, "grad_norm": 0.2625048270437366, "learning_rate": 4.843015944230165e-07, "loss": 1.6697, "step": 793 }, { "epoch": 0.11974964180680191, "grad_norm": 0.25396359169098826, "learning_rate": 4.84262445433804e-07, "loss": 1.5235, "step": 794 }, { "epoch": 0.11990045999547545, "grad_norm": 0.24250088924809574, "learning_rate": 4.842232494590779e-07, "loss": 1.6287, "step": 795 }, { "epoch": 0.12005127818414901, "grad_norm": 0.26372563090965606, "learning_rate": 4.841840065076392e-07, "loss": 1.6462, "step": 796 }, { "epoch": 0.12020209637282256, "grad_norm": 0.2527762179440438, "learning_rate": 4.841447165882988e-07, "loss": 1.5983, "step": 797 }, { "epoch": 0.12035291456149612, "grad_norm": 0.2408908261902159, "learning_rate": 4.841053797098787e-07, "loss": 1.6323, "step": 798 }, { "epoch": 0.12050373275016966, "grad_norm": 0.33754799043680367, "learning_rate": 4.84065995881211e-07, "loss": 1.5269, "step": 799 }, { "epoch": 0.12065455093884322, "grad_norm": 0.23290067250893015, "learning_rate": 4.840265651111384e-07, "loss": 1.5957, "step": 800 }, { "epoch": 0.12080536912751678, "grad_norm": 0.2521050639135633, "learning_rate": 4.839870874085147e-07, "loss": 1.5405, "step": 801 }, { "epoch": 0.12095618731619033, "grad_norm": 0.2658430314654051, "learning_rate": 4.839475627822033e-07, "loss": 1.5346, "step": 802 }, { "epoch": 0.12110700550486389, "grad_norm": 0.2605551980486632, "learning_rate": 4.839079912410788e-07, "loss": 1.5528, "step": 803 }, { "epoch": 0.12125782369353744, "grad_norm": 0.25637582876320203, "learning_rate": 4.838683727940262e-07, "loss": 1.514, "step": 804 }, { "epoch": 0.121408641882211, "grad_norm": 0.2964856263950721, "learning_rate": 4.83828707449941e-07, "loss": 1.5536, "step": 805 }, { "epoch": 0.12155946007088454, "grad_norm": 0.24779737219807155, "learning_rate": 4.837889952177293e-07, "loss": 1.5783, "step": 806 }, { "epoch": 0.1217102782595581, "grad_norm": 0.24262288865254855, "learning_rate": 4.837492361063075e-07, "loss": 1.5754, "step": 807 }, { "epoch": 0.12186109644823166, "grad_norm": 0.25703278119585443, "learning_rate": 4.837094301246027e-07, "loss": 1.5389, "step": 808 }, { "epoch": 0.12201191463690521, "grad_norm": 0.24606774271785903, "learning_rate": 4.836695772815527e-07, "loss": 1.5562, "step": 809 }, { "epoch": 0.12216273282557877, "grad_norm": 0.2638024822506071, "learning_rate": 4.836296775861053e-07, "loss": 1.5749, "step": 810 }, { "epoch": 0.12231355101425231, "grad_norm": 0.2443613491949051, "learning_rate": 4.835897310472194e-07, "loss": 1.5665, "step": 811 }, { "epoch": 0.12246436920292587, "grad_norm": 0.26880244150319627, "learning_rate": 4.835497376738641e-07, "loss": 1.5263, "step": 812 }, { "epoch": 0.12261518739159942, "grad_norm": 0.24512045156197035, "learning_rate": 4.83509697475019e-07, "loss": 1.5692, "step": 813 }, { "epoch": 0.12276600558027298, "grad_norm": 0.24820930430496924, "learning_rate": 4.834696104596743e-07, "loss": 1.5603, "step": 814 }, { "epoch": 0.12291682376894654, "grad_norm": 0.42535360243645115, "learning_rate": 4.834294766368308e-07, "loss": 1.6325, "step": 815 }, { "epoch": 0.12306764195762009, "grad_norm": 0.25780958887909644, "learning_rate": 4.833892960154997e-07, "loss": 1.6142, "step": 816 }, { "epoch": 0.12321846014629365, "grad_norm": 0.27275400494629926, "learning_rate": 4.833490686047026e-07, "loss": 1.5863, "step": 817 }, { "epoch": 0.12336927833496719, "grad_norm": 0.2476330246188185, "learning_rate": 4.833087944134719e-07, "loss": 1.6803, "step": 818 }, { "epoch": 0.12352009652364075, "grad_norm": 0.24146678674912134, "learning_rate": 4.832684734508501e-07, "loss": 1.528, "step": 819 }, { "epoch": 0.12367091471231431, "grad_norm": 0.250043437502425, "learning_rate": 4.832281057258907e-07, "loss": 1.5728, "step": 820 }, { "epoch": 0.12382173290098786, "grad_norm": 0.24954721250421497, "learning_rate": 4.831876912476572e-07, "loss": 1.6055, "step": 821 }, { "epoch": 0.12397255108966142, "grad_norm": 0.40705819595065323, "learning_rate": 4.83147230025224e-07, "loss": 1.6186, "step": 822 }, { "epoch": 0.12412336927833496, "grad_norm": 0.32773363272612815, "learning_rate": 4.831067220676756e-07, "loss": 1.5751, "step": 823 }, { "epoch": 0.12427418746700852, "grad_norm": 0.26807163443396315, "learning_rate": 4.830661673841075e-07, "loss": 1.5544, "step": 824 }, { "epoch": 0.12442500565568207, "grad_norm": 0.24261889068804798, "learning_rate": 4.830255659836252e-07, "loss": 1.6377, "step": 825 }, { "epoch": 0.12457582384435563, "grad_norm": 0.34382157172301875, "learning_rate": 4.829849178753449e-07, "loss": 1.5715, "step": 826 }, { "epoch": 0.12472664203302919, "grad_norm": 0.316169860210875, "learning_rate": 4.829442230683935e-07, "loss": 1.581, "step": 827 }, { "epoch": 0.12487746022170274, "grad_norm": 0.2865289199919012, "learning_rate": 4.82903481571908e-07, "loss": 1.5738, "step": 828 }, { "epoch": 0.12502827841037628, "grad_norm": 0.2648798194129721, "learning_rate": 4.828626933950359e-07, "loss": 1.624, "step": 829 }, { "epoch": 0.12517909659904985, "grad_norm": 0.29238962734359203, "learning_rate": 4.828218585469357e-07, "loss": 1.509, "step": 830 }, { "epoch": 0.1253299147877234, "grad_norm": 0.5511438047008799, "learning_rate": 4.827809770367757e-07, "loss": 1.5524, "step": 831 }, { "epoch": 0.12548073297639695, "grad_norm": 0.26740428045118964, "learning_rate": 4.827400488737351e-07, "loss": 1.5907, "step": 832 }, { "epoch": 0.12563155116507052, "grad_norm": 0.26308461790104193, "learning_rate": 4.826990740670035e-07, "loss": 1.64, "step": 833 }, { "epoch": 0.12578236935374407, "grad_norm": 0.24263120576507818, "learning_rate": 4.826580526257809e-07, "loss": 1.5427, "step": 834 }, { "epoch": 0.1259331875424176, "grad_norm": 0.24279432366476036, "learning_rate": 4.826169845592778e-07, "loss": 1.5294, "step": 835 }, { "epoch": 0.12608400573109116, "grad_norm": 0.2590934265442616, "learning_rate": 4.825758698767152e-07, "loss": 1.607, "step": 836 }, { "epoch": 0.12623482391976473, "grad_norm": 1.123102233200613, "learning_rate": 4.825347085873245e-07, "loss": 1.4723, "step": 837 }, { "epoch": 0.12638564210843828, "grad_norm": 0.27876322471312537, "learning_rate": 4.824935007003478e-07, "loss": 1.5882, "step": 838 }, { "epoch": 0.12653646029711182, "grad_norm": 0.26911085288321157, "learning_rate": 4.824522462250371e-07, "loss": 1.6014, "step": 839 }, { "epoch": 0.1266872784857854, "grad_norm": 0.2425461341792373, "learning_rate": 4.824109451706556e-07, "loss": 1.5061, "step": 840 }, { "epoch": 0.12683809667445894, "grad_norm": 0.24613790002709662, "learning_rate": 4.823695975464764e-07, "loss": 1.5627, "step": 841 }, { "epoch": 0.1269889148631325, "grad_norm": 0.25215547445900216, "learning_rate": 4.823282033617832e-07, "loss": 1.5776, "step": 842 }, { "epoch": 0.12713973305180604, "grad_norm": 0.24529552413588745, "learning_rate": 4.822867626258704e-07, "loss": 1.6023, "step": 843 }, { "epoch": 0.1272905512404796, "grad_norm": 0.2550713983118439, "learning_rate": 4.822452753480425e-07, "loss": 1.6527, "step": 844 }, { "epoch": 0.12744136942915316, "grad_norm": 0.2676107121499945, "learning_rate": 4.822037415376146e-07, "loss": 1.6495, "step": 845 }, { "epoch": 0.1275921876178267, "grad_norm": 0.2819066663763421, "learning_rate": 4.821621612039123e-07, "loss": 1.5682, "step": 846 }, { "epoch": 0.12774300580650028, "grad_norm": 0.326742076073443, "learning_rate": 4.821205343562714e-07, "loss": 1.561, "step": 847 }, { "epoch": 0.12789382399517382, "grad_norm": 0.253293756714276, "learning_rate": 4.820788610040388e-07, "loss": 1.5217, "step": 848 }, { "epoch": 0.12804464218384737, "grad_norm": 0.24698120971930787, "learning_rate": 4.82037141156571e-07, "loss": 1.5475, "step": 849 }, { "epoch": 0.12819546037252091, "grad_norm": 0.2552342548833313, "learning_rate": 4.819953748232354e-07, "loss": 1.6362, "step": 850 }, { "epoch": 0.1283462785611945, "grad_norm": 0.2489531724751376, "learning_rate": 4.819535620134098e-07, "loss": 1.4661, "step": 851 }, { "epoch": 0.12849709674986803, "grad_norm": 0.2561641259473689, "learning_rate": 4.819117027364824e-07, "loss": 1.6147, "step": 852 }, { "epoch": 0.12864791493854158, "grad_norm": 0.248751198019234, "learning_rate": 4.818697970018517e-07, "loss": 1.5935, "step": 853 }, { "epoch": 0.12879873312721515, "grad_norm": 0.259225532039798, "learning_rate": 4.81827844818927e-07, "loss": 1.5667, "step": 854 }, { "epoch": 0.1289495513158887, "grad_norm": 0.36433050928979993, "learning_rate": 4.817858461971275e-07, "loss": 1.5721, "step": 855 }, { "epoch": 0.12910036950456225, "grad_norm": 0.2495349007213541, "learning_rate": 4.817438011458835e-07, "loss": 1.5774, "step": 856 }, { "epoch": 0.1292511876932358, "grad_norm": 0.26017903422953986, "learning_rate": 4.817017096746349e-07, "loss": 1.6409, "step": 857 }, { "epoch": 0.12940200588190937, "grad_norm": 0.2577039923092752, "learning_rate": 4.816595717928327e-07, "loss": 1.5082, "step": 858 }, { "epoch": 0.1295528240705829, "grad_norm": 0.25190571322660016, "learning_rate": 4.81617387509938e-07, "loss": 1.6199, "step": 859 }, { "epoch": 0.12970364225925646, "grad_norm": 0.25108026522987853, "learning_rate": 4.815751568354225e-07, "loss": 1.608, "step": 860 }, { "epoch": 0.12985446044793003, "grad_norm": 0.2812535177205302, "learning_rate": 4.815328797787681e-07, "loss": 1.5383, "step": 861 }, { "epoch": 0.13000527863660358, "grad_norm": 0.2912235666420491, "learning_rate": 4.814905563494674e-07, "loss": 1.6123, "step": 862 }, { "epoch": 0.13015609682527712, "grad_norm": 0.23793333653321677, "learning_rate": 4.81448186557023e-07, "loss": 1.5155, "step": 863 }, { "epoch": 0.13030691501395067, "grad_norm": 0.24211422876016309, "learning_rate": 4.814057704109483e-07, "loss": 1.5627, "step": 864 }, { "epoch": 0.13045773320262424, "grad_norm": 0.3005613543933885, "learning_rate": 4.81363307920767e-07, "loss": 1.5363, "step": 865 }, { "epoch": 0.1306085513912978, "grad_norm": 0.26748788516683486, "learning_rate": 4.813207990960131e-07, "loss": 1.5645, "step": 866 }, { "epoch": 0.13075936957997134, "grad_norm": 0.2467019690508945, "learning_rate": 4.812782439462311e-07, "loss": 1.6062, "step": 867 }, { "epoch": 0.1309101877686449, "grad_norm": 0.24050701469204028, "learning_rate": 4.812356424809756e-07, "loss": 1.5592, "step": 868 }, { "epoch": 0.13106100595731845, "grad_norm": 0.24566700637291505, "learning_rate": 4.811929947098125e-07, "loss": 1.4948, "step": 869 }, { "epoch": 0.131211824145992, "grad_norm": 0.24578461985233002, "learning_rate": 4.811503006423168e-07, "loss": 1.5725, "step": 870 }, { "epoch": 0.13136264233466555, "grad_norm": 0.2766474882116995, "learning_rate": 4.81107560288075e-07, "loss": 1.5085, "step": 871 }, { "epoch": 0.13151346052333912, "grad_norm": 0.3032106098903552, "learning_rate": 4.810647736566833e-07, "loss": 1.5573, "step": 872 }, { "epoch": 0.13166427871201267, "grad_norm": 0.26227182055075776, "learning_rate": 4.810219407577488e-07, "loss": 1.5504, "step": 873 }, { "epoch": 0.1318150969006862, "grad_norm": 0.30231661285945577, "learning_rate": 4.809790616008884e-07, "loss": 1.6157, "step": 874 }, { "epoch": 0.1319659150893598, "grad_norm": 0.24138594952190437, "learning_rate": 4.8093613619573e-07, "loss": 1.5301, "step": 875 }, { "epoch": 0.13211673327803333, "grad_norm": 0.28015148103115745, "learning_rate": 4.808931645519115e-07, "loss": 1.6091, "step": 876 }, { "epoch": 0.13226755146670688, "grad_norm": 0.25583472974312804, "learning_rate": 4.808501466790813e-07, "loss": 1.6112, "step": 877 }, { "epoch": 0.13241836965538045, "grad_norm": 0.2550840996404767, "learning_rate": 4.808070825868981e-07, "loss": 1.5721, "step": 878 }, { "epoch": 0.132569187844054, "grad_norm": 0.2386014317840139, "learning_rate": 4.80763972285031e-07, "loss": 1.561, "step": 879 }, { "epoch": 0.13272000603272754, "grad_norm": 0.26463646113746936, "learning_rate": 4.807208157831598e-07, "loss": 1.5963, "step": 880 }, { "epoch": 0.1328708242214011, "grad_norm": 0.24789221098898107, "learning_rate": 4.806776130909741e-07, "loss": 1.5585, "step": 881 }, { "epoch": 0.13302164241007466, "grad_norm": 0.27954549479073476, "learning_rate": 4.806343642181743e-07, "loss": 1.5257, "step": 882 }, { "epoch": 0.1331724605987482, "grad_norm": 0.2583555304842065, "learning_rate": 4.80591069174471e-07, "loss": 1.6395, "step": 883 }, { "epoch": 0.13332327878742176, "grad_norm": 0.2747169063226922, "learning_rate": 4.805477279695852e-07, "loss": 1.6829, "step": 884 }, { "epoch": 0.13347409697609533, "grad_norm": 0.24600349536308114, "learning_rate": 4.805043406132482e-07, "loss": 1.5371, "step": 885 }, { "epoch": 0.13362491516476888, "grad_norm": 0.25980027742127176, "learning_rate": 4.804609071152018e-07, "loss": 1.6184, "step": 886 }, { "epoch": 0.13377573335344242, "grad_norm": 0.24508614968710926, "learning_rate": 4.804174274851981e-07, "loss": 1.6147, "step": 887 }, { "epoch": 0.13392655154211597, "grad_norm": 0.24804260240667214, "learning_rate": 4.803739017329996e-07, "loss": 1.6067, "step": 888 }, { "epoch": 0.13407736973078954, "grad_norm": 0.25405274236319236, "learning_rate": 4.803303298683789e-07, "loss": 1.5351, "step": 889 }, { "epoch": 0.1342281879194631, "grad_norm": 0.27238479371505225, "learning_rate": 4.802867119011193e-07, "loss": 1.5735, "step": 890 }, { "epoch": 0.13437900610813663, "grad_norm": 0.23928993849704544, "learning_rate": 4.802430478410142e-07, "loss": 1.542, "step": 891 }, { "epoch": 0.1345298242968102, "grad_norm": 0.27162527646949547, "learning_rate": 4.801993376978676e-07, "loss": 1.5939, "step": 892 }, { "epoch": 0.13468064248548375, "grad_norm": 0.24118953502840654, "learning_rate": 4.801555814814936e-07, "loss": 1.5485, "step": 893 }, { "epoch": 0.1348314606741573, "grad_norm": 0.24615309745922606, "learning_rate": 4.801117792017168e-07, "loss": 1.6679, "step": 894 }, { "epoch": 0.13498227886283085, "grad_norm": 0.26089481234827916, "learning_rate": 4.80067930868372e-07, "loss": 1.5373, "step": 895 }, { "epoch": 0.13513309705150442, "grad_norm": 0.24563967703761258, "learning_rate": 4.800240364913044e-07, "loss": 1.6071, "step": 896 }, { "epoch": 0.13528391524017797, "grad_norm": 0.3388244706396415, "learning_rate": 4.799800960803698e-07, "loss": 1.581, "step": 897 }, { "epoch": 0.1354347334288515, "grad_norm": 0.2625542380631137, "learning_rate": 4.79936109645434e-07, "loss": 1.523, "step": 898 }, { "epoch": 0.13558555161752509, "grad_norm": 0.25532417159291754, "learning_rate": 4.798920771963731e-07, "loss": 1.6067, "step": 899 }, { "epoch": 0.13573636980619863, "grad_norm": 0.268227095836578, "learning_rate": 4.798479987430737e-07, "loss": 1.5463, "step": 900 }, { "epoch": 0.13588718799487218, "grad_norm": 0.26815668103982254, "learning_rate": 4.798038742954327e-07, "loss": 1.586, "step": 901 }, { "epoch": 0.13603800618354572, "grad_norm": 0.24824025017149767, "learning_rate": 4.797597038633576e-07, "loss": 1.5606, "step": 902 }, { "epoch": 0.1361888243722193, "grad_norm": 0.26842140212655446, "learning_rate": 4.797154874567656e-07, "loss": 1.6167, "step": 903 }, { "epoch": 0.13633964256089284, "grad_norm": 0.2571476298071355, "learning_rate": 4.796712250855846e-07, "loss": 1.5371, "step": 904 }, { "epoch": 0.1364904607495664, "grad_norm": 0.27375277855080987, "learning_rate": 4.796269167597529e-07, "loss": 1.5679, "step": 905 }, { "epoch": 0.13664127893823996, "grad_norm": 0.3057162574463642, "learning_rate": 4.795825624892191e-07, "loss": 1.6045, "step": 906 }, { "epoch": 0.1367920971269135, "grad_norm": 0.2498937260128442, "learning_rate": 4.795381622839418e-07, "loss": 1.6371, "step": 907 }, { "epoch": 0.13694291531558705, "grad_norm": 0.2862944084345334, "learning_rate": 4.794937161538903e-07, "loss": 1.5801, "step": 908 }, { "epoch": 0.1370937335042606, "grad_norm": 0.26138244883566913, "learning_rate": 4.794492241090441e-07, "loss": 1.6468, "step": 909 }, { "epoch": 0.13724455169293417, "grad_norm": 0.26629757391984443, "learning_rate": 4.794046861593928e-07, "loss": 1.5691, "step": 910 }, { "epoch": 0.13739536988160772, "grad_norm": 0.30809370729255164, "learning_rate": 4.793601023149366e-07, "loss": 1.4806, "step": 911 }, { "epoch": 0.13754618807028127, "grad_norm": 0.46536836965333095, "learning_rate": 4.793154725856857e-07, "loss": 1.5647, "step": 912 }, { "epoch": 0.13769700625895484, "grad_norm": 0.2522687711040882, "learning_rate": 4.79270796981661e-07, "loss": 1.5733, "step": 913 }, { "epoch": 0.1378478244476284, "grad_norm": 0.2690001943708058, "learning_rate": 4.792260755128932e-07, "loss": 1.59, "step": 914 }, { "epoch": 0.13799864263630193, "grad_norm": 0.24545110405555004, "learning_rate": 4.791813081894237e-07, "loss": 1.6301, "step": 915 }, { "epoch": 0.13814946082497548, "grad_norm": 0.26644633088028224, "learning_rate": 4.791364950213043e-07, "loss": 1.5873, "step": 916 }, { "epoch": 0.13830027901364905, "grad_norm": 0.251546512526234, "learning_rate": 4.790916360185965e-07, "loss": 1.5496, "step": 917 }, { "epoch": 0.1384510972023226, "grad_norm": 0.25316144412679326, "learning_rate": 4.790467311913727e-07, "loss": 1.5564, "step": 918 }, { "epoch": 0.13860191539099614, "grad_norm": 0.32773966673237986, "learning_rate": 4.790017805497152e-07, "loss": 1.5444, "step": 919 }, { "epoch": 0.13875273357966972, "grad_norm": 0.24546044641204529, "learning_rate": 4.789567841037168e-07, "loss": 1.5814, "step": 920 }, { "epoch": 0.13890355176834326, "grad_norm": 0.29277978064100285, "learning_rate": 4.789117418634804e-07, "loss": 1.5894, "step": 921 }, { "epoch": 0.1390543699570168, "grad_norm": 0.3549813518300409, "learning_rate": 4.788666538391196e-07, "loss": 1.5953, "step": 922 }, { "epoch": 0.13920518814569038, "grad_norm": 0.2685881754179385, "learning_rate": 4.788215200407576e-07, "loss": 1.5124, "step": 923 }, { "epoch": 0.13935600633436393, "grad_norm": 0.23461190696685494, "learning_rate": 4.787763404785284e-07, "loss": 1.5263, "step": 924 }, { "epoch": 0.13950682452303748, "grad_norm": 0.2769659797431462, "learning_rate": 4.787311151625762e-07, "loss": 1.6239, "step": 925 }, { "epoch": 0.13965764271171102, "grad_norm": 0.25856439910948137, "learning_rate": 4.786858441030553e-07, "loss": 1.6261, "step": 926 }, { "epoch": 0.1398084609003846, "grad_norm": 0.30055130559881404, "learning_rate": 4.786405273101304e-07, "loss": 1.5665, "step": 927 }, { "epoch": 0.13995927908905814, "grad_norm": 0.26524355572498903, "learning_rate": 4.785951647939765e-07, "loss": 1.5662, "step": 928 }, { "epoch": 0.1401100972777317, "grad_norm": 0.2615300648761517, "learning_rate": 4.785497565647787e-07, "loss": 1.6526, "step": 929 }, { "epoch": 0.14026091546640526, "grad_norm": 0.24452930953345128, "learning_rate": 4.785043026327327e-07, "loss": 1.6293, "step": 930 }, { "epoch": 0.1404117336550788, "grad_norm": 0.2509944116710881, "learning_rate": 4.784588030080439e-07, "loss": 1.5249, "step": 931 }, { "epoch": 0.14056255184375235, "grad_norm": 0.28509033083799773, "learning_rate": 4.784132577009284e-07, "loss": 1.5579, "step": 932 }, { "epoch": 0.1407133700324259, "grad_norm": 0.25327312061491164, "learning_rate": 4.783676667216125e-07, "loss": 1.5977, "step": 933 }, { "epoch": 0.14086418822109947, "grad_norm": 0.244814281253799, "learning_rate": 4.783220300803329e-07, "loss": 1.5698, "step": 934 }, { "epoch": 0.14101500640977302, "grad_norm": 0.24512065385327506, "learning_rate": 4.78276347787336e-07, "loss": 1.581, "step": 935 }, { "epoch": 0.14116582459844657, "grad_norm": 0.2407057281097465, "learning_rate": 4.78230619852879e-07, "loss": 1.573, "step": 936 }, { "epoch": 0.14131664278712014, "grad_norm": 0.2957663472009309, "learning_rate": 4.781848462872291e-07, "loss": 1.6006, "step": 937 }, { "epoch": 0.14146746097579369, "grad_norm": 0.2523508100196154, "learning_rate": 4.781390271006638e-07, "loss": 1.5686, "step": 938 }, { "epoch": 0.14161827916446723, "grad_norm": 0.26587967742482643, "learning_rate": 4.780931623034708e-07, "loss": 1.588, "step": 939 }, { "epoch": 0.14176909735314078, "grad_norm": 0.24226402477563086, "learning_rate": 4.780472519059482e-07, "loss": 1.57, "step": 940 }, { "epoch": 0.14191991554181435, "grad_norm": 0.32111658741147964, "learning_rate": 4.780012959184042e-07, "loss": 1.5811, "step": 941 }, { "epoch": 0.1420707337304879, "grad_norm": 0.2603441769850403, "learning_rate": 4.779552943511572e-07, "loss": 1.5989, "step": 942 }, { "epoch": 0.14222155191916144, "grad_norm": 0.24997121269396286, "learning_rate": 4.779092472145359e-07, "loss": 1.5094, "step": 943 }, { "epoch": 0.14237237010783502, "grad_norm": 0.24301016836612505, "learning_rate": 4.778631545188791e-07, "loss": 1.5627, "step": 944 }, { "epoch": 0.14252318829650856, "grad_norm": 0.389051844125457, "learning_rate": 4.778170162745363e-07, "loss": 1.5494, "step": 945 }, { "epoch": 0.1426740064851821, "grad_norm": 0.2547230822653501, "learning_rate": 4.777708324918665e-07, "loss": 1.5722, "step": 946 }, { "epoch": 0.14282482467385565, "grad_norm": 0.24358299425546492, "learning_rate": 4.777246031812395e-07, "loss": 1.6627, "step": 947 }, { "epoch": 0.14297564286252923, "grad_norm": 0.2658494143360813, "learning_rate": 4.776783283530351e-07, "loss": 1.6202, "step": 948 }, { "epoch": 0.14312646105120277, "grad_norm": 0.2876997672428938, "learning_rate": 4.776320080176433e-07, "loss": 1.6024, "step": 949 }, { "epoch": 0.14327727923987632, "grad_norm": 0.36983100061415125, "learning_rate": 4.775856421854645e-07, "loss": 1.5943, "step": 950 }, { "epoch": 0.1434280974285499, "grad_norm": 0.2506670735811908, "learning_rate": 4.775392308669091e-07, "loss": 1.5817, "step": 951 }, { "epoch": 0.14357891561722344, "grad_norm": 0.2371938577597567, "learning_rate": 4.774927740723977e-07, "loss": 1.5705, "step": 952 }, { "epoch": 0.143729733805897, "grad_norm": 0.23952843693509399, "learning_rate": 4.774462718123614e-07, "loss": 1.4839, "step": 953 }, { "epoch": 0.14388055199457053, "grad_norm": 0.23949570105139512, "learning_rate": 4.773997240972409e-07, "loss": 1.5263, "step": 954 }, { "epoch": 0.1440313701832441, "grad_norm": 0.2551375984425037, "learning_rate": 4.773531309374881e-07, "loss": 1.6084, "step": 955 }, { "epoch": 0.14418218837191765, "grad_norm": 0.24541644561947223, "learning_rate": 4.773064923435641e-07, "loss": 1.5743, "step": 956 }, { "epoch": 0.1443330065605912, "grad_norm": 0.25515556428521197, "learning_rate": 4.772598083259408e-07, "loss": 1.5043, "step": 957 }, { "epoch": 0.14448382474926477, "grad_norm": 0.2750279009285979, "learning_rate": 4.772130788951e-07, "loss": 1.5227, "step": 958 }, { "epoch": 0.14463464293793832, "grad_norm": 0.25507085734973817, "learning_rate": 4.77166304061534e-07, "loss": 1.5771, "step": 959 }, { "epoch": 0.14478546112661186, "grad_norm": 0.2620215918819778, "learning_rate": 4.771194838357449e-07, "loss": 1.559, "step": 960 }, { "epoch": 0.1449362793152854, "grad_norm": 0.2733625850021945, "learning_rate": 4.770726182282454e-07, "loss": 1.5437, "step": 961 }, { "epoch": 0.14508709750395898, "grad_norm": 0.28518365346685176, "learning_rate": 4.770257072495581e-07, "loss": 1.5703, "step": 962 }, { "epoch": 0.14523791569263253, "grad_norm": 0.25041762802372103, "learning_rate": 4.769787509102158e-07, "loss": 1.6308, "step": 963 }, { "epoch": 0.14538873388130608, "grad_norm": 0.25820967916794474, "learning_rate": 4.769317492207617e-07, "loss": 1.5527, "step": 964 }, { "epoch": 0.14553955206997965, "grad_norm": 0.334784271673938, "learning_rate": 4.76884702191749e-07, "loss": 1.4922, "step": 965 }, { "epoch": 0.1456903702586532, "grad_norm": 0.24810790355705478, "learning_rate": 4.768376098337411e-07, "loss": 1.5765, "step": 966 }, { "epoch": 0.14584118844732674, "grad_norm": 0.28700061399749227, "learning_rate": 4.767904721573117e-07, "loss": 1.6496, "step": 967 }, { "epoch": 0.14599200663600032, "grad_norm": 0.34732089248600745, "learning_rate": 4.767432891730444e-07, "loss": 1.5558, "step": 968 }, { "epoch": 0.14614282482467386, "grad_norm": 0.2673344082419073, "learning_rate": 4.766960608915333e-07, "loss": 1.5894, "step": 969 }, { "epoch": 0.1462936430133474, "grad_norm": 2.653973193991139, "learning_rate": 4.7664878732338253e-07, "loss": 1.5334, "step": 970 }, { "epoch": 0.14644446120202095, "grad_norm": 0.24993909864096386, "learning_rate": 4.7660146847920635e-07, "loss": 1.5961, "step": 971 }, { "epoch": 0.14659527939069453, "grad_norm": 0.2565497081102889, "learning_rate": 4.7655410436962916e-07, "loss": 1.6027, "step": 972 }, { "epoch": 0.14674609757936807, "grad_norm": 0.254279538292664, "learning_rate": 4.765066950052857e-07, "loss": 1.6422, "step": 973 }, { "epoch": 0.14689691576804162, "grad_norm": 0.2657242795261302, "learning_rate": 4.7645924039682066e-07, "loss": 1.6159, "step": 974 }, { "epoch": 0.1470477339567152, "grad_norm": 0.2551289716846203, "learning_rate": 4.7641174055488903e-07, "loss": 1.5766, "step": 975 }, { "epoch": 0.14719855214538874, "grad_norm": 0.2487164615456287, "learning_rate": 4.7636419549015587e-07, "loss": 1.6085, "step": 976 }, { "epoch": 0.14734937033406229, "grad_norm": 0.24934246137057814, "learning_rate": 4.763166052132964e-07, "loss": 1.5588, "step": 977 }, { "epoch": 0.14750018852273583, "grad_norm": 0.25608798985597103, "learning_rate": 4.7626896973499615e-07, "loss": 1.5647, "step": 978 }, { "epoch": 0.1476510067114094, "grad_norm": 0.24874879011014975, "learning_rate": 4.7622128906595054e-07, "loss": 1.5121, "step": 979 }, { "epoch": 0.14780182490008295, "grad_norm": 0.24432845222015379, "learning_rate": 4.761735632168653e-07, "loss": 1.6303, "step": 980 }, { "epoch": 0.1479526430887565, "grad_norm": 0.252842065429307, "learning_rate": 4.761257921984564e-07, "loss": 1.59, "step": 981 }, { "epoch": 0.14810346127743007, "grad_norm": 0.2514307218624979, "learning_rate": 4.760779760214496e-07, "loss": 1.6196, "step": 982 }, { "epoch": 0.14825427946610362, "grad_norm": 0.2667127561644108, "learning_rate": 4.7603011469658126e-07, "loss": 1.58, "step": 983 }, { "epoch": 0.14840509765477716, "grad_norm": 0.24168970436906176, "learning_rate": 4.7598220823459744e-07, "loss": 1.5355, "step": 984 }, { "epoch": 0.1485559158434507, "grad_norm": 0.25620873924454746, "learning_rate": 4.759342566462546e-07, "loss": 1.6269, "step": 985 }, { "epoch": 0.14870673403212428, "grad_norm": 0.2452078703443453, "learning_rate": 4.758862599423194e-07, "loss": 1.6141, "step": 986 }, { "epoch": 0.14885755222079783, "grad_norm": 0.24225881503880753, "learning_rate": 4.758382181335684e-07, "loss": 1.5921, "step": 987 }, { "epoch": 0.14900837040947137, "grad_norm": 0.2612762223581993, "learning_rate": 4.757901312307882e-07, "loss": 1.5693, "step": 988 }, { "epoch": 0.14915918859814495, "grad_norm": 0.303459353963245, "learning_rate": 4.7574199924477605e-07, "loss": 1.5342, "step": 989 }, { "epoch": 0.1493100067868185, "grad_norm": 0.3088514981989386, "learning_rate": 4.7569382218633877e-07, "loss": 1.6037, "step": 990 }, { "epoch": 0.14946082497549204, "grad_norm": 0.2562445993286538, "learning_rate": 4.756456000662935e-07, "loss": 1.5493, "step": 991 }, { "epoch": 0.1496116431641656, "grad_norm": 0.28232305406411545, "learning_rate": 4.755973328954676e-07, "loss": 1.6046, "step": 992 }, { "epoch": 0.14976246135283916, "grad_norm": 0.23837206086373763, "learning_rate": 4.755490206846985e-07, "loss": 1.5531, "step": 993 }, { "epoch": 0.1499132795415127, "grad_norm": 0.28518372943400533, "learning_rate": 4.7550066344483355e-07, "loss": 1.5578, "step": 994 }, { "epoch": 0.15006409773018625, "grad_norm": 0.2549768269904049, "learning_rate": 4.754522611867304e-07, "loss": 1.5646, "step": 995 }, { "epoch": 0.15021491591885983, "grad_norm": 0.2562706132685018, "learning_rate": 4.754038139212569e-07, "loss": 1.589, "step": 996 }, { "epoch": 0.15036573410753337, "grad_norm": 0.2410268153834008, "learning_rate": 4.753553216592907e-07, "loss": 1.4844, "step": 997 }, { "epoch": 0.15051655229620692, "grad_norm": 0.26281148717796365, "learning_rate": 4.7530678441171976e-07, "loss": 1.5636, "step": 998 }, { "epoch": 0.15066737048488046, "grad_norm": 0.30961820231400505, "learning_rate": 4.752582021894422e-07, "loss": 1.6057, "step": 999 }, { "epoch": 0.15081818867355404, "grad_norm": 0.3321593493732633, "learning_rate": 4.752095750033661e-07, "loss": 1.6641, "step": 1000 }, { "epoch": 0.15096900686222758, "grad_norm": 0.2969182772224481, "learning_rate": 4.7516090286440963e-07, "loss": 1.6061, "step": 1001 }, { "epoch": 0.15111982505090113, "grad_norm": 0.2477958419058681, "learning_rate": 4.751121857835012e-07, "loss": 1.597, "step": 1002 }, { "epoch": 0.1512706432395747, "grad_norm": 0.24983981603131114, "learning_rate": 4.7506342377157904e-07, "loss": 1.6094, "step": 1003 }, { "epoch": 0.15142146142824825, "grad_norm": 0.2543220894110329, "learning_rate": 4.750146168395918e-07, "loss": 1.5846, "step": 1004 }, { "epoch": 0.1515722796169218, "grad_norm": 0.2587477761766272, "learning_rate": 4.7496576499849807e-07, "loss": 1.56, "step": 1005 }, { "epoch": 0.15172309780559534, "grad_norm": 0.24154994402700736, "learning_rate": 4.749168682592664e-07, "loss": 1.6037, "step": 1006 }, { "epoch": 0.15187391599426892, "grad_norm": 0.2508792993278851, "learning_rate": 4.748679266328755e-07, "loss": 1.5603, "step": 1007 }, { "epoch": 0.15202473418294246, "grad_norm": 0.2493414013337419, "learning_rate": 4.748189401303144e-07, "loss": 1.5396, "step": 1008 }, { "epoch": 0.152175552371616, "grad_norm": 0.658800165472276, "learning_rate": 4.7476990876258185e-07, "loss": 1.5619, "step": 1009 }, { "epoch": 0.15232637056028958, "grad_norm": 0.23745478283721239, "learning_rate": 4.7472083254068686e-07, "loss": 1.5544, "step": 1010 }, { "epoch": 0.15247718874896313, "grad_norm": 0.305400220510485, "learning_rate": 4.7467171147564835e-07, "loss": 1.6674, "step": 1011 }, { "epoch": 0.15262800693763667, "grad_norm": 0.2785138739432395, "learning_rate": 4.7462254557849557e-07, "loss": 1.5855, "step": 1012 }, { "epoch": 0.15277882512631022, "grad_norm": 0.2434299468753599, "learning_rate": 4.745733348602677e-07, "loss": 1.6288, "step": 1013 }, { "epoch": 0.1529296433149838, "grad_norm": 0.2451373266031179, "learning_rate": 4.745240793320139e-07, "loss": 1.5515, "step": 1014 }, { "epoch": 0.15308046150365734, "grad_norm": 0.2701367378557065, "learning_rate": 4.744747790047935e-07, "loss": 1.516, "step": 1015 }, { "epoch": 0.15323127969233089, "grad_norm": 0.3157527268124156, "learning_rate": 4.7442543388967587e-07, "loss": 1.6163, "step": 1016 }, { "epoch": 0.15338209788100446, "grad_norm": 0.24232869990901512, "learning_rate": 4.7437604399774047e-07, "loss": 1.6275, "step": 1017 }, { "epoch": 0.153532916069678, "grad_norm": 0.24900214045309305, "learning_rate": 4.7432660934007665e-07, "loss": 1.5733, "step": 1018 }, { "epoch": 0.15368373425835155, "grad_norm": 0.2822160839916749, "learning_rate": 4.742771299277841e-07, "loss": 1.594, "step": 1019 }, { "epoch": 0.15383455244702512, "grad_norm": 0.2733829280739275, "learning_rate": 4.742276057719722e-07, "loss": 1.57, "step": 1020 }, { "epoch": 0.15398537063569867, "grad_norm": 0.2460228836578742, "learning_rate": 4.741780368837607e-07, "loss": 1.5509, "step": 1021 }, { "epoch": 0.15413618882437222, "grad_norm": 0.25260122809000746, "learning_rate": 4.7412842327427914e-07, "loss": 1.5806, "step": 1022 }, { "epoch": 0.15428700701304576, "grad_norm": 0.23750399144418952, "learning_rate": 4.7407876495466737e-07, "loss": 1.5771, "step": 1023 }, { "epoch": 0.15443782520171934, "grad_norm": 0.26808088140916786, "learning_rate": 4.7402906193607506e-07, "loss": 1.6318, "step": 1024 }, { "epoch": 0.15458864339039288, "grad_norm": 0.24828288210625818, "learning_rate": 4.73979314229662e-07, "loss": 1.5416, "step": 1025 }, { "epoch": 0.15473946157906643, "grad_norm": 0.2436099220209187, "learning_rate": 4.739295218465979e-07, "loss": 1.5476, "step": 1026 }, { "epoch": 0.15489027976774, "grad_norm": 0.3083906756582321, "learning_rate": 4.7387968479806266e-07, "loss": 1.5377, "step": 1027 }, { "epoch": 0.15504109795641355, "grad_norm": 0.4550389188653999, "learning_rate": 4.738298030952462e-07, "loss": 1.6148, "step": 1028 }, { "epoch": 0.1551919161450871, "grad_norm": 0.2626493565680759, "learning_rate": 4.737798767493484e-07, "loss": 1.5929, "step": 1029 }, { "epoch": 0.15534273433376064, "grad_norm": 0.25785948030370415, "learning_rate": 4.7372990577157917e-07, "loss": 1.5832, "step": 1030 }, { "epoch": 0.15549355252243421, "grad_norm": 0.2610654083785875, "learning_rate": 4.736798901731585e-07, "loss": 1.5268, "step": 1031 }, { "epoch": 0.15564437071110776, "grad_norm": 0.2810034490504235, "learning_rate": 4.736298299653161e-07, "loss": 1.577, "step": 1032 }, { "epoch": 0.1557951888997813, "grad_norm": 0.5772108063423994, "learning_rate": 4.7357972515929227e-07, "loss": 1.561, "step": 1033 }, { "epoch": 0.15594600708845488, "grad_norm": 0.25471771878994864, "learning_rate": 4.7352957576633683e-07, "loss": 1.6665, "step": 1034 }, { "epoch": 0.15609682527712843, "grad_norm": 0.25694693768929777, "learning_rate": 4.7347938179770985e-07, "loss": 1.5935, "step": 1035 }, { "epoch": 0.15624764346580197, "grad_norm": 0.3104727488339877, "learning_rate": 4.734291432646813e-07, "loss": 1.4999, "step": 1036 }, { "epoch": 0.15639846165447552, "grad_norm": 0.2975832370733511, "learning_rate": 4.733788601785311e-07, "loss": 1.5515, "step": 1037 }, { "epoch": 0.1565492798431491, "grad_norm": 0.27642272849022204, "learning_rate": 4.733285325505494e-07, "loss": 1.6036, "step": 1038 }, { "epoch": 0.15670009803182264, "grad_norm": 0.24721769073668393, "learning_rate": 4.7327816039203624e-07, "loss": 1.5687, "step": 1039 }, { "epoch": 0.15685091622049618, "grad_norm": 0.2956038050337575, "learning_rate": 4.732277437143015e-07, "loss": 1.5981, "step": 1040 }, { "epoch": 0.15700173440916976, "grad_norm": 0.2718469053700773, "learning_rate": 4.731772825286653e-07, "loss": 1.5874, "step": 1041 }, { "epoch": 0.1571525525978433, "grad_norm": 0.25197611213411847, "learning_rate": 4.7312677684645755e-07, "loss": 1.5553, "step": 1042 }, { "epoch": 0.15730337078651685, "grad_norm": 0.29647496304966825, "learning_rate": 4.730762266790184e-07, "loss": 1.5247, "step": 1043 }, { "epoch": 0.1574541889751904, "grad_norm": 0.26208550422646826, "learning_rate": 4.730256320376977e-07, "loss": 1.5189, "step": 1044 }, { "epoch": 0.15760500716386397, "grad_norm": 0.2647759645027192, "learning_rate": 4.7297499293385545e-07, "loss": 1.6032, "step": 1045 }, { "epoch": 0.15775582535253752, "grad_norm": 0.31315008832451746, "learning_rate": 4.729243093788617e-07, "loss": 1.6115, "step": 1046 }, { "epoch": 0.15790664354121106, "grad_norm": 0.2924108623243284, "learning_rate": 4.728735813840962e-07, "loss": 1.6144, "step": 1047 }, { "epoch": 0.15805746172988464, "grad_norm": 0.2616373957240464, "learning_rate": 4.72822808960949e-07, "loss": 1.5285, "step": 1048 }, { "epoch": 0.15820827991855818, "grad_norm": 0.24923023915333234, "learning_rate": 4.7277199212082e-07, "loss": 1.6548, "step": 1049 }, { "epoch": 0.15835909810723173, "grad_norm": 0.2551933398409467, "learning_rate": 4.727211308751189e-07, "loss": 1.6165, "step": 1050 }, { "epoch": 0.15850991629590527, "grad_norm": 0.2593180448603392, "learning_rate": 4.726702252352657e-07, "loss": 1.5991, "step": 1051 }, { "epoch": 0.15866073448457885, "grad_norm": 0.2522277006422934, "learning_rate": 4.726192752126902e-07, "loss": 1.577, "step": 1052 }, { "epoch": 0.1588115526732524, "grad_norm": 0.24196721815033448, "learning_rate": 4.72568280818832e-07, "loss": 1.5836, "step": 1053 }, { "epoch": 0.15896237086192594, "grad_norm": 0.3618317509241435, "learning_rate": 4.72517242065141e-07, "loss": 1.5203, "step": 1054 }, { "epoch": 0.1591131890505995, "grad_norm": 0.25454939085154465, "learning_rate": 4.724661589630768e-07, "loss": 1.5784, "step": 1055 }, { "epoch": 0.15926400723927306, "grad_norm": 0.3109584087928939, "learning_rate": 4.7241503152410907e-07, "loss": 1.5591, "step": 1056 }, { "epoch": 0.1594148254279466, "grad_norm": 0.7106449826146803, "learning_rate": 4.723638597597174e-07, "loss": 1.5349, "step": 1057 }, { "epoch": 0.15956564361662015, "grad_norm": 0.25701180952300506, "learning_rate": 4.7231264368139134e-07, "loss": 1.5358, "step": 1058 }, { "epoch": 0.15971646180529372, "grad_norm": 0.2617384036431046, "learning_rate": 4.7226138330063037e-07, "loss": 1.6136, "step": 1059 }, { "epoch": 0.15986727999396727, "grad_norm": 0.248514514370961, "learning_rate": 4.72210078628944e-07, "loss": 1.5699, "step": 1060 }, { "epoch": 0.16001809818264082, "grad_norm": 0.4566482953295286, "learning_rate": 4.7215872967785157e-07, "loss": 1.4869, "step": 1061 }, { "epoch": 0.1601689163713144, "grad_norm": 0.24761995402178832, "learning_rate": 4.7210733645888246e-07, "loss": 1.6156, "step": 1062 }, { "epoch": 0.16031973455998794, "grad_norm": 0.250070974124865, "learning_rate": 4.720558989835758e-07, "loss": 1.5712, "step": 1063 }, { "epoch": 0.16047055274866148, "grad_norm": 0.2786032458759379, "learning_rate": 4.720044172634811e-07, "loss": 1.583, "step": 1064 }, { "epoch": 0.16062137093733506, "grad_norm": 0.23855863462524987, "learning_rate": 4.719528913101572e-07, "loss": 1.5799, "step": 1065 }, { "epoch": 0.1607721891260086, "grad_norm": 0.32775340260671904, "learning_rate": 4.719013211351733e-07, "loss": 1.5525, "step": 1066 }, { "epoch": 0.16092300731468215, "grad_norm": 0.2502199020901237, "learning_rate": 4.718497067501084e-07, "loss": 1.5793, "step": 1067 }, { "epoch": 0.1610738255033557, "grad_norm": 0.2769500848160153, "learning_rate": 4.717980481665515e-07, "loss": 1.578, "step": 1068 }, { "epoch": 0.16122464369202927, "grad_norm": 0.2811566565102702, "learning_rate": 4.7174634539610135e-07, "loss": 1.5422, "step": 1069 }, { "epoch": 0.16137546188070281, "grad_norm": 0.26690201580123935, "learning_rate": 4.7169459845036685e-07, "loss": 1.5062, "step": 1070 }, { "epoch": 0.16152628006937636, "grad_norm": 0.24278929565756457, "learning_rate": 4.716428073409665e-07, "loss": 1.5764, "step": 1071 }, { "epoch": 0.16167709825804993, "grad_norm": 0.26380707944919146, "learning_rate": 4.7159097207952907e-07, "loss": 1.5759, "step": 1072 }, { "epoch": 0.16182791644672348, "grad_norm": 0.2401141242040836, "learning_rate": 4.715390926776931e-07, "loss": 1.553, "step": 1073 }, { "epoch": 0.16197873463539703, "grad_norm": 0.29678926738553935, "learning_rate": 4.7148716914710696e-07, "loss": 1.5737, "step": 1074 }, { "epoch": 0.16212955282407057, "grad_norm": 0.24725309737718767, "learning_rate": 4.7143520149942894e-07, "loss": 1.5078, "step": 1075 }, { "epoch": 0.16228037101274415, "grad_norm": 0.25803787562110775, "learning_rate": 4.713831897463274e-07, "loss": 1.5073, "step": 1076 }, { "epoch": 0.1624311892014177, "grad_norm": 0.26833123309240886, "learning_rate": 4.7133113389948044e-07, "loss": 1.5977, "step": 1077 }, { "epoch": 0.16258200739009124, "grad_norm": 0.24091943366927152, "learning_rate": 4.712790339705761e-07, "loss": 1.5915, "step": 1078 }, { "epoch": 0.1627328255787648, "grad_norm": 0.2454951827903659, "learning_rate": 4.7122688997131244e-07, "loss": 1.5016, "step": 1079 }, { "epoch": 0.16288364376743836, "grad_norm": 0.327938893492718, "learning_rate": 4.711747019133972e-07, "loss": 1.5499, "step": 1080 }, { "epoch": 0.1630344619561119, "grad_norm": 0.2709383069557141, "learning_rate": 4.71122469808548e-07, "loss": 1.5399, "step": 1081 }, { "epoch": 0.16318528014478545, "grad_norm": 0.2581676243463674, "learning_rate": 4.7107019366849276e-07, "loss": 1.6087, "step": 1082 }, { "epoch": 0.16333609833345902, "grad_norm": 0.3892574896237313, "learning_rate": 4.710178735049688e-07, "loss": 1.5812, "step": 1083 }, { "epoch": 0.16348691652213257, "grad_norm": 0.270203428554805, "learning_rate": 4.709655093297236e-07, "loss": 1.5764, "step": 1084 }, { "epoch": 0.16363773471080612, "grad_norm": 0.28517256024517534, "learning_rate": 4.7091310115451437e-07, "loss": 1.6103, "step": 1085 }, { "epoch": 0.1637885528994797, "grad_norm": 0.27840669121440514, "learning_rate": 4.7086064899110837e-07, "loss": 1.5659, "step": 1086 }, { "epoch": 0.16393937108815324, "grad_norm": 0.2461045836784305, "learning_rate": 4.708081528512825e-07, "loss": 1.6039, "step": 1087 }, { "epoch": 0.16409018927682678, "grad_norm": 0.7459017336078355, "learning_rate": 4.707556127468238e-07, "loss": 1.6061, "step": 1088 }, { "epoch": 0.16424100746550033, "grad_norm": 0.2561621447257331, "learning_rate": 4.70703028689529e-07, "loss": 1.5939, "step": 1089 }, { "epoch": 0.1643918256541739, "grad_norm": 0.2849028517073751, "learning_rate": 4.7065040069120474e-07, "loss": 1.5602, "step": 1090 }, { "epoch": 0.16454264384284745, "grad_norm": 0.24979491525986122, "learning_rate": 4.7059772876366755e-07, "loss": 1.5781, "step": 1091 }, { "epoch": 0.164693462031521, "grad_norm": 0.24168740791476312, "learning_rate": 4.705450129187438e-07, "loss": 1.5414, "step": 1092 }, { "epoch": 0.16484428022019457, "grad_norm": 0.24173643436887862, "learning_rate": 4.7049225316826986e-07, "loss": 1.4875, "step": 1093 }, { "epoch": 0.1649950984088681, "grad_norm": 0.3580945143021086, "learning_rate": 4.704394495240916e-07, "loss": 1.5554, "step": 1094 }, { "epoch": 0.16514591659754166, "grad_norm": 0.27131705981391885, "learning_rate": 4.7038660199806514e-07, "loss": 1.5368, "step": 1095 }, { "epoch": 0.1652967347862152, "grad_norm": 0.2525411683704331, "learning_rate": 4.7033371060205626e-07, "loss": 1.5617, "step": 1096 }, { "epoch": 0.16544755297488878, "grad_norm": 0.2426665694554285, "learning_rate": 4.702807753479406e-07, "loss": 1.5246, "step": 1097 }, { "epoch": 0.16559837116356232, "grad_norm": 0.25346540257856687, "learning_rate": 4.702277962476036e-07, "loss": 1.5931, "step": 1098 }, { "epoch": 0.16574918935223587, "grad_norm": 0.24568700514771802, "learning_rate": 4.7017477331294075e-07, "loss": 1.6475, "step": 1099 }, { "epoch": 0.16590000754090944, "grad_norm": 0.2506897867451099, "learning_rate": 4.701217065558572e-07, "loss": 1.6134, "step": 1100 }, { "epoch": 0.166050825729583, "grad_norm": 0.24960037855627876, "learning_rate": 4.7006859598826787e-07, "loss": 1.5192, "step": 1101 }, { "epoch": 0.16620164391825654, "grad_norm": 0.2545680815817411, "learning_rate": 4.700154416220978e-07, "loss": 1.6317, "step": 1102 }, { "epoch": 0.16635246210693008, "grad_norm": 0.2876698360184517, "learning_rate": 4.699622434692816e-07, "loss": 1.551, "step": 1103 }, { "epoch": 0.16650328029560366, "grad_norm": 0.28949239161364276, "learning_rate": 4.6990900154176374e-07, "loss": 1.5751, "step": 1104 }, { "epoch": 0.1666540984842772, "grad_norm": 0.27888008836701456, "learning_rate": 4.698557158514987e-07, "loss": 1.5704, "step": 1105 }, { "epoch": 0.16680491667295075, "grad_norm": 0.25306042514402155, "learning_rate": 4.6980238641045063e-07, "loss": 1.6322, "step": 1106 }, { "epoch": 0.16695573486162432, "grad_norm": 0.27416420160692634, "learning_rate": 4.6974901323059344e-07, "loss": 1.5494, "step": 1107 }, { "epoch": 0.16710655305029787, "grad_norm": 0.27633683131886433, "learning_rate": 4.6969559632391126e-07, "loss": 1.5587, "step": 1108 }, { "epoch": 0.16725737123897141, "grad_norm": 0.23453200202259325, "learning_rate": 4.6964213570239734e-07, "loss": 1.5099, "step": 1109 }, { "epoch": 0.167408189427645, "grad_norm": 0.25539051826846015, "learning_rate": 4.6958863137805555e-07, "loss": 1.5002, "step": 1110 }, { "epoch": 0.16755900761631853, "grad_norm": 0.24623451238381902, "learning_rate": 4.6953508336289884e-07, "loss": 1.5422, "step": 1111 }, { "epoch": 0.16770982580499208, "grad_norm": 0.24411710831121344, "learning_rate": 4.6948149166895047e-07, "loss": 1.5739, "step": 1112 }, { "epoch": 0.16786064399366563, "grad_norm": 0.7545924295706974, "learning_rate": 4.694278563082432e-07, "loss": 1.5068, "step": 1113 }, { "epoch": 0.1680114621823392, "grad_norm": 0.24330881130548362, "learning_rate": 4.693741772928199e-07, "loss": 1.538, "step": 1114 }, { "epoch": 0.16816228037101275, "grad_norm": 0.25616718301837377, "learning_rate": 4.6932045463473296e-07, "loss": 1.6199, "step": 1115 }, { "epoch": 0.1683130985596863, "grad_norm": 0.247945338635809, "learning_rate": 4.6926668834604485e-07, "loss": 1.5797, "step": 1116 }, { "epoch": 0.16846391674835987, "grad_norm": 0.2753322662696697, "learning_rate": 4.692128784388274e-07, "loss": 1.5547, "step": 1117 }, { "epoch": 0.1686147349370334, "grad_norm": 0.2896067230267288, "learning_rate": 4.6915902492516264e-07, "loss": 1.6004, "step": 1118 }, { "epoch": 0.16876555312570696, "grad_norm": 0.2941113608129385, "learning_rate": 4.6910512781714227e-07, "loss": 1.5139, "step": 1119 }, { "epoch": 0.1689163713143805, "grad_norm": 0.25923390975079347, "learning_rate": 4.6905118712686775e-07, "loss": 1.532, "step": 1120 }, { "epoch": 0.16906718950305408, "grad_norm": 0.24243194337292528, "learning_rate": 4.689972028664503e-07, "loss": 1.5278, "step": 1121 }, { "epoch": 0.16921800769172762, "grad_norm": 0.2608954624809244, "learning_rate": 4.689431750480111e-07, "loss": 1.5048, "step": 1122 }, { "epoch": 0.16936882588040117, "grad_norm": 0.24984681694708277, "learning_rate": 4.688891036836807e-07, "loss": 1.5663, "step": 1123 }, { "epoch": 0.16951964406907474, "grad_norm": 0.3197763053893994, "learning_rate": 4.6883498878559996e-07, "loss": 1.5756, "step": 1124 }, { "epoch": 0.1696704622577483, "grad_norm": 0.24561609481331534, "learning_rate": 4.68780830365919e-07, "loss": 1.5683, "step": 1125 }, { "epoch": 0.16982128044642184, "grad_norm": 0.28356897070283404, "learning_rate": 4.687266284367982e-07, "loss": 1.5617, "step": 1126 }, { "epoch": 0.16997209863509538, "grad_norm": 0.2818725478575507, "learning_rate": 4.686723830104073e-07, "loss": 1.5588, "step": 1127 }, { "epoch": 0.17012291682376895, "grad_norm": 0.24702190052737416, "learning_rate": 4.686180940989262e-07, "loss": 1.65, "step": 1128 }, { "epoch": 0.1702737350124425, "grad_norm": 0.235138614450344, "learning_rate": 4.6856376171454407e-07, "loss": 1.5549, "step": 1129 }, { "epoch": 0.17042455320111605, "grad_norm": 0.2628161495605817, "learning_rate": 4.685093858694603e-07, "loss": 1.5571, "step": 1130 }, { "epoch": 0.17057537138978962, "grad_norm": 0.3027638920075395, "learning_rate": 4.684549665758838e-07, "loss": 1.5786, "step": 1131 }, { "epoch": 0.17072618957846317, "grad_norm": 0.25499204449953544, "learning_rate": 4.684005038460332e-07, "loss": 1.552, "step": 1132 }, { "epoch": 0.1708770077671367, "grad_norm": 0.2709751281073486, "learning_rate": 4.6834599769213706e-07, "loss": 1.6273, "step": 1133 }, { "epoch": 0.17102782595581026, "grad_norm": 0.24112103680979602, "learning_rate": 4.682914481264336e-07, "loss": 1.595, "step": 1134 }, { "epoch": 0.17117864414448383, "grad_norm": 0.24147697799095597, "learning_rate": 4.682368551611707e-07, "loss": 1.588, "step": 1135 }, { "epoch": 0.17132946233315738, "grad_norm": 0.2804421696394345, "learning_rate": 4.6818221880860624e-07, "loss": 1.5457, "step": 1136 }, { "epoch": 0.17148028052183092, "grad_norm": 0.2744210161984986, "learning_rate": 4.6812753908100755e-07, "loss": 1.5371, "step": 1137 }, { "epoch": 0.1716310987105045, "grad_norm": 0.262449872729529, "learning_rate": 4.680728159906518e-07, "loss": 1.5695, "step": 1138 }, { "epoch": 0.17178191689917804, "grad_norm": 0.24442008725733502, "learning_rate": 4.680180495498259e-07, "loss": 1.6474, "step": 1139 }, { "epoch": 0.1719327350878516, "grad_norm": 0.2704581218611592, "learning_rate": 4.6796323977082673e-07, "loss": 1.5642, "step": 1140 }, { "epoch": 0.17208355327652514, "grad_norm": 0.25987740785314206, "learning_rate": 4.679083866659604e-07, "loss": 1.5364, "step": 1141 }, { "epoch": 0.1722343714651987, "grad_norm": 0.25998785665147006, "learning_rate": 4.6785349024754316e-07, "loss": 1.5816, "step": 1142 }, { "epoch": 0.17238518965387226, "grad_norm": 0.25789553553819966, "learning_rate": 4.677985505279009e-07, "loss": 1.5617, "step": 1143 }, { "epoch": 0.1725360078425458, "grad_norm": 0.28434801761107686, "learning_rate": 4.677435675193691e-07, "loss": 1.5721, "step": 1144 }, { "epoch": 0.17268682603121938, "grad_norm": 0.23661054651052396, "learning_rate": 4.6768854123429315e-07, "loss": 1.5955, "step": 1145 }, { "epoch": 0.17283764421989292, "grad_norm": 0.2629669952322758, "learning_rate": 4.676334716850279e-07, "loss": 1.6193, "step": 1146 }, { "epoch": 0.17298846240856647, "grad_norm": 0.26201173668106353, "learning_rate": 4.675783588839381e-07, "loss": 1.5898, "step": 1147 }, { "epoch": 0.17313928059724001, "grad_norm": 0.24778101315185597, "learning_rate": 4.675232028433983e-07, "loss": 1.5255, "step": 1148 }, { "epoch": 0.1732900987859136, "grad_norm": 0.24200358612342673, "learning_rate": 4.674680035757925e-07, "loss": 1.5684, "step": 1149 }, { "epoch": 0.17344091697458713, "grad_norm": 0.25224952718930654, "learning_rate": 4.6741276109351475e-07, "loss": 1.5593, "step": 1150 }, { "epoch": 0.17359173516326068, "grad_norm": 0.3384173809607694, "learning_rate": 4.6735747540896834e-07, "loss": 1.5833, "step": 1151 }, { "epoch": 0.17374255335193425, "grad_norm": 0.2524479053804888, "learning_rate": 4.673021465345667e-07, "loss": 1.5164, "step": 1152 }, { "epoch": 0.1738933715406078, "grad_norm": 0.28287920840389524, "learning_rate": 4.6724677448273266e-07, "loss": 1.5673, "step": 1153 }, { "epoch": 0.17404418972928135, "grad_norm": 0.2591593514038448, "learning_rate": 4.6719135926589894e-07, "loss": 1.6015, "step": 1154 }, { "epoch": 0.1741950079179549, "grad_norm": 0.24488990280488418, "learning_rate": 4.671359008965077e-07, "loss": 1.5585, "step": 1155 }, { "epoch": 0.17434582610662847, "grad_norm": 0.4787738584719552, "learning_rate": 4.6708039938701125e-07, "loss": 1.5247, "step": 1156 }, { "epoch": 0.174496644295302, "grad_norm": 0.25065181476230003, "learning_rate": 4.6702485474987106e-07, "loss": 1.5832, "step": 1157 }, { "epoch": 0.17464746248397556, "grad_norm": 0.2562710623733999, "learning_rate": 4.669692669975587e-07, "loss": 1.5467, "step": 1158 }, { "epoch": 0.17479828067264913, "grad_norm": 0.25990482455141345, "learning_rate": 4.6691363614255495e-07, "loss": 1.5911, "step": 1159 }, { "epoch": 0.17494909886132268, "grad_norm": 1.1721035851483081, "learning_rate": 4.6685796219735085e-07, "loss": 1.6056, "step": 1160 }, { "epoch": 0.17509991704999622, "grad_norm": 0.4569980453387458, "learning_rate": 4.6680224517444677e-07, "loss": 1.536, "step": 1161 }, { "epoch": 0.1752507352386698, "grad_norm": 0.26157794262597983, "learning_rate": 4.6674648508635265e-07, "loss": 1.5472, "step": 1162 }, { "epoch": 0.17540155342734334, "grad_norm": 0.5498314042925766, "learning_rate": 4.6669068194558845e-07, "loss": 1.5841, "step": 1163 }, { "epoch": 0.1755523716160169, "grad_norm": 0.25302157568776995, "learning_rate": 4.666348357646835e-07, "loss": 1.5322, "step": 1164 }, { "epoch": 0.17570318980469044, "grad_norm": 0.2614995286773213, "learning_rate": 4.665789465561768e-07, "loss": 1.6134, "step": 1165 }, { "epoch": 0.175854007993364, "grad_norm": 0.26952547735276366, "learning_rate": 4.665230143326172e-07, "loss": 1.552, "step": 1166 }, { "epoch": 0.17600482618203755, "grad_norm": 0.24225918451469886, "learning_rate": 4.6646703910656323e-07, "loss": 1.5364, "step": 1167 }, { "epoch": 0.1761556443707111, "grad_norm": 0.25182051284504964, "learning_rate": 4.664110208905828e-07, "loss": 1.6144, "step": 1168 }, { "epoch": 0.17630646255938467, "grad_norm": 0.2730450933795206, "learning_rate": 4.663549596972536e-07, "loss": 1.6151, "step": 1169 }, { "epoch": 0.17645728074805822, "grad_norm": 0.2557698545135914, "learning_rate": 4.6629885553916317e-07, "loss": 1.5636, "step": 1170 }, { "epoch": 0.17660809893673177, "grad_norm": 0.2650047668822396, "learning_rate": 4.6624270842890837e-07, "loss": 1.4867, "step": 1171 }, { "epoch": 0.1767589171254053, "grad_norm": 0.2763224194988131, "learning_rate": 4.6618651837909594e-07, "loss": 1.5732, "step": 1172 }, { "epoch": 0.1769097353140789, "grad_norm": 0.24415437934266487, "learning_rate": 4.661302854023422e-07, "loss": 1.5438, "step": 1173 }, { "epoch": 0.17706055350275243, "grad_norm": 0.2499587713726338, "learning_rate": 4.6607400951127303e-07, "loss": 1.6016, "step": 1174 }, { "epoch": 0.17721137169142598, "grad_norm": 0.25085110818024176, "learning_rate": 4.6601769071852404e-07, "loss": 1.5574, "step": 1175 }, { "epoch": 0.17736218988009955, "grad_norm": 0.28681072029014526, "learning_rate": 4.6596132903674045e-07, "loss": 1.6581, "step": 1176 }, { "epoch": 0.1775130080687731, "grad_norm": 0.24192430815712648, "learning_rate": 4.6590492447857703e-07, "loss": 1.5414, "step": 1177 }, { "epoch": 0.17766382625744664, "grad_norm": 0.24901022969070188, "learning_rate": 4.6584847705669836e-07, "loss": 1.6126, "step": 1178 }, { "epoch": 0.1778146444461202, "grad_norm": 0.2693770304622494, "learning_rate": 4.657919867837785e-07, "loss": 1.6803, "step": 1179 }, { "epoch": 0.17796546263479376, "grad_norm": 0.27253275460786325, "learning_rate": 4.657354536725011e-07, "loss": 1.5254, "step": 1180 }, { "epoch": 0.1781162808234673, "grad_norm": 0.28893955719910797, "learning_rate": 4.656788777355594e-07, "loss": 1.6278, "step": 1181 }, { "epoch": 0.17826709901214086, "grad_norm": 0.25047897466701746, "learning_rate": 4.656222589856566e-07, "loss": 1.6276, "step": 1182 }, { "epoch": 0.17841791720081443, "grad_norm": 0.4119551108041056, "learning_rate": 4.6556559743550503e-07, "loss": 1.5875, "step": 1183 }, { "epoch": 0.17856873538948798, "grad_norm": 0.25089661197311985, "learning_rate": 4.6550889309782706e-07, "loss": 1.6185, "step": 1184 }, { "epoch": 0.17871955357816152, "grad_norm": 0.26179055051036315, "learning_rate": 4.6545214598535425e-07, "loss": 1.6112, "step": 1185 }, { "epoch": 0.17887037176683507, "grad_norm": 0.26197715582258846, "learning_rate": 4.653953561108282e-07, "loss": 1.5322, "step": 1186 }, { "epoch": 0.17902118995550864, "grad_norm": 0.26613377733800503, "learning_rate": 4.653385234869997e-07, "loss": 1.5794, "step": 1187 }, { "epoch": 0.1791720081441822, "grad_norm": 0.27556089428854624, "learning_rate": 4.652816481266294e-07, "loss": 1.5722, "step": 1188 }, { "epoch": 0.17932282633285573, "grad_norm": 0.24865359633810788, "learning_rate": 4.6522473004248753e-07, "loss": 1.6073, "step": 1189 }, { "epoch": 0.1794736445215293, "grad_norm": 0.257044327015407, "learning_rate": 4.6516776924735377e-07, "loss": 1.5689, "step": 1190 }, { "epoch": 0.17962446271020285, "grad_norm": 0.26973739450249506, "learning_rate": 4.651107657540176e-07, "loss": 1.6174, "step": 1191 }, { "epoch": 0.1797752808988764, "grad_norm": 0.2868057332040489, "learning_rate": 4.650537195752778e-07, "loss": 1.6308, "step": 1192 }, { "epoch": 0.17992609908754995, "grad_norm": 0.3133909161724188, "learning_rate": 4.6499663072394297e-07, "loss": 1.6096, "step": 1193 }, { "epoch": 0.18007691727622352, "grad_norm": 0.24387235006379238, "learning_rate": 4.649394992128313e-07, "loss": 1.6036, "step": 1194 }, { "epoch": 0.18022773546489707, "grad_norm": 0.25947348393779196, "learning_rate": 4.648823250547703e-07, "loss": 1.5708, "step": 1195 }, { "epoch": 0.1803785536535706, "grad_norm": 0.2523203292864254, "learning_rate": 4.648251082625975e-07, "loss": 1.5055, "step": 1196 }, { "epoch": 0.18052937184224419, "grad_norm": 0.2652017675976741, "learning_rate": 4.647678488491594e-07, "loss": 1.5491, "step": 1197 }, { "epoch": 0.18068019003091773, "grad_norm": 0.26212973358226843, "learning_rate": 4.6471054682731265e-07, "loss": 1.5581, "step": 1198 }, { "epoch": 0.18083100821959128, "grad_norm": 0.25981702383470334, "learning_rate": 4.6465320220992313e-07, "loss": 1.5524, "step": 1199 }, { "epoch": 0.18098182640826482, "grad_norm": 0.2526846428134517, "learning_rate": 4.6459581500986643e-07, "loss": 1.5485, "step": 1200 }, { "epoch": 0.1811326445969384, "grad_norm": 0.26353334465494704, "learning_rate": 4.6453838524002766e-07, "loss": 1.5731, "step": 1201 }, { "epoch": 0.18128346278561194, "grad_norm": 0.243742854481381, "learning_rate": 4.644809129133014e-07, "loss": 1.5511, "step": 1202 }, { "epoch": 0.1814342809742855, "grad_norm": 0.26479887844214733, "learning_rate": 4.644233980425919e-07, "loss": 1.6034, "step": 1203 }, { "epoch": 0.18158509916295906, "grad_norm": 0.2691086005875682, "learning_rate": 4.643658406408129e-07, "loss": 1.6087, "step": 1204 }, { "epoch": 0.1817359173516326, "grad_norm": 0.2697227597209044, "learning_rate": 4.643082407208878e-07, "loss": 1.5873, "step": 1205 }, { "epoch": 0.18188673554030615, "grad_norm": 0.24146757915863593, "learning_rate": 4.642505982957493e-07, "loss": 1.5692, "step": 1206 }, { "epoch": 0.18203755372897973, "grad_norm": 0.2532703897938818, "learning_rate": 4.6419291337834e-07, "loss": 1.5492, "step": 1207 }, { "epoch": 0.18218837191765327, "grad_norm": 1.700524666935398, "learning_rate": 4.641351859816117e-07, "loss": 1.5766, "step": 1208 }, { "epoch": 0.18233919010632682, "grad_norm": 0.25003078506309195, "learning_rate": 4.6407741611852586e-07, "loss": 1.6277, "step": 1209 }, { "epoch": 0.18249000829500037, "grad_norm": 0.25690734785468633, "learning_rate": 4.640196038020536e-07, "loss": 1.5871, "step": 1210 }, { "epoch": 0.18264082648367394, "grad_norm": 0.24526265142746212, "learning_rate": 4.6396174904517536e-07, "loss": 1.5454, "step": 1211 }, { "epoch": 0.1827916446723475, "grad_norm": 0.29939366930004707, "learning_rate": 4.639038518608814e-07, "loss": 1.5375, "step": 1212 }, { "epoch": 0.18294246286102103, "grad_norm": 0.25046227529852316, "learning_rate": 4.6384591226217116e-07, "loss": 1.6298, "step": 1213 }, { "epoch": 0.1830932810496946, "grad_norm": 0.45612339086244286, "learning_rate": 4.6378793026205384e-07, "loss": 1.5488, "step": 1214 }, { "epoch": 0.18324409923836815, "grad_norm": 0.24891897568860974, "learning_rate": 4.6372990587354805e-07, "loss": 1.5294, "step": 1215 }, { "epoch": 0.1833949174270417, "grad_norm": 0.24876130723764878, "learning_rate": 4.6367183910968195e-07, "loss": 1.6293, "step": 1216 }, { "epoch": 0.18354573561571524, "grad_norm": 0.2574961763836179, "learning_rate": 4.636137299834932e-07, "loss": 1.5755, "step": 1217 }, { "epoch": 0.18369655380438882, "grad_norm": 0.24504201176849688, "learning_rate": 4.6355557850802914e-07, "loss": 1.5819, "step": 1218 }, { "epoch": 0.18384737199306236, "grad_norm": 0.25803038007649165, "learning_rate": 4.6349738469634624e-07, "loss": 1.52, "step": 1219 }, { "epoch": 0.1839981901817359, "grad_norm": 0.37056793420273376, "learning_rate": 4.634391485615109e-07, "loss": 1.5206, "step": 1220 }, { "epoch": 0.18414900837040948, "grad_norm": 0.23740715237675503, "learning_rate": 4.633808701165988e-07, "loss": 1.519, "step": 1221 }, { "epoch": 0.18429982655908303, "grad_norm": 0.256097921253572, "learning_rate": 4.63322549374695e-07, "loss": 1.6017, "step": 1222 }, { "epoch": 0.18445064474775658, "grad_norm": 0.24119311605941288, "learning_rate": 4.632641863488944e-07, "loss": 1.5682, "step": 1223 }, { "epoch": 0.18460146293643012, "grad_norm": 0.2398564783834201, "learning_rate": 4.63205781052301e-07, "loss": 1.5638, "step": 1224 }, { "epoch": 0.1847522811251037, "grad_norm": 0.3588221629085808, "learning_rate": 4.6314733349802867e-07, "loss": 1.5388, "step": 1225 }, { "epoch": 0.18490309931377724, "grad_norm": 0.4161033662354953, "learning_rate": 4.630888436992005e-07, "loss": 1.639, "step": 1226 }, { "epoch": 0.1850539175024508, "grad_norm": 0.24893733198183843, "learning_rate": 4.6303031166894914e-07, "loss": 1.5986, "step": 1227 }, { "epoch": 0.18520473569112436, "grad_norm": 0.25661111198037884, "learning_rate": 4.629717374204168e-07, "loss": 1.5917, "step": 1228 }, { "epoch": 0.1853555538797979, "grad_norm": 0.23584522799787572, "learning_rate": 4.6291312096675507e-07, "loss": 1.5144, "step": 1229 }, { "epoch": 0.18550637206847145, "grad_norm": 0.35466238153603863, "learning_rate": 4.6285446232112494e-07, "loss": 1.6056, "step": 1230 }, { "epoch": 0.185657190257145, "grad_norm": 0.2761629325029236, "learning_rate": 4.6279576149669717e-07, "loss": 1.6149, "step": 1231 }, { "epoch": 0.18580800844581857, "grad_norm": 0.24320364119864546, "learning_rate": 4.6273701850665176e-07, "loss": 1.5697, "step": 1232 }, { "epoch": 0.18595882663449212, "grad_norm": 0.28039155604174815, "learning_rate": 4.6267823336417817e-07, "loss": 1.5901, "step": 1233 }, { "epoch": 0.18610964482316567, "grad_norm": 0.3016051065041896, "learning_rate": 4.626194060824754e-07, "loss": 1.5678, "step": 1234 }, { "epoch": 0.18626046301183924, "grad_norm": 0.3978296304643922, "learning_rate": 4.6256053667475187e-07, "loss": 1.6408, "step": 1235 }, { "epoch": 0.18641128120051279, "grad_norm": 0.2618642345392666, "learning_rate": 4.625016251542255e-07, "loss": 1.5403, "step": 1236 }, { "epoch": 0.18656209938918633, "grad_norm": 0.28061797524066334, "learning_rate": 4.6244267153412363e-07, "loss": 1.6287, "step": 1237 }, { "epoch": 0.18671291757785988, "grad_norm": 0.2539669193134728, "learning_rate": 4.6238367582768304e-07, "loss": 1.571, "step": 1238 }, { "epoch": 0.18686373576653345, "grad_norm": 0.2510040514153634, "learning_rate": 4.6232463804815003e-07, "loss": 1.5547, "step": 1239 }, { "epoch": 0.187014553955207, "grad_norm": 0.25538695923375226, "learning_rate": 4.622655582087803e-07, "loss": 1.5653, "step": 1240 }, { "epoch": 0.18716537214388054, "grad_norm": 0.34510024150041224, "learning_rate": 4.62206436322839e-07, "loss": 1.6462, "step": 1241 }, { "epoch": 0.18731619033255412, "grad_norm": 0.25761472217381176, "learning_rate": 4.6214727240360055e-07, "loss": 1.6034, "step": 1242 }, { "epoch": 0.18746700852122766, "grad_norm": 0.2602442266228183, "learning_rate": 4.6208806646434923e-07, "loss": 1.6195, "step": 1243 }, { "epoch": 0.1876178267099012, "grad_norm": 0.3121342337827262, "learning_rate": 4.620288185183784e-07, "loss": 1.5173, "step": 1244 }, { "epoch": 0.18776864489857475, "grad_norm": 0.3112698054707776, "learning_rate": 4.619695285789909e-07, "loss": 1.5601, "step": 1245 }, { "epoch": 0.18791946308724833, "grad_norm": 0.6375963358679859, "learning_rate": 4.619101966594991e-07, "loss": 1.5594, "step": 1246 }, { "epoch": 0.18807028127592187, "grad_norm": 0.24807667638603587, "learning_rate": 4.6185082277322465e-07, "loss": 1.5797, "step": 1247 }, { "epoch": 0.18822109946459542, "grad_norm": 0.2747542330598371, "learning_rate": 4.617914069334989e-07, "loss": 1.5427, "step": 1248 }, { "epoch": 0.188371917653269, "grad_norm": 0.24848149709894782, "learning_rate": 4.617319491536623e-07, "loss": 1.5991, "step": 1249 }, { "epoch": 0.18852273584194254, "grad_norm": 0.25170883005078193, "learning_rate": 4.616724494470648e-07, "loss": 1.6421, "step": 1250 }, { "epoch": 0.1886735540306161, "grad_norm": 0.2679738449785433, "learning_rate": 4.616129078270659e-07, "loss": 1.5997, "step": 1251 }, { "epoch": 0.18882437221928966, "grad_norm": 0.2577440112764784, "learning_rate": 4.6155332430703456e-07, "loss": 1.5468, "step": 1252 }, { "epoch": 0.1889751904079632, "grad_norm": 0.2444515358662243, "learning_rate": 4.614936989003487e-07, "loss": 1.606, "step": 1253 }, { "epoch": 0.18912600859663675, "grad_norm": 0.3533797144661275, "learning_rate": 4.614340316203963e-07, "loss": 1.5285, "step": 1254 }, { "epoch": 0.1892768267853103, "grad_norm": 0.24621271282013607, "learning_rate": 4.613743224805741e-07, "loss": 1.5976, "step": 1255 }, { "epoch": 0.18942764497398387, "grad_norm": 0.25113303023350053, "learning_rate": 4.613145714942888e-07, "loss": 1.5832, "step": 1256 }, { "epoch": 0.18957846316265742, "grad_norm": 0.25694879956941913, "learning_rate": 4.6125477867495603e-07, "loss": 1.5284, "step": 1257 }, { "epoch": 0.18972928135133096, "grad_norm": 0.2501853930979283, "learning_rate": 4.6119494403600123e-07, "loss": 1.6257, "step": 1258 }, { "epoch": 0.18988009954000454, "grad_norm": 0.2657571741616329, "learning_rate": 4.611350675908588e-07, "loss": 1.55, "step": 1259 }, { "epoch": 0.19003091772867808, "grad_norm": 0.24410102087236463, "learning_rate": 4.6107514935297285e-07, "loss": 1.5801, "step": 1260 }, { "epoch": 0.19018173591735163, "grad_norm": 0.24922626928952427, "learning_rate": 4.6101518933579674e-07, "loss": 1.5757, "step": 1261 }, { "epoch": 0.19033255410602518, "grad_norm": 0.2651313594595526, "learning_rate": 4.609551875527933e-07, "loss": 1.6483, "step": 1262 }, { "epoch": 0.19048337229469875, "grad_norm": 0.25817997842493584, "learning_rate": 4.6089514401743466e-07, "loss": 1.5612, "step": 1263 }, { "epoch": 0.1906341904833723, "grad_norm": 0.3206507643473013, "learning_rate": 4.608350587432023e-07, "loss": 1.5033, "step": 1264 }, { "epoch": 0.19078500867204584, "grad_norm": 0.23999627909004798, "learning_rate": 4.6077493174358715e-07, "loss": 1.5554, "step": 1265 }, { "epoch": 0.19093582686071942, "grad_norm": 0.24365895893821485, "learning_rate": 4.607147630320894e-07, "loss": 1.5166, "step": 1266 }, { "epoch": 0.19108664504939296, "grad_norm": 0.25204093453281395, "learning_rate": 4.6065455262221884e-07, "loss": 1.5883, "step": 1267 }, { "epoch": 0.1912374632380665, "grad_norm": 0.24931428905056033, "learning_rate": 4.605943005274943e-07, "loss": 1.4928, "step": 1268 }, { "epoch": 0.19138828142674005, "grad_norm": 0.27467672421431577, "learning_rate": 4.605340067614442e-07, "loss": 1.5958, "step": 1269 }, { "epoch": 0.19153909961541363, "grad_norm": 0.24682085106543605, "learning_rate": 4.6047367133760623e-07, "loss": 1.5171, "step": 1270 }, { "epoch": 0.19168991780408717, "grad_norm": 0.2553685144848191, "learning_rate": 4.604132942695275e-07, "loss": 1.5606, "step": 1271 }, { "epoch": 0.19184073599276072, "grad_norm": 0.2537348129260827, "learning_rate": 4.603528755707642e-07, "loss": 1.5597, "step": 1272 }, { "epoch": 0.1919915541814343, "grad_norm": 0.25380935336276694, "learning_rate": 4.6029241525488246e-07, "loss": 1.521, "step": 1273 }, { "epoch": 0.19214237237010784, "grad_norm": 0.25771573527963487, "learning_rate": 4.6023191333545707e-07, "loss": 1.5038, "step": 1274 }, { "epoch": 0.19229319055878139, "grad_norm": 0.24367657069761992, "learning_rate": 4.6017136982607265e-07, "loss": 1.5537, "step": 1275 }, { "epoch": 0.19244400874745493, "grad_norm": 0.252004375801695, "learning_rate": 4.601107847403229e-07, "loss": 1.5619, "step": 1276 }, { "epoch": 0.1925948269361285, "grad_norm": 0.24426652256256345, "learning_rate": 4.6005015809181094e-07, "loss": 1.6252, "step": 1277 }, { "epoch": 0.19274564512480205, "grad_norm": 0.24547818197943544, "learning_rate": 4.5998948989414934e-07, "loss": 1.5503, "step": 1278 }, { "epoch": 0.1928964633134756, "grad_norm": 0.24177262746334108, "learning_rate": 4.599287801609596e-07, "loss": 1.5638, "step": 1279 }, { "epoch": 0.19304728150214917, "grad_norm": 0.2613304195476814, "learning_rate": 4.5986802890587304e-07, "loss": 1.5172, "step": 1280 }, { "epoch": 0.19319809969082272, "grad_norm": 0.25227676166352114, "learning_rate": 4.598072361425301e-07, "loss": 1.5577, "step": 1281 }, { "epoch": 0.19334891787949626, "grad_norm": 0.257336503929896, "learning_rate": 4.5974640188458043e-07, "loss": 1.5726, "step": 1282 }, { "epoch": 0.1934997360681698, "grad_norm": 0.25307676147124225, "learning_rate": 4.596855261456831e-07, "loss": 1.572, "step": 1283 }, { "epoch": 0.19365055425684338, "grad_norm": 0.3182845915121795, "learning_rate": 4.596246089395066e-07, "loss": 1.6012, "step": 1284 }, { "epoch": 0.19380137244551693, "grad_norm": 0.27830626585061713, "learning_rate": 4.595636502797284e-07, "loss": 1.6253, "step": 1285 }, { "epoch": 0.19395219063419047, "grad_norm": 0.2625158798232183, "learning_rate": 4.595026501800357e-07, "loss": 1.5688, "step": 1286 }, { "epoch": 0.19410300882286405, "grad_norm": 0.257794216429428, "learning_rate": 4.5944160865412474e-07, "loss": 1.5646, "step": 1287 }, { "epoch": 0.1942538270115376, "grad_norm": 0.24249718273407012, "learning_rate": 4.593805257157011e-07, "loss": 1.568, "step": 1288 }, { "epoch": 0.19440464520021114, "grad_norm": 0.2717055951556258, "learning_rate": 4.5931940137847964e-07, "loss": 1.5762, "step": 1289 }, { "epoch": 0.1945554633888847, "grad_norm": 0.251473646820601, "learning_rate": 4.592582356561846e-07, "loss": 1.6225, "step": 1290 }, { "epoch": 0.19470628157755826, "grad_norm": 0.2530523964470091, "learning_rate": 4.5919702856254947e-07, "loss": 1.4989, "step": 1291 }, { "epoch": 0.1948570997662318, "grad_norm": 0.24090357138386617, "learning_rate": 4.5913578011131703e-07, "loss": 1.5378, "step": 1292 }, { "epoch": 0.19500791795490535, "grad_norm": 0.2529652593382991, "learning_rate": 4.5907449031623936e-07, "loss": 1.6045, "step": 1293 }, { "epoch": 0.19515873614357893, "grad_norm": 0.24554505644207056, "learning_rate": 4.5901315919107777e-07, "loss": 1.5979, "step": 1294 }, { "epoch": 0.19530955433225247, "grad_norm": 0.2462215351501105, "learning_rate": 4.589517867496028e-07, "loss": 1.6437, "step": 1295 }, { "epoch": 0.19546037252092602, "grad_norm": 0.25811189385629774, "learning_rate": 4.588903730055945e-07, "loss": 1.5753, "step": 1296 }, { "epoch": 0.19561119070959956, "grad_norm": 0.26564361953917165, "learning_rate": 4.5882891797284193e-07, "loss": 1.4945, "step": 1297 }, { "epoch": 0.19576200889827314, "grad_norm": 0.26828284814969955, "learning_rate": 4.587674216651437e-07, "loss": 1.5596, "step": 1298 }, { "epoch": 0.19591282708694668, "grad_norm": 0.2518716180925786, "learning_rate": 4.587058840963073e-07, "loss": 1.5467, "step": 1299 }, { "epoch": 0.19606364527562023, "grad_norm": 0.24883212949633934, "learning_rate": 4.5864430528014994e-07, "loss": 1.5684, "step": 1300 }, { "epoch": 0.1962144634642938, "grad_norm": 0.5995044319510145, "learning_rate": 4.585826852304976e-07, "loss": 1.6714, "step": 1301 }, { "epoch": 0.19636528165296735, "grad_norm": 0.2528807116883068, "learning_rate": 4.585210239611859e-07, "loss": 1.49, "step": 1302 }, { "epoch": 0.1965160998416409, "grad_norm": 0.2484612567828872, "learning_rate": 4.584593214860597e-07, "loss": 1.5581, "step": 1303 }, { "epoch": 0.19666691803031447, "grad_norm": 0.2520265580853553, "learning_rate": 4.583975778189728e-07, "loss": 1.5489, "step": 1304 }, { "epoch": 0.19681773621898802, "grad_norm": 0.24330409597101163, "learning_rate": 4.583357929737886e-07, "loss": 1.5461, "step": 1305 }, { "epoch": 0.19696855440766156, "grad_norm": 0.2450801329502804, "learning_rate": 4.582739669643796e-07, "loss": 1.542, "step": 1306 }, { "epoch": 0.1971193725963351, "grad_norm": 0.24670229706809432, "learning_rate": 4.5821209980462737e-07, "loss": 1.5479, "step": 1307 }, { "epoch": 0.19727019078500868, "grad_norm": 0.265775400162428, "learning_rate": 4.581501915084231e-07, "loss": 1.6353, "step": 1308 }, { "epoch": 0.19742100897368223, "grad_norm": 0.2451961552667391, "learning_rate": 4.580882420896669e-07, "loss": 1.624, "step": 1309 }, { "epoch": 0.19757182716235577, "grad_norm": 0.24175200633283164, "learning_rate": 4.580262515622682e-07, "loss": 1.5441, "step": 1310 }, { "epoch": 0.19772264535102935, "grad_norm": 0.266058836376841, "learning_rate": 4.579642199401457e-07, "loss": 1.5569, "step": 1311 }, { "epoch": 0.1978734635397029, "grad_norm": 0.24367772871318977, "learning_rate": 4.5790214723722735e-07, "loss": 1.5444, "step": 1312 }, { "epoch": 0.19802428172837644, "grad_norm": 0.2669581598378095, "learning_rate": 4.5784003346745026e-07, "loss": 1.5663, "step": 1313 }, { "epoch": 0.19817509991704998, "grad_norm": 0.27868587914293996, "learning_rate": 4.5777787864476073e-07, "loss": 1.5606, "step": 1314 }, { "epoch": 0.19832591810572356, "grad_norm": 0.2531823451378717, "learning_rate": 4.577156827831144e-07, "loss": 1.5303, "step": 1315 }, { "epoch": 0.1984767362943971, "grad_norm": 0.26456851904105433, "learning_rate": 4.57653445896476e-07, "loss": 1.5881, "step": 1316 }, { "epoch": 0.19862755448307065, "grad_norm": 0.24625701695281055, "learning_rate": 4.575911679988195e-07, "loss": 1.5938, "step": 1317 }, { "epoch": 0.19877837267174422, "grad_norm": 0.2705797450523775, "learning_rate": 4.575288491041282e-07, "loss": 1.5614, "step": 1318 }, { "epoch": 0.19892919086041777, "grad_norm": 0.3298997867142449, "learning_rate": 4.5746648922639444e-07, "loss": 1.5986, "step": 1319 }, { "epoch": 0.19908000904909132, "grad_norm": 0.260750951121751, "learning_rate": 4.574040883796198e-07, "loss": 1.5308, "step": 1320 }, { "epoch": 0.19923082723776486, "grad_norm": 0.25912363854098036, "learning_rate": 4.5734164657781515e-07, "loss": 1.542, "step": 1321 }, { "epoch": 0.19938164542643844, "grad_norm": 0.25176792843992035, "learning_rate": 4.5727916383500043e-07, "loss": 1.56, "step": 1322 }, { "epoch": 0.19953246361511198, "grad_norm": 0.2843038062909802, "learning_rate": 4.572166401652049e-07, "loss": 1.5762, "step": 1323 }, { "epoch": 0.19968328180378553, "grad_norm": 0.3716912797456824, "learning_rate": 4.571540755824669e-07, "loss": 1.535, "step": 1324 }, { "epoch": 0.1998340999924591, "grad_norm": 0.2509797256223773, "learning_rate": 4.570914701008341e-07, "loss": 1.494, "step": 1325 }, { "epoch": 0.19998491818113265, "grad_norm": 0.24197474440474273, "learning_rate": 4.5702882373436314e-07, "loss": 1.6091, "step": 1326 }, { "epoch": 0.2001357363698062, "grad_norm": 0.27322255667328965, "learning_rate": 4.5696613649711994e-07, "loss": 1.5679, "step": 1327 }, { "epoch": 0.20028655455847974, "grad_norm": 0.2574972467885414, "learning_rate": 4.569034084031797e-07, "loss": 1.5417, "step": 1328 }, { "epoch": 0.20043737274715331, "grad_norm": 0.2476096783459321, "learning_rate": 4.5684063946662663e-07, "loss": 1.5555, "step": 1329 }, { "epoch": 0.20058819093582686, "grad_norm": 0.2543742745683159, "learning_rate": 4.5677782970155425e-07, "loss": 1.561, "step": 1330 }, { "epoch": 0.2007390091245004, "grad_norm": 0.2588741933427589, "learning_rate": 4.567149791220651e-07, "loss": 1.5036, "step": 1331 }, { "epoch": 0.20088982731317398, "grad_norm": 0.24985352822264073, "learning_rate": 4.56652087742271e-07, "loss": 1.551, "step": 1332 }, { "epoch": 0.20104064550184753, "grad_norm": 1.0017865312383014, "learning_rate": 4.56589155576293e-07, "loss": 1.5412, "step": 1333 }, { "epoch": 0.20119146369052107, "grad_norm": 0.24359154642673883, "learning_rate": 4.56526182638261e-07, "loss": 1.5443, "step": 1334 }, { "epoch": 0.20134228187919462, "grad_norm": 0.25702345630740436, "learning_rate": 4.5646316894231447e-07, "loss": 1.6313, "step": 1335 }, { "epoch": 0.2014931000678682, "grad_norm": 0.25280312061914906, "learning_rate": 4.5640011450260174e-07, "loss": 1.5685, "step": 1336 }, { "epoch": 0.20164391825654174, "grad_norm": 0.28617712038991966, "learning_rate": 4.5633701933328026e-07, "loss": 1.5567, "step": 1337 }, { "epoch": 0.20179473644521528, "grad_norm": 0.2513625985066753, "learning_rate": 4.562738834485168e-07, "loss": 1.5928, "step": 1338 }, { "epoch": 0.20194555463388886, "grad_norm": 0.2583391580247105, "learning_rate": 4.5621070686248733e-07, "loss": 1.6059, "step": 1339 }, { "epoch": 0.2020963728225624, "grad_norm": 0.43483098079460936, "learning_rate": 4.561474895893767e-07, "loss": 1.5168, "step": 1340 }, { "epoch": 0.20224719101123595, "grad_norm": 0.2652926044977916, "learning_rate": 4.56084231643379e-07, "loss": 1.5608, "step": 1341 }, { "epoch": 0.2023980091999095, "grad_norm": 0.24995728221583127, "learning_rate": 4.5602093303869757e-07, "loss": 1.6096, "step": 1342 }, { "epoch": 0.20254882738858307, "grad_norm": 0.24987711335700388, "learning_rate": 4.5595759378954473e-07, "loss": 1.5823, "step": 1343 }, { "epoch": 0.20269964557725662, "grad_norm": 0.33269507721172, "learning_rate": 4.558942139101421e-07, "loss": 1.5985, "step": 1344 }, { "epoch": 0.20285046376593016, "grad_norm": 0.24662871345387807, "learning_rate": 4.558307934147202e-07, "loss": 1.597, "step": 1345 }, { "epoch": 0.20300128195460374, "grad_norm": 0.25277950197978016, "learning_rate": 4.557673323175188e-07, "loss": 1.6153, "step": 1346 }, { "epoch": 0.20315210014327728, "grad_norm": 0.25791182532812484, "learning_rate": 4.557038306327867e-07, "loss": 1.5121, "step": 1347 }, { "epoch": 0.20330291833195083, "grad_norm": 0.3057793602717895, "learning_rate": 4.5564028837478196e-07, "loss": 1.5631, "step": 1348 }, { "epoch": 0.2034537365206244, "grad_norm": 0.24552413290619537, "learning_rate": 4.5557670555777174e-07, "loss": 1.5521, "step": 1349 }, { "epoch": 0.20360455470929795, "grad_norm": 0.2430317899241359, "learning_rate": 4.5551308219603205e-07, "loss": 1.5716, "step": 1350 }, { "epoch": 0.2037553728979715, "grad_norm": 0.2505642792908967, "learning_rate": 4.5544941830384837e-07, "loss": 1.5883, "step": 1351 }, { "epoch": 0.20390619108664504, "grad_norm": 0.2659737294912798, "learning_rate": 4.5538571389551496e-07, "loss": 1.5785, "step": 1352 }, { "epoch": 0.2040570092753186, "grad_norm": 0.3284157122003667, "learning_rate": 4.5532196898533545e-07, "loss": 1.564, "step": 1353 }, { "epoch": 0.20420782746399216, "grad_norm": 0.2492202242651036, "learning_rate": 4.5525818358762227e-07, "loss": 1.5739, "step": 1354 }, { "epoch": 0.2043586456526657, "grad_norm": 0.24554436767224772, "learning_rate": 4.551943577166972e-07, "loss": 1.5291, "step": 1355 }, { "epoch": 0.20450946384133928, "grad_norm": 0.27509833380478466, "learning_rate": 4.5513049138689094e-07, "loss": 1.5675, "step": 1356 }, { "epoch": 0.20466028203001282, "grad_norm": 0.2542901920012441, "learning_rate": 4.5506658461254337e-07, "loss": 1.5154, "step": 1357 }, { "epoch": 0.20481110021868637, "grad_norm": 0.2491163040776305, "learning_rate": 4.5500263740800356e-07, "loss": 1.6534, "step": 1358 }, { "epoch": 0.20496191840735992, "grad_norm": 0.25258441943888266, "learning_rate": 4.5493864978762925e-07, "loss": 1.5954, "step": 1359 }, { "epoch": 0.2051127365960335, "grad_norm": 0.3901512693311432, "learning_rate": 4.548746217657878e-07, "loss": 1.5297, "step": 1360 }, { "epoch": 0.20526355478470704, "grad_norm": 0.26586117055017966, "learning_rate": 4.5481055335685517e-07, "loss": 1.5197, "step": 1361 }, { "epoch": 0.20541437297338058, "grad_norm": 0.24156859903027042, "learning_rate": 4.547464445752166e-07, "loss": 1.602, "step": 1362 }, { "epoch": 0.20556519116205416, "grad_norm": 0.288296122042675, "learning_rate": 4.546822954352664e-07, "loss": 1.5139, "step": 1363 }, { "epoch": 0.2057160093507277, "grad_norm": 0.27571681432133305, "learning_rate": 4.546181059514081e-07, "loss": 1.6343, "step": 1364 }, { "epoch": 0.20586682753940125, "grad_norm": 0.26007909076181435, "learning_rate": 4.545538761380539e-07, "loss": 1.5911, "step": 1365 }, { "epoch": 0.2060176457280748, "grad_norm": 0.25431514373043707, "learning_rate": 4.5448960600962525e-07, "loss": 1.5176, "step": 1366 }, { "epoch": 0.20616846391674837, "grad_norm": 0.25302059966266965, "learning_rate": 4.544252955805528e-07, "loss": 1.5248, "step": 1367 }, { "epoch": 0.20631928210542191, "grad_norm": 0.24800911294965144, "learning_rate": 4.5436094486527606e-07, "loss": 1.5458, "step": 1368 }, { "epoch": 0.20647010029409546, "grad_norm": 0.24503612947593298, "learning_rate": 4.542965538782436e-07, "loss": 1.5657, "step": 1369 }, { "epoch": 0.20662091848276903, "grad_norm": 0.2658313769865448, "learning_rate": 4.5423212263391314e-07, "loss": 1.5449, "step": 1370 }, { "epoch": 0.20677173667144258, "grad_norm": 0.26167357035395145, "learning_rate": 4.541676511467513e-07, "loss": 1.542, "step": 1371 }, { "epoch": 0.20692255486011613, "grad_norm": 0.27849759106683974, "learning_rate": 4.541031394312338e-07, "loss": 1.4999, "step": 1372 }, { "epoch": 0.20707337304878967, "grad_norm": 0.2630077162730558, "learning_rate": 4.540385875018455e-07, "loss": 1.4677, "step": 1373 }, { "epoch": 0.20722419123746325, "grad_norm": 0.2518083771440917, "learning_rate": 4.539739953730801e-07, "loss": 1.5819, "step": 1374 }, { "epoch": 0.2073750094261368, "grad_norm": 0.23858991358920195, "learning_rate": 4.5390936305944036e-07, "loss": 1.5444, "step": 1375 }, { "epoch": 0.20752582761481034, "grad_norm": 0.2731108237392616, "learning_rate": 4.5384469057543836e-07, "loss": 1.5505, "step": 1376 }, { "epoch": 0.2076766458034839, "grad_norm": 0.23783912526258008, "learning_rate": 4.537799779355947e-07, "loss": 1.5205, "step": 1377 }, { "epoch": 0.20782746399215746, "grad_norm": 0.26817501745430766, "learning_rate": 4.537152251544393e-07, "loss": 1.555, "step": 1378 }, { "epoch": 0.207978282180831, "grad_norm": 0.31211585887200594, "learning_rate": 4.5365043224651117e-07, "loss": 1.578, "step": 1379 }, { "epoch": 0.20812910036950455, "grad_norm": 0.3369507015026358, "learning_rate": 4.5358559922635806e-07, "loss": 1.5843, "step": 1380 }, { "epoch": 0.20827991855817812, "grad_norm": 0.2468479801615849, "learning_rate": 4.5352072610853704e-07, "loss": 1.5821, "step": 1381 }, { "epoch": 0.20843073674685167, "grad_norm": 0.9366577594549299, "learning_rate": 4.534558129076138e-07, "loss": 1.5913, "step": 1382 }, { "epoch": 0.20858155493552522, "grad_norm": 0.2636202380918008, "learning_rate": 4.533908596381634e-07, "loss": 1.5372, "step": 1383 }, { "epoch": 0.2087323731241988, "grad_norm": 0.2512541914978473, "learning_rate": 4.5332586631476976e-07, "loss": 1.6358, "step": 1384 }, { "epoch": 0.20888319131287233, "grad_norm": 0.2633196962037907, "learning_rate": 4.532608329520256e-07, "loss": 1.5599, "step": 1385 }, { "epoch": 0.20903400950154588, "grad_norm": 0.24785371601498915, "learning_rate": 4.53195759564533e-07, "loss": 1.7057, "step": 1386 }, { "epoch": 0.20918482769021943, "grad_norm": 0.25019900140013046, "learning_rate": 4.531306461669027e-07, "loss": 1.5962, "step": 1387 }, { "epoch": 0.209335645878893, "grad_norm": 0.2633845347548641, "learning_rate": 4.530654927737546e-07, "loss": 1.6358, "step": 1388 }, { "epoch": 0.20948646406756655, "grad_norm": 0.2744402674265749, "learning_rate": 4.5300029939971757e-07, "loss": 1.5347, "step": 1389 }, { "epoch": 0.2096372822562401, "grad_norm": 0.24882511692788956, "learning_rate": 4.5293506605942934e-07, "loss": 1.5777, "step": 1390 }, { "epoch": 0.20978810044491367, "grad_norm": 0.25709926694621, "learning_rate": 4.528697927675367e-07, "loss": 1.5273, "step": 1391 }, { "epoch": 0.2099389186335872, "grad_norm": 0.26502976739542417, "learning_rate": 4.528044795386955e-07, "loss": 1.5539, "step": 1392 }, { "epoch": 0.21008973682226076, "grad_norm": 0.2520151211005096, "learning_rate": 4.5273912638757043e-07, "loss": 1.5474, "step": 1393 }, { "epoch": 0.21024055501093433, "grad_norm": 0.2386463581533624, "learning_rate": 4.5267373332883506e-07, "loss": 1.576, "step": 1394 }, { "epoch": 0.21039137319960788, "grad_norm": 0.24899071043384616, "learning_rate": 4.526083003771722e-07, "loss": 1.5386, "step": 1395 }, { "epoch": 0.21054219138828142, "grad_norm": 0.2770613415174753, "learning_rate": 4.5254282754727337e-07, "loss": 1.6591, "step": 1396 }, { "epoch": 0.21069300957695497, "grad_norm": 0.25022938621669893, "learning_rate": 4.524773148538391e-07, "loss": 1.5791, "step": 1397 }, { "epoch": 0.21084382776562854, "grad_norm": 0.27685533621155517, "learning_rate": 4.52411762311579e-07, "loss": 1.5734, "step": 1398 }, { "epoch": 0.2109946459543021, "grad_norm": 0.25338547389921595, "learning_rate": 4.523461699352115e-07, "loss": 1.5792, "step": 1399 }, { "epoch": 0.21114546414297564, "grad_norm": 0.24111278814349393, "learning_rate": 4.522805377394638e-07, "loss": 1.513, "step": 1400 }, { "epoch": 0.2112962823316492, "grad_norm": 0.24924887701465612, "learning_rate": 4.522148657390725e-07, "loss": 1.6394, "step": 1401 }, { "epoch": 0.21144710052032276, "grad_norm": 0.2564505137840794, "learning_rate": 4.521491539487827e-07, "loss": 1.5265, "step": 1402 }, { "epoch": 0.2115979187089963, "grad_norm": 0.2926745104258473, "learning_rate": 4.5208340238334876e-07, "loss": 1.5805, "step": 1403 }, { "epoch": 0.21174873689766985, "grad_norm": 0.2494023731334625, "learning_rate": 4.520176110575337e-07, "loss": 1.5697, "step": 1404 }, { "epoch": 0.21189955508634342, "grad_norm": 0.2569437009834236, "learning_rate": 4.519517799861097e-07, "loss": 1.5759, "step": 1405 }, { "epoch": 0.21205037327501697, "grad_norm": 0.2476924207888695, "learning_rate": 4.518859091838576e-07, "loss": 1.6233, "step": 1406 }, { "epoch": 0.21220119146369051, "grad_norm": 0.2532654188168525, "learning_rate": 4.5181999866556744e-07, "loss": 1.6218, "step": 1407 }, { "epoch": 0.2123520096523641, "grad_norm": 0.25466059140149927, "learning_rate": 4.5175404844603805e-07, "loss": 1.5523, "step": 1408 }, { "epoch": 0.21250282784103763, "grad_norm": 0.5170922925364997, "learning_rate": 4.5168805854007714e-07, "loss": 1.5398, "step": 1409 }, { "epoch": 0.21265364602971118, "grad_norm": 0.25832194865975405, "learning_rate": 4.516220289625013e-07, "loss": 1.5365, "step": 1410 }, { "epoch": 0.21280446421838473, "grad_norm": 0.2668076476122713, "learning_rate": 4.5155595972813616e-07, "loss": 1.574, "step": 1411 }, { "epoch": 0.2129552824070583, "grad_norm": 0.30558559464052965, "learning_rate": 4.514898508518162e-07, "loss": 1.5048, "step": 1412 }, { "epoch": 0.21310610059573185, "grad_norm": 0.24986675140765469, "learning_rate": 4.514237023483848e-07, "loss": 1.594, "step": 1413 }, { "epoch": 0.2132569187844054, "grad_norm": 0.2967956906681153, "learning_rate": 4.513575142326942e-07, "loss": 1.6564, "step": 1414 }, { "epoch": 0.21340773697307897, "grad_norm": 0.25236193419130576, "learning_rate": 4.512912865196055e-07, "loss": 1.53, "step": 1415 }, { "epoch": 0.2135585551617525, "grad_norm": 0.23697946220883495, "learning_rate": 4.512250192239888e-07, "loss": 1.5201, "step": 1416 }, { "epoch": 0.21370937335042606, "grad_norm": 0.25188233784849035, "learning_rate": 4.511587123607231e-07, "loss": 1.5729, "step": 1417 }, { "epoch": 0.2138601915390996, "grad_norm": 0.24316938283711606, "learning_rate": 4.5109236594469616e-07, "loss": 1.581, "step": 1418 }, { "epoch": 0.21401100972777318, "grad_norm": 0.258347771069871, "learning_rate": 4.510259799908047e-07, "loss": 1.5459, "step": 1419 }, { "epoch": 0.21416182791644672, "grad_norm": 0.2553160362376422, "learning_rate": 4.509595545139542e-07, "loss": 1.593, "step": 1420 }, { "epoch": 0.21431264610512027, "grad_norm": 0.25322419596260776, "learning_rate": 4.508930895290593e-07, "loss": 1.5785, "step": 1421 }, { "epoch": 0.21446346429379384, "grad_norm": 0.24474866831756664, "learning_rate": 4.5082658505104323e-07, "loss": 1.6553, "step": 1422 }, { "epoch": 0.2146142824824674, "grad_norm": 0.23931984187269134, "learning_rate": 4.5076004109483813e-07, "loss": 1.6013, "step": 1423 }, { "epoch": 0.21476510067114093, "grad_norm": 0.27588477665124317, "learning_rate": 4.5069345767538507e-07, "loss": 1.5221, "step": 1424 }, { "epoch": 0.21491591885981448, "grad_norm": 0.2628541784750753, "learning_rate": 4.5062683480763407e-07, "loss": 1.5947, "step": 1425 }, { "epoch": 0.21506673704848805, "grad_norm": 0.24678121117596793, "learning_rate": 4.505601725065439e-07, "loss": 1.5461, "step": 1426 }, { "epoch": 0.2152175552371616, "grad_norm": 0.26435666630384597, "learning_rate": 4.50493470787082e-07, "loss": 1.57, "step": 1427 }, { "epoch": 0.21536837342583515, "grad_norm": 0.3012053269260454, "learning_rate": 4.50426729664225e-07, "loss": 1.5197, "step": 1428 }, { "epoch": 0.21551919161450872, "grad_norm": 0.25146246237406045, "learning_rate": 4.503599491529582e-07, "loss": 1.5976, "step": 1429 }, { "epoch": 0.21567000980318227, "grad_norm": 0.25224063497089616, "learning_rate": 4.502931292682758e-07, "loss": 1.5875, "step": 1430 }, { "epoch": 0.2158208279918558, "grad_norm": 0.25331435975433114, "learning_rate": 4.5022627002518075e-07, "loss": 1.6048, "step": 1431 }, { "epoch": 0.21597164618052936, "grad_norm": 0.25813299552290325, "learning_rate": 4.50159371438685e-07, "loss": 1.5472, "step": 1432 }, { "epoch": 0.21612246436920293, "grad_norm": 0.2404868997906578, "learning_rate": 4.500924335238091e-07, "loss": 1.6059, "step": 1433 }, { "epoch": 0.21627328255787648, "grad_norm": 0.2613901399699812, "learning_rate": 4.500254562955826e-07, "loss": 1.6188, "step": 1434 }, { "epoch": 0.21642410074655002, "grad_norm": 0.24683355561448922, "learning_rate": 4.4995843976904386e-07, "loss": 1.5956, "step": 1435 }, { "epoch": 0.2165749189352236, "grad_norm": 0.25137075226570926, "learning_rate": 4.498913839592401e-07, "loss": 1.566, "step": 1436 }, { "epoch": 0.21672573712389714, "grad_norm": 0.282624118284134, "learning_rate": 4.498242888812272e-07, "loss": 1.6292, "step": 1437 }, { "epoch": 0.2168765553125707, "grad_norm": 0.2990794129963659, "learning_rate": 4.4975715455007e-07, "loss": 1.5933, "step": 1438 }, { "epoch": 0.21702737350124424, "grad_norm": 0.2980897205504419, "learning_rate": 4.496899809808421e-07, "loss": 1.6262, "step": 1439 }, { "epoch": 0.2171781916899178, "grad_norm": 0.23491385301471046, "learning_rate": 4.4962276818862593e-07, "loss": 1.5114, "step": 1440 }, { "epoch": 0.21732900987859136, "grad_norm": 0.24169981231441703, "learning_rate": 4.495555161885128e-07, "loss": 1.5901, "step": 1441 }, { "epoch": 0.2174798280672649, "grad_norm": 0.24587673073324526, "learning_rate": 4.494882249956026e-07, "loss": 1.5081, "step": 1442 }, { "epoch": 0.21763064625593848, "grad_norm": 0.6741389015324207, "learning_rate": 4.4942089462500414e-07, "loss": 1.5852, "step": 1443 }, { "epoch": 0.21778146444461202, "grad_norm": 0.30447488077669177, "learning_rate": 4.4935352509183527e-07, "loss": 1.5588, "step": 1444 }, { "epoch": 0.21793228263328557, "grad_norm": 0.2687597512504896, "learning_rate": 4.4928611641122223e-07, "loss": 1.591, "step": 1445 }, { "epoch": 0.21808310082195914, "grad_norm": 0.24939306790272178, "learning_rate": 4.492186685983004e-07, "loss": 1.5418, "step": 1446 }, { "epoch": 0.2182339190106327, "grad_norm": 0.24246473944579597, "learning_rate": 4.4915118166821346e-07, "loss": 1.5475, "step": 1447 }, { "epoch": 0.21838473719930623, "grad_norm": 0.25160094471473154, "learning_rate": 4.4908365563611456e-07, "loss": 1.5889, "step": 1448 }, { "epoch": 0.21853555538797978, "grad_norm": 0.24502512931851553, "learning_rate": 4.49016090517165e-07, "loss": 1.6334, "step": 1449 }, { "epoch": 0.21868637357665335, "grad_norm": 0.24632534584312824, "learning_rate": 4.489484863265352e-07, "loss": 1.5815, "step": 1450 }, { "epoch": 0.2188371917653269, "grad_norm": 0.7228259896607148, "learning_rate": 4.4888084307940433e-07, "loss": 1.5779, "step": 1451 }, { "epoch": 0.21898800995400045, "grad_norm": 0.28835413330730547, "learning_rate": 4.488131607909602e-07, "loss": 1.5069, "step": 1452 }, { "epoch": 0.21913882814267402, "grad_norm": 0.32849414491868345, "learning_rate": 4.4874543947639946e-07, "loss": 1.5055, "step": 1453 }, { "epoch": 0.21928964633134757, "grad_norm": 0.25614128263544494, "learning_rate": 4.486776791509276e-07, "loss": 1.6218, "step": 1454 }, { "epoch": 0.2194404645200211, "grad_norm": 0.2788075708337781, "learning_rate": 4.486098798297587e-07, "loss": 1.5488, "step": 1455 }, { "epoch": 0.21959128270869466, "grad_norm": 0.25557639661216225, "learning_rate": 4.4854204152811563e-07, "loss": 1.6018, "step": 1456 }, { "epoch": 0.21974210089736823, "grad_norm": 0.2688433402158743, "learning_rate": 4.484741642612302e-07, "loss": 1.5651, "step": 1457 }, { "epoch": 0.21989291908604178, "grad_norm": 0.2744743880824375, "learning_rate": 4.484062480443427e-07, "loss": 1.553, "step": 1458 }, { "epoch": 0.22004373727471532, "grad_norm": 0.2620242735199598, "learning_rate": 4.483382928927025e-07, "loss": 1.5042, "step": 1459 }, { "epoch": 0.2201945554633889, "grad_norm": 0.24288976128343828, "learning_rate": 4.482702988215672e-07, "loss": 1.5489, "step": 1460 }, { "epoch": 0.22034537365206244, "grad_norm": 0.2594367330284345, "learning_rate": 4.482022658462037e-07, "loss": 1.4735, "step": 1461 }, { "epoch": 0.220496191840736, "grad_norm": 0.3167292658690441, "learning_rate": 4.4813419398188736e-07, "loss": 1.5435, "step": 1462 }, { "epoch": 0.22064701002940953, "grad_norm": 0.2519629922479481, "learning_rate": 4.480660832439022e-07, "loss": 1.5606, "step": 1463 }, { "epoch": 0.2207978282180831, "grad_norm": 0.31160173778972206, "learning_rate": 4.4799793364754113e-07, "loss": 1.5988, "step": 1464 }, { "epoch": 0.22094864640675665, "grad_norm": 0.3017651985325454, "learning_rate": 4.479297452081056e-07, "loss": 1.579, "step": 1465 }, { "epoch": 0.2210994645954302, "grad_norm": 0.2439919574688214, "learning_rate": 4.478615179409061e-07, "loss": 1.5537, "step": 1466 }, { "epoch": 0.22125028278410377, "grad_norm": 0.32076252021976376, "learning_rate": 4.4779325186126157e-07, "loss": 1.4989, "step": 1467 }, { "epoch": 0.22140110097277732, "grad_norm": 0.26932891039293677, "learning_rate": 4.4772494698449965e-07, "loss": 1.5191, "step": 1468 }, { "epoch": 0.22155191916145087, "grad_norm": 0.2633568107974307, "learning_rate": 4.476566033259568e-07, "loss": 1.5494, "step": 1469 }, { "epoch": 0.2217027373501244, "grad_norm": 0.28810828200999133, "learning_rate": 4.475882209009782e-07, "loss": 1.5609, "step": 1470 }, { "epoch": 0.221853555538798, "grad_norm": 0.3317982309128049, "learning_rate": 4.475197997249177e-07, "loss": 1.5672, "step": 1471 }, { "epoch": 0.22200437372747153, "grad_norm": 0.2538882922500279, "learning_rate": 4.4745133981313785e-07, "loss": 1.524, "step": 1472 }, { "epoch": 0.22215519191614508, "grad_norm": 0.27207734764137315, "learning_rate": 4.4738284118100984e-07, "loss": 1.5714, "step": 1473 }, { "epoch": 0.22230601010481865, "grad_norm": 0.24683524078912186, "learning_rate": 4.473143038439137e-07, "loss": 1.5806, "step": 1474 }, { "epoch": 0.2224568282934922, "grad_norm": 0.2855048578394907, "learning_rate": 4.4724572781723787e-07, "loss": 1.5427, "step": 1475 }, { "epoch": 0.22260764648216574, "grad_norm": 0.2897639699473296, "learning_rate": 4.471771131163799e-07, "loss": 1.5443, "step": 1476 }, { "epoch": 0.2227584646708393, "grad_norm": 0.2550600619785885, "learning_rate": 4.4710845975674565e-07, "loss": 1.5153, "step": 1477 }, { "epoch": 0.22290928285951286, "grad_norm": 0.25232906029764135, "learning_rate": 4.4703976775374986e-07, "loss": 1.4903, "step": 1478 }, { "epoch": 0.2230601010481864, "grad_norm": 0.25328137332083733, "learning_rate": 4.469710371228158e-07, "loss": 1.5976, "step": 1479 }, { "epoch": 0.22321091923685996, "grad_norm": 0.27936930653293984, "learning_rate": 4.469022678793756e-07, "loss": 1.6363, "step": 1480 }, { "epoch": 0.22336173742553353, "grad_norm": 0.2548072627596578, "learning_rate": 4.4683346003887e-07, "loss": 1.584, "step": 1481 }, { "epoch": 0.22351255561420708, "grad_norm": 0.251863851633017, "learning_rate": 4.4676461361674816e-07, "loss": 1.5825, "step": 1482 }, { "epoch": 0.22366337380288062, "grad_norm": 0.2607448940880632, "learning_rate": 4.466957286284683e-07, "loss": 1.5988, "step": 1483 }, { "epoch": 0.22381419199155417, "grad_norm": 0.2569225382965004, "learning_rate": 4.466268050894969e-07, "loss": 1.5543, "step": 1484 }, { "epoch": 0.22396501018022774, "grad_norm": 0.24952522775379582, "learning_rate": 4.4655784301530955e-07, "loss": 1.5792, "step": 1485 }, { "epoch": 0.2241158283689013, "grad_norm": 0.2651653229474127, "learning_rate": 4.4648884242139005e-07, "loss": 1.5456, "step": 1486 }, { "epoch": 0.22426664655757483, "grad_norm": 0.24259453226169617, "learning_rate": 4.464198033232312e-07, "loss": 1.5347, "step": 1487 }, { "epoch": 0.2244174647462484, "grad_norm": 0.26490694394792824, "learning_rate": 4.4635072573633416e-07, "loss": 1.5912, "step": 1488 }, { "epoch": 0.22456828293492195, "grad_norm": 0.2866525843062583, "learning_rate": 4.4628160967620886e-07, "loss": 1.614, "step": 1489 }, { "epoch": 0.2247191011235955, "grad_norm": 0.2682453651065939, "learning_rate": 4.46212455158374e-07, "loss": 1.5625, "step": 1490 }, { "epoch": 0.22486991931226907, "grad_norm": 0.2419121075971261, "learning_rate": 4.4614326219835666e-07, "loss": 1.5419, "step": 1491 }, { "epoch": 0.22502073750094262, "grad_norm": 0.267993400660638, "learning_rate": 4.460740308116928e-07, "loss": 1.6074, "step": 1492 }, { "epoch": 0.22517155568961617, "grad_norm": 0.32575575615746094, "learning_rate": 4.4600476101392677e-07, "loss": 1.5516, "step": 1493 }, { "epoch": 0.2253223738782897, "grad_norm": 0.2483260022994849, "learning_rate": 4.4593545282061156e-07, "loss": 1.6733, "step": 1494 }, { "epoch": 0.22547319206696328, "grad_norm": 0.27530216464083385, "learning_rate": 4.458661062473091e-07, "loss": 1.606, "step": 1495 }, { "epoch": 0.22562401025563683, "grad_norm": 0.2443319973678457, "learning_rate": 4.457967213095896e-07, "loss": 1.5532, "step": 1496 }, { "epoch": 0.22577482844431038, "grad_norm": 0.2427356152328098, "learning_rate": 4.45727298023032e-07, "loss": 1.5055, "step": 1497 }, { "epoch": 0.22592564663298395, "grad_norm": 0.25507158062659324, "learning_rate": 4.4565783640322386e-07, "loss": 1.557, "step": 1498 }, { "epoch": 0.2260764648216575, "grad_norm": 0.2620037370610053, "learning_rate": 4.455883364657614e-07, "loss": 1.6873, "step": 1499 }, { "epoch": 0.22622728301033104, "grad_norm": 0.2470852857188749, "learning_rate": 4.455187982262493e-07, "loss": 1.5153, "step": 1500 }, { "epoch": 0.2263781011990046, "grad_norm": 0.2479219594273459, "learning_rate": 4.4544922170030084e-07, "loss": 1.5514, "step": 1501 }, { "epoch": 0.22652891938767816, "grad_norm": 0.2582843145931511, "learning_rate": 4.4537960690353823e-07, "loss": 1.4804, "step": 1502 }, { "epoch": 0.2266797375763517, "grad_norm": 0.2683250070621692, "learning_rate": 4.4530995385159173e-07, "loss": 1.6919, "step": 1503 }, { "epoch": 0.22683055576502525, "grad_norm": 0.23998718465092833, "learning_rate": 4.4524026256010073e-07, "loss": 1.6152, "step": 1504 }, { "epoch": 0.22698137395369883, "grad_norm": 0.3137984877545782, "learning_rate": 4.4517053304471273e-07, "loss": 1.6059, "step": 1505 }, { "epoch": 0.22713219214237237, "grad_norm": 0.2569777072612162, "learning_rate": 4.4510076532108425e-07, "loss": 1.5924, "step": 1506 }, { "epoch": 0.22728301033104592, "grad_norm": 0.24520541445209215, "learning_rate": 4.4503095940488e-07, "loss": 1.546, "step": 1507 }, { "epoch": 0.22743382851971947, "grad_norm": 0.27660005451994707, "learning_rate": 4.4496111531177356e-07, "loss": 1.6151, "step": 1508 }, { "epoch": 0.22758464670839304, "grad_norm": 0.4193956155170342, "learning_rate": 4.448912330574469e-07, "loss": 1.5805, "step": 1509 }, { "epoch": 0.2277354648970666, "grad_norm": 0.24382335659336823, "learning_rate": 4.4482131265759067e-07, "loss": 1.6322, "step": 1510 }, { "epoch": 0.22788628308574013, "grad_norm": 0.2621205660842912, "learning_rate": 4.44751354127904e-07, "loss": 1.581, "step": 1511 }, { "epoch": 0.2280371012744137, "grad_norm": 0.2654196652049394, "learning_rate": 4.446813574840946e-07, "loss": 1.5895, "step": 1512 }, { "epoch": 0.22818791946308725, "grad_norm": 0.24947378314897425, "learning_rate": 4.446113227418787e-07, "loss": 1.5185, "step": 1513 }, { "epoch": 0.2283387376517608, "grad_norm": 0.25239730996688686, "learning_rate": 4.4454124991698136e-07, "loss": 1.5731, "step": 1514 }, { "epoch": 0.22848955584043434, "grad_norm": 0.27285743016093594, "learning_rate": 4.4447113902513577e-07, "loss": 1.597, "step": 1515 }, { "epoch": 0.22864037402910792, "grad_norm": 0.2492947708196186, "learning_rate": 4.44400990082084e-07, "loss": 1.6109, "step": 1516 }, { "epoch": 0.22879119221778146, "grad_norm": 0.2475032437299696, "learning_rate": 4.4433080310357636e-07, "loss": 1.5192, "step": 1517 }, { "epoch": 0.228942010406455, "grad_norm": 0.24904705840941427, "learning_rate": 4.4426057810537187e-07, "loss": 1.5728, "step": 1518 }, { "epoch": 0.22909282859512858, "grad_norm": 0.48411826079897535, "learning_rate": 4.441903151032383e-07, "loss": 1.6649, "step": 1519 }, { "epoch": 0.22924364678380213, "grad_norm": 0.2454018555263281, "learning_rate": 4.441200141129516e-07, "loss": 1.5949, "step": 1520 }, { "epoch": 0.22939446497247568, "grad_norm": 0.2366798276780095, "learning_rate": 4.440496751502965e-07, "loss": 1.5925, "step": 1521 }, { "epoch": 0.22954528316114922, "grad_norm": 0.2410733842493999, "learning_rate": 4.4397929823106596e-07, "loss": 1.5542, "step": 1522 }, { "epoch": 0.2296961013498228, "grad_norm": 0.5869512944596569, "learning_rate": 4.4390888337106177e-07, "loss": 1.6107, "step": 1523 }, { "epoch": 0.22984691953849634, "grad_norm": 0.2569931933365196, "learning_rate": 4.4383843058609413e-07, "loss": 1.5418, "step": 1524 }, { "epoch": 0.2299977377271699, "grad_norm": 0.29199340682139846, "learning_rate": 4.4376793989198165e-07, "loss": 1.5371, "step": 1525 }, { "epoch": 0.23014855591584346, "grad_norm": 0.3161682392737767, "learning_rate": 4.4369741130455156e-07, "loss": 1.587, "step": 1526 }, { "epoch": 0.230299374104517, "grad_norm": 0.25244665931894156, "learning_rate": 4.4362684483963967e-07, "loss": 1.5758, "step": 1527 }, { "epoch": 0.23045019229319055, "grad_norm": 0.3287435741615946, "learning_rate": 4.435562405130902e-07, "loss": 1.5638, "step": 1528 }, { "epoch": 0.2306010104818641, "grad_norm": 0.26030938077603366, "learning_rate": 4.4348559834075577e-07, "loss": 1.5588, "step": 1529 }, { "epoch": 0.23075182867053767, "grad_norm": 0.2529517007869986, "learning_rate": 4.434149183384977e-07, "loss": 1.4988, "step": 1530 }, { "epoch": 0.23090264685921122, "grad_norm": 0.23807690258336311, "learning_rate": 4.433442005221857e-07, "loss": 1.5728, "step": 1531 }, { "epoch": 0.23105346504788477, "grad_norm": 0.2980673816915895, "learning_rate": 4.432734449076979e-07, "loss": 1.5573, "step": 1532 }, { "epoch": 0.23120428323655834, "grad_norm": 0.24343236440687027, "learning_rate": 4.4320265151092115e-07, "loss": 1.5945, "step": 1533 }, { "epoch": 0.23135510142523188, "grad_norm": 0.2438373996327165, "learning_rate": 4.4313182034775044e-07, "loss": 1.5857, "step": 1534 }, { "epoch": 0.23150591961390543, "grad_norm": 0.2847928399924164, "learning_rate": 4.4306095143408957e-07, "loss": 1.5607, "step": 1535 }, { "epoch": 0.231656737802579, "grad_norm": 0.24168986587532496, "learning_rate": 4.429900447858506e-07, "loss": 1.5662, "step": 1536 }, { "epoch": 0.23180755599125255, "grad_norm": 0.24619379089962148, "learning_rate": 4.429191004189542e-07, "loss": 1.5442, "step": 1537 }, { "epoch": 0.2319583741799261, "grad_norm": 0.24484056814337715, "learning_rate": 4.4284811834932933e-07, "loss": 1.6062, "step": 1538 }, { "epoch": 0.23210919236859964, "grad_norm": 0.2489517175275712, "learning_rate": 4.4277709859291375e-07, "loss": 1.5339, "step": 1539 }, { "epoch": 0.23226001055727322, "grad_norm": 0.252822674318724, "learning_rate": 4.4270604116565324e-07, "loss": 1.6146, "step": 1540 }, { "epoch": 0.23241082874594676, "grad_norm": 0.25335946947980326, "learning_rate": 4.4263494608350234e-07, "loss": 1.5978, "step": 1541 }, { "epoch": 0.2325616469346203, "grad_norm": 0.3251146437009607, "learning_rate": 4.4256381336242393e-07, "loss": 1.5488, "step": 1542 }, { "epoch": 0.23271246512329388, "grad_norm": 0.2703768118135377, "learning_rate": 4.424926430183895e-07, "loss": 1.5846, "step": 1543 }, { "epoch": 0.23286328331196743, "grad_norm": 0.27268227347666524, "learning_rate": 4.424214350673788e-07, "loss": 1.577, "step": 1544 }, { "epoch": 0.23301410150064097, "grad_norm": 0.24783995137465384, "learning_rate": 4.4235018952538006e-07, "loss": 1.559, "step": 1545 }, { "epoch": 0.23316491968931452, "grad_norm": 0.26967822095437516, "learning_rate": 4.4227890640839e-07, "loss": 1.5483, "step": 1546 }, { "epoch": 0.2333157378779881, "grad_norm": 0.25574816910228887, "learning_rate": 4.422075857324137e-07, "loss": 1.6082, "step": 1547 }, { "epoch": 0.23346655606666164, "grad_norm": 0.25014337543770127, "learning_rate": 4.4213622751346474e-07, "loss": 1.5987, "step": 1548 }, { "epoch": 0.2336173742553352, "grad_norm": 0.25473300230673646, "learning_rate": 4.420648317675653e-07, "loss": 1.5779, "step": 1549 }, { "epoch": 0.23376819244400876, "grad_norm": 0.24752771791929504, "learning_rate": 4.4199339851074556e-07, "loss": 1.5996, "step": 1550 }, { "epoch": 0.2339190106326823, "grad_norm": 0.24099919228942546, "learning_rate": 4.419219277590445e-07, "loss": 1.5386, "step": 1551 }, { "epoch": 0.23406982882135585, "grad_norm": 0.4893165747025458, "learning_rate": 4.4185041952850937e-07, "loss": 1.5356, "step": 1552 }, { "epoch": 0.2342206470100294, "grad_norm": 0.2659578973957303, "learning_rate": 4.4177887383519584e-07, "loss": 1.6222, "step": 1553 }, { "epoch": 0.23437146519870297, "grad_norm": 0.24979724219614716, "learning_rate": 4.41707290695168e-07, "loss": 1.5928, "step": 1554 }, { "epoch": 0.23452228338737652, "grad_norm": 0.27578137794704516, "learning_rate": 4.416356701244983e-07, "loss": 1.662, "step": 1555 }, { "epoch": 0.23467310157605006, "grad_norm": 0.2680631467952242, "learning_rate": 4.4156401213926777e-07, "loss": 1.6505, "step": 1556 }, { "epoch": 0.23482391976472364, "grad_norm": 0.2700553313518092, "learning_rate": 4.414923167555655e-07, "loss": 1.6536, "step": 1557 }, { "epoch": 0.23497473795339718, "grad_norm": 0.25150863617644587, "learning_rate": 4.4142058398948937e-07, "loss": 1.6113, "step": 1558 }, { "epoch": 0.23512555614207073, "grad_norm": 0.23728026868984647, "learning_rate": 4.413488138571455e-07, "loss": 1.5653, "step": 1559 }, { "epoch": 0.23527637433074428, "grad_norm": 0.8685979934961994, "learning_rate": 4.4127700637464827e-07, "loss": 1.6315, "step": 1560 }, { "epoch": 0.23542719251941785, "grad_norm": 0.2725852994301352, "learning_rate": 4.412051615581206e-07, "loss": 1.5659, "step": 1561 }, { "epoch": 0.2355780107080914, "grad_norm": 0.2548194551778707, "learning_rate": 4.4113327942369373e-07, "loss": 1.5474, "step": 1562 }, { "epoch": 0.23572882889676494, "grad_norm": 1.5574091156237442, "learning_rate": 4.410613599875073e-07, "loss": 1.5811, "step": 1563 }, { "epoch": 0.23587964708543852, "grad_norm": 0.2492067176832404, "learning_rate": 4.4098940326570935e-07, "loss": 1.5303, "step": 1564 }, { "epoch": 0.23603046527411206, "grad_norm": 0.24230255657856714, "learning_rate": 4.4091740927445607e-07, "loss": 1.576, "step": 1565 }, { "epoch": 0.2361812834627856, "grad_norm": 0.24894191425084267, "learning_rate": 4.408453780299125e-07, "loss": 1.5438, "step": 1566 }, { "epoch": 0.23633210165145915, "grad_norm": 0.2392475719036145, "learning_rate": 4.4077330954825153e-07, "loss": 1.4873, "step": 1567 }, { "epoch": 0.23648291984013273, "grad_norm": 0.2584213398407718, "learning_rate": 4.407012038456547e-07, "loss": 1.6108, "step": 1568 }, { "epoch": 0.23663373802880627, "grad_norm": 0.24981877332816438, "learning_rate": 4.40629060938312e-07, "loss": 1.4808, "step": 1569 }, { "epoch": 0.23678455621747982, "grad_norm": 0.2670772171175633, "learning_rate": 4.405568808424213e-07, "loss": 1.5513, "step": 1570 }, { "epoch": 0.2369353744061534, "grad_norm": 0.2561177002350499, "learning_rate": 4.404846635741894e-07, "loss": 1.5232, "step": 1571 }, { "epoch": 0.23708619259482694, "grad_norm": 0.24947164268052072, "learning_rate": 4.4041240914983114e-07, "loss": 1.5567, "step": 1572 }, { "epoch": 0.23723701078350048, "grad_norm": 0.46657843250411435, "learning_rate": 4.403401175855695e-07, "loss": 1.5458, "step": 1573 }, { "epoch": 0.23738782897217403, "grad_norm": 0.24195526178643315, "learning_rate": 4.4026778889763637e-07, "loss": 1.5594, "step": 1574 }, { "epoch": 0.2375386471608476, "grad_norm": 0.27600359901541593, "learning_rate": 4.401954231022714e-07, "loss": 1.5401, "step": 1575 }, { "epoch": 0.23768946534952115, "grad_norm": 0.24704062385590803, "learning_rate": 4.40123020215723e-07, "loss": 1.5771, "step": 1576 }, { "epoch": 0.2378402835381947, "grad_norm": 0.259061251052292, "learning_rate": 4.4005058025424765e-07, "loss": 1.5045, "step": 1577 }, { "epoch": 0.23799110172686827, "grad_norm": 0.25213420085896426, "learning_rate": 4.399781032341102e-07, "loss": 1.6264, "step": 1578 }, { "epoch": 0.23814191991554182, "grad_norm": 0.24235696486526614, "learning_rate": 4.399055891715839e-07, "loss": 1.554, "step": 1579 }, { "epoch": 0.23829273810421536, "grad_norm": 0.24894393406909265, "learning_rate": 4.398330380829502e-07, "loss": 1.5122, "step": 1580 }, { "epoch": 0.23844355629288894, "grad_norm": 0.2522058131129758, "learning_rate": 4.3976044998449904e-07, "loss": 1.5451, "step": 1581 }, { "epoch": 0.23859437448156248, "grad_norm": 0.2481532556382654, "learning_rate": 4.3968782489252845e-07, "loss": 1.5803, "step": 1582 }, { "epoch": 0.23874519267023603, "grad_norm": 0.25947523180652005, "learning_rate": 4.3961516282334497e-07, "loss": 1.6562, "step": 1583 }, { "epoch": 0.23889601085890957, "grad_norm": 0.47998219527827213, "learning_rate": 4.3954246379326326e-07, "loss": 1.5688, "step": 1584 }, { "epoch": 0.23904682904758315, "grad_norm": 0.24000681895761397, "learning_rate": 4.394697278186065e-07, "loss": 1.6005, "step": 1585 }, { "epoch": 0.2391976472362567, "grad_norm": 0.24719599930872269, "learning_rate": 4.393969549157059e-07, "loss": 1.5753, "step": 1586 }, { "epoch": 0.23934846542493024, "grad_norm": 0.2651440718103725, "learning_rate": 4.393241451009012e-07, "loss": 1.5798, "step": 1587 }, { "epoch": 0.23949928361360381, "grad_norm": 0.2573708129158974, "learning_rate": 4.392512983905403e-07, "loss": 1.6199, "step": 1588 }, { "epoch": 0.23965010180227736, "grad_norm": 0.24350260889665962, "learning_rate": 4.3917841480097935e-07, "loss": 1.5163, "step": 1589 }, { "epoch": 0.2398009199909509, "grad_norm": 0.24904924118751787, "learning_rate": 4.391054943485828e-07, "loss": 1.6708, "step": 1590 }, { "epoch": 0.23995173817962445, "grad_norm": 0.23762750588913742, "learning_rate": 4.3903253704972356e-07, "loss": 1.6214, "step": 1591 }, { "epoch": 0.24010255636829803, "grad_norm": 0.2564142889769098, "learning_rate": 4.3895954292078256e-07, "loss": 1.6223, "step": 1592 }, { "epoch": 0.24025337455697157, "grad_norm": 0.30659021520772384, "learning_rate": 4.388865119781492e-07, "loss": 1.6608, "step": 1593 }, { "epoch": 0.24040419274564512, "grad_norm": 0.2545411842270761, "learning_rate": 4.38813444238221e-07, "loss": 1.5091, "step": 1594 }, { "epoch": 0.2405550109343187, "grad_norm": 0.2433107911193599, "learning_rate": 4.3874033971740375e-07, "loss": 1.5504, "step": 1595 }, { "epoch": 0.24070582912299224, "grad_norm": 0.25584836780205455, "learning_rate": 4.386671984321116e-07, "loss": 1.533, "step": 1596 }, { "epoch": 0.24085664731166578, "grad_norm": 0.24865548225791834, "learning_rate": 4.3859402039876684e-07, "loss": 1.5324, "step": 1597 }, { "epoch": 0.24100746550033933, "grad_norm": 0.24093033741997524, "learning_rate": 4.3852080563380023e-07, "loss": 1.585, "step": 1598 }, { "epoch": 0.2411582836890129, "grad_norm": 0.24709009545789049, "learning_rate": 4.3844755415365043e-07, "loss": 1.5321, "step": 1599 }, { "epoch": 0.24130910187768645, "grad_norm": 0.2503465791645773, "learning_rate": 4.3837426597476455e-07, "loss": 1.5112, "step": 1600 }, { "epoch": 0.24145992006636, "grad_norm": 0.2509426828595389, "learning_rate": 4.383009411135981e-07, "loss": 1.5806, "step": 1601 }, { "epoch": 0.24161073825503357, "grad_norm": 0.24265146376897662, "learning_rate": 4.3822757958661445e-07, "loss": 1.5811, "step": 1602 }, { "epoch": 0.24176155644370712, "grad_norm": 0.2733494998765217, "learning_rate": 4.3815418141028546e-07, "loss": 1.5979, "step": 1603 }, { "epoch": 0.24191237463238066, "grad_norm": 0.2451539073765318, "learning_rate": 4.380807466010912e-07, "loss": 1.6545, "step": 1604 }, { "epoch": 0.2420631928210542, "grad_norm": 4.374804279747756, "learning_rate": 4.3800727517551984e-07, "loss": 1.6669, "step": 1605 }, { "epoch": 0.24221401100972778, "grad_norm": 0.25491735405130056, "learning_rate": 4.37933767150068e-07, "loss": 1.6174, "step": 1606 }, { "epoch": 0.24236482919840133, "grad_norm": 0.2633247174467053, "learning_rate": 4.378602225412402e-07, "loss": 1.5471, "step": 1607 }, { "epoch": 0.24251564738707487, "grad_norm": 0.2533193105973062, "learning_rate": 4.3778664136554945e-07, "loss": 1.5456, "step": 1608 }, { "epoch": 0.24266646557574845, "grad_norm": 0.27718357103998414, "learning_rate": 4.3771302363951684e-07, "loss": 1.5974, "step": 1609 }, { "epoch": 0.242817283764422, "grad_norm": 0.2525200412087624, "learning_rate": 4.3763936937967165e-07, "loss": 1.5636, "step": 1610 }, { "epoch": 0.24296810195309554, "grad_norm": 0.2527191841200574, "learning_rate": 4.375656786025515e-07, "loss": 1.5813, "step": 1611 }, { "epoch": 0.24311892014176908, "grad_norm": 0.2424044029783404, "learning_rate": 4.3749195132470205e-07, "loss": 1.5572, "step": 1612 }, { "epoch": 0.24326973833044266, "grad_norm": 0.26674412181726, "learning_rate": 4.374181875626772e-07, "loss": 1.6054, "step": 1613 }, { "epoch": 0.2434205565191162, "grad_norm": 0.24507553143528535, "learning_rate": 4.373443873330392e-07, "loss": 1.4877, "step": 1614 }, { "epoch": 0.24357137470778975, "grad_norm": 0.24946117678679283, "learning_rate": 4.3727055065235823e-07, "loss": 1.5787, "step": 1615 }, { "epoch": 0.24372219289646332, "grad_norm": 0.2555671451600455, "learning_rate": 4.3719667753721285e-07, "loss": 1.6166, "step": 1616 }, { "epoch": 0.24387301108513687, "grad_norm": 0.25242635778884653, "learning_rate": 4.3712276800418955e-07, "loss": 1.5867, "step": 1617 }, { "epoch": 0.24402382927381042, "grad_norm": 0.2548187267229543, "learning_rate": 4.3704882206988337e-07, "loss": 1.54, "step": 1618 }, { "epoch": 0.24417464746248396, "grad_norm": 0.2608444111849341, "learning_rate": 4.3697483975089715e-07, "loss": 1.5985, "step": 1619 }, { "epoch": 0.24432546565115754, "grad_norm": 0.26605893611604164, "learning_rate": 4.369008210638423e-07, "loss": 1.5876, "step": 1620 }, { "epoch": 0.24447628383983108, "grad_norm": 0.2556804828535995, "learning_rate": 4.3682676602533803e-07, "loss": 1.6201, "step": 1621 }, { "epoch": 0.24462710202850463, "grad_norm": 0.24406546279014849, "learning_rate": 4.3675267465201187e-07, "loss": 1.5356, "step": 1622 }, { "epoch": 0.2447779202171782, "grad_norm": 0.24947181715050515, "learning_rate": 4.366785469604994e-07, "loss": 1.5389, "step": 1623 }, { "epoch": 0.24492873840585175, "grad_norm": 0.25390938677924657, "learning_rate": 4.366043829674446e-07, "loss": 1.5537, "step": 1624 }, { "epoch": 0.2450795565945253, "grad_norm": 0.3393992031538395, "learning_rate": 4.365301826894994e-07, "loss": 1.5805, "step": 1625 }, { "epoch": 0.24523037478319884, "grad_norm": 0.25072179195252253, "learning_rate": 4.3645594614332395e-07, "loss": 1.5496, "step": 1626 }, { "epoch": 0.2453811929718724, "grad_norm": 0.24083494853172305, "learning_rate": 4.3638167334558635e-07, "loss": 1.5589, "step": 1627 }, { "epoch": 0.24553201116054596, "grad_norm": 0.25949341928748476, "learning_rate": 4.363073643129632e-07, "loss": 1.5614, "step": 1628 }, { "epoch": 0.2456828293492195, "grad_norm": 0.2411908488354143, "learning_rate": 4.36233019062139e-07, "loss": 1.5887, "step": 1629 }, { "epoch": 0.24583364753789308, "grad_norm": 0.25684724445685997, "learning_rate": 4.361586376098064e-07, "loss": 1.5917, "step": 1630 }, { "epoch": 0.24598446572656663, "grad_norm": 0.24380831232286737, "learning_rate": 4.3608421997266607e-07, "loss": 1.4848, "step": 1631 }, { "epoch": 0.24613528391524017, "grad_norm": 0.2483656328460639, "learning_rate": 4.3600976616742723e-07, "loss": 1.5928, "step": 1632 }, { "epoch": 0.24628610210391375, "grad_norm": 0.2540050298497444, "learning_rate": 4.359352762108067e-07, "loss": 1.5672, "step": 1633 }, { "epoch": 0.2464369202925873, "grad_norm": 0.24384300478168583, "learning_rate": 4.3586075011952964e-07, "loss": 1.5376, "step": 1634 }, { "epoch": 0.24658773848126084, "grad_norm": 0.24073011406086497, "learning_rate": 4.357861879103295e-07, "loss": 1.5768, "step": 1635 }, { "epoch": 0.24673855666993438, "grad_norm": 0.2514451360463635, "learning_rate": 4.3571158959994753e-07, "loss": 1.6003, "step": 1636 }, { "epoch": 0.24688937485860796, "grad_norm": 0.23943815733721524, "learning_rate": 4.356369552051333e-07, "loss": 1.5712, "step": 1637 }, { "epoch": 0.2470401930472815, "grad_norm": 0.2527623225302482, "learning_rate": 4.355622847426443e-07, "loss": 1.6135, "step": 1638 }, { "epoch": 0.24719101123595505, "grad_norm": 0.24943866281182905, "learning_rate": 4.3548757822924623e-07, "loss": 1.5667, "step": 1639 }, { "epoch": 0.24734182942462862, "grad_norm": 0.2543994544070957, "learning_rate": 4.35412835681713e-07, "loss": 1.6605, "step": 1640 }, { "epoch": 0.24749264761330217, "grad_norm": 0.49854934631647785, "learning_rate": 4.353380571168264e-07, "loss": 1.5982, "step": 1641 }, { "epoch": 0.24764346580197572, "grad_norm": 0.24169392277694388, "learning_rate": 4.352632425513764e-07, "loss": 1.5547, "step": 1642 }, { "epoch": 0.24779428399064926, "grad_norm": 0.24571868645006378, "learning_rate": 4.3518839200216116e-07, "loss": 1.5299, "step": 1643 }, { "epoch": 0.24794510217932283, "grad_norm": 0.24052959132029506, "learning_rate": 4.3511350548598663e-07, "loss": 1.6275, "step": 1644 }, { "epoch": 0.24809592036799638, "grad_norm": 0.2746076044352863, "learning_rate": 4.350385830196671e-07, "loss": 1.5231, "step": 1645 }, { "epoch": 0.24824673855666993, "grad_norm": 0.27728208574784535, "learning_rate": 4.349636246200249e-07, "loss": 1.5258, "step": 1646 }, { "epoch": 0.2483975567453435, "grad_norm": 0.2421299350714505, "learning_rate": 4.348886303038903e-07, "loss": 1.5285, "step": 1647 }, { "epoch": 0.24854837493401705, "grad_norm": 0.25132360998606307, "learning_rate": 4.348136000881017e-07, "loss": 1.6063, "step": 1648 }, { "epoch": 0.2486991931226906, "grad_norm": 0.2749557018517846, "learning_rate": 4.347385339895056e-07, "loss": 1.5415, "step": 1649 }, { "epoch": 0.24885001131136414, "grad_norm": 0.24158905852084034, "learning_rate": 4.346634320249565e-07, "loss": 1.5754, "step": 1650 }, { "epoch": 0.2490008295000377, "grad_norm": 0.3022825523735689, "learning_rate": 4.34588294211317e-07, "loss": 1.6446, "step": 1651 }, { "epoch": 0.24915164768871126, "grad_norm": 0.24616181996664016, "learning_rate": 4.3451312056545773e-07, "loss": 1.5699, "step": 1652 }, { "epoch": 0.2493024658773848, "grad_norm": 0.4045887878159248, "learning_rate": 4.3443791110425743e-07, "loss": 1.5591, "step": 1653 }, { "epoch": 0.24945328406605838, "grad_norm": 0.24005335829357874, "learning_rate": 4.343626658446026e-07, "loss": 1.5355, "step": 1654 }, { "epoch": 0.24960410225473192, "grad_norm": 0.2955294522291913, "learning_rate": 4.342873848033883e-07, "loss": 1.5653, "step": 1655 }, { "epoch": 0.24975492044340547, "grad_norm": 0.24029762289547038, "learning_rate": 4.3421206799751703e-07, "loss": 1.5152, "step": 1656 }, { "epoch": 0.24990573863207902, "grad_norm": 0.27040839936026695, "learning_rate": 4.341367154438998e-07, "loss": 1.562, "step": 1657 }, { "epoch": 0.25005655682075256, "grad_norm": 0.2490552430466942, "learning_rate": 4.340613271594553e-07, "loss": 1.5682, "step": 1658 }, { "epoch": 0.25020737500942614, "grad_norm": 0.3085223876082704, "learning_rate": 4.339859031611105e-07, "loss": 1.536, "step": 1659 }, { "epoch": 0.2503581931980997, "grad_norm": 0.24622530190986353, "learning_rate": 4.3391044346580027e-07, "loss": 1.5449, "step": 1660 }, { "epoch": 0.25050901138677323, "grad_norm": 0.4929943024064872, "learning_rate": 4.338349480904675e-07, "loss": 1.5288, "step": 1661 }, { "epoch": 0.2506598295754468, "grad_norm": 0.23007190138349864, "learning_rate": 4.337594170520631e-07, "loss": 1.479, "step": 1662 }, { "epoch": 0.2508106477641204, "grad_norm": 0.23837469219630214, "learning_rate": 4.33683850367546e-07, "loss": 1.5825, "step": 1663 }, { "epoch": 0.2509614659527939, "grad_norm": 0.2719929045506161, "learning_rate": 4.3360824805388313e-07, "loss": 1.5336, "step": 1664 }, { "epoch": 0.25111228414146747, "grad_norm": 0.27297958723610233, "learning_rate": 4.335326101280493e-07, "loss": 1.5636, "step": 1665 }, { "epoch": 0.25126310233014104, "grad_norm": 0.25677317032601776, "learning_rate": 4.3345693660702766e-07, "loss": 1.5933, "step": 1666 }, { "epoch": 0.25141392051881456, "grad_norm": 0.24753988356481488, "learning_rate": 4.3338122750780884e-07, "loss": 1.5948, "step": 1667 }, { "epoch": 0.25156473870748813, "grad_norm": 0.2550844565839159, "learning_rate": 4.3330548284739185e-07, "loss": 1.5243, "step": 1668 }, { "epoch": 0.25171555689616165, "grad_norm": 0.26171046048706154, "learning_rate": 4.332297026427837e-07, "loss": 1.4997, "step": 1669 }, { "epoch": 0.2518663750848352, "grad_norm": 0.25282478277315307, "learning_rate": 4.3315388691099913e-07, "loss": 1.5097, "step": 1670 }, { "epoch": 0.2520171932735088, "grad_norm": 0.2517793759803933, "learning_rate": 4.33078035669061e-07, "loss": 1.6149, "step": 1671 }, { "epoch": 0.2521680114621823, "grad_norm": 0.26008369217999744, "learning_rate": 4.3300214893400013e-07, "loss": 1.5947, "step": 1672 }, { "epoch": 0.2523188296508559, "grad_norm": 0.2683378898504414, "learning_rate": 4.3292622672285524e-07, "loss": 1.616, "step": 1673 }, { "epoch": 0.25246964783952947, "grad_norm": 0.23969156913581993, "learning_rate": 4.3285026905267315e-07, "loss": 1.5356, "step": 1674 }, { "epoch": 0.252620466028203, "grad_norm": 0.24086368898263558, "learning_rate": 4.3277427594050856e-07, "loss": 1.5343, "step": 1675 }, { "epoch": 0.25277128421687656, "grad_norm": 0.30730558248028683, "learning_rate": 4.326982474034241e-07, "loss": 1.6644, "step": 1676 }, { "epoch": 0.25292210240555013, "grad_norm": 0.24569205171743508, "learning_rate": 4.3262218345849046e-07, "loss": 1.5736, "step": 1677 }, { "epoch": 0.25307292059422365, "grad_norm": 0.25356805089460793, "learning_rate": 4.3254608412278614e-07, "loss": 1.4976, "step": 1678 }, { "epoch": 0.2532237387828972, "grad_norm": 0.2530609741816862, "learning_rate": 4.324699494133977e-07, "loss": 1.6002, "step": 1679 }, { "epoch": 0.2533745569715708, "grad_norm": 0.24265654867594905, "learning_rate": 4.3239377934741953e-07, "loss": 1.5675, "step": 1680 }, { "epoch": 0.2535253751602443, "grad_norm": 0.6132678415327332, "learning_rate": 4.323175739419542e-07, "loss": 1.5932, "step": 1681 }, { "epoch": 0.2536761933489179, "grad_norm": 0.2531688134086436, "learning_rate": 4.322413332141118e-07, "loss": 1.535, "step": 1682 }, { "epoch": 0.2538270115375914, "grad_norm": 0.2429492542086216, "learning_rate": 4.321650571810109e-07, "loss": 1.5977, "step": 1683 }, { "epoch": 0.253977829726265, "grad_norm": 0.2594783662561257, "learning_rate": 4.320887458597773e-07, "loss": 1.6983, "step": 1684 }, { "epoch": 0.25412864791493855, "grad_norm": 0.2556315585430283, "learning_rate": 4.3201239926754545e-07, "loss": 1.5241, "step": 1685 }, { "epoch": 0.2542794661036121, "grad_norm": 0.26542964134981467, "learning_rate": 4.3193601742145717e-07, "loss": 1.6081, "step": 1686 }, { "epoch": 0.25443028429228565, "grad_norm": 2.0009433719878618, "learning_rate": 4.318596003386626e-07, "loss": 1.4759, "step": 1687 }, { "epoch": 0.2545811024809592, "grad_norm": 0.25061467648599284, "learning_rate": 4.3178314803631945e-07, "loss": 1.5928, "step": 1688 }, { "epoch": 0.25473192066963274, "grad_norm": 0.4110198564177995, "learning_rate": 4.317066605315935e-07, "loss": 1.5477, "step": 1689 }, { "epoch": 0.2548827388583063, "grad_norm": 0.24508704358544142, "learning_rate": 4.3163013784165855e-07, "loss": 1.5367, "step": 1690 }, { "epoch": 0.2550335570469799, "grad_norm": 0.24949565040040414, "learning_rate": 4.31553579983696e-07, "loss": 1.5979, "step": 1691 }, { "epoch": 0.2551843752356534, "grad_norm": 0.2678625870670917, "learning_rate": 4.314769869748954e-07, "loss": 1.5595, "step": 1692 }, { "epoch": 0.255335193424327, "grad_norm": 0.6388075209881965, "learning_rate": 4.3140035883245407e-07, "loss": 1.5449, "step": 1693 }, { "epoch": 0.25548601161300055, "grad_norm": 0.2502282071079168, "learning_rate": 4.313236955735774e-07, "loss": 1.5923, "step": 1694 }, { "epoch": 0.25563682980167407, "grad_norm": 0.3161482244689952, "learning_rate": 4.312469972154784e-07, "loss": 1.5193, "step": 1695 }, { "epoch": 0.25578764799034764, "grad_norm": 0.24365107691376364, "learning_rate": 4.31170263775378e-07, "loss": 1.6498, "step": 1696 }, { "epoch": 0.25593846617902116, "grad_norm": 0.24672334626379835, "learning_rate": 4.3109349527050543e-07, "loss": 1.5974, "step": 1697 }, { "epoch": 0.25608928436769474, "grad_norm": 0.24377888393499542, "learning_rate": 4.3101669171809703e-07, "loss": 1.5205, "step": 1698 }, { "epoch": 0.2562401025563683, "grad_norm": 0.24090813425503693, "learning_rate": 4.309398531353977e-07, "loss": 1.6216, "step": 1699 }, { "epoch": 0.25639092074504183, "grad_norm": 0.27017836011261337, "learning_rate": 4.3086297953965987e-07, "loss": 1.5731, "step": 1700 }, { "epoch": 0.2565417389337154, "grad_norm": 0.24395037223746055, "learning_rate": 4.307860709481439e-07, "loss": 1.5249, "step": 1701 }, { "epoch": 0.256692557122389, "grad_norm": 0.2517507323798212, "learning_rate": 4.30709127378118e-07, "loss": 1.6236, "step": 1702 }, { "epoch": 0.2568433753110625, "grad_norm": 0.2564053341185109, "learning_rate": 4.306321488468583e-07, "loss": 1.5775, "step": 1703 }, { "epoch": 0.25699419349973607, "grad_norm": 0.24955879072954779, "learning_rate": 4.3055513537164856e-07, "loss": 1.5523, "step": 1704 }, { "epoch": 0.25714501168840964, "grad_norm": 0.24512985002257073, "learning_rate": 4.304780869697807e-07, "loss": 1.5565, "step": 1705 }, { "epoch": 0.25729582987708316, "grad_norm": 0.23499944533232645, "learning_rate": 4.3040100365855443e-07, "loss": 1.5503, "step": 1706 }, { "epoch": 0.25744664806575673, "grad_norm": 0.2566829771396666, "learning_rate": 4.3032388545527694e-07, "loss": 1.5505, "step": 1707 }, { "epoch": 0.2575974662544303, "grad_norm": 0.24099414263993085, "learning_rate": 4.302467323772637e-07, "loss": 1.5022, "step": 1708 }, { "epoch": 0.2577482844431038, "grad_norm": 0.24235084453227237, "learning_rate": 4.3016954444183767e-07, "loss": 1.5593, "step": 1709 }, { "epoch": 0.2578991026317774, "grad_norm": 0.4725987508836748, "learning_rate": 4.300923216663299e-07, "loss": 1.5162, "step": 1710 }, { "epoch": 0.258049920820451, "grad_norm": 0.2661850939605425, "learning_rate": 4.3001506406807913e-07, "loss": 1.5795, "step": 1711 }, { "epoch": 0.2582007390091245, "grad_norm": 0.2515271806280741, "learning_rate": 4.29937771664432e-07, "loss": 1.62, "step": 1712 }, { "epoch": 0.25835155719779807, "grad_norm": 0.24030278635856814, "learning_rate": 4.2986044447274273e-07, "loss": 1.6004, "step": 1713 }, { "epoch": 0.2585023753864716, "grad_norm": 0.26136403601121966, "learning_rate": 4.2978308251037376e-07, "loss": 1.5062, "step": 1714 }, { "epoch": 0.25865319357514516, "grad_norm": 0.25022243826164337, "learning_rate": 4.297056857946949e-07, "loss": 1.5679, "step": 1715 }, { "epoch": 0.25880401176381873, "grad_norm": 0.238469988701716, "learning_rate": 4.2962825434308406e-07, "loss": 1.6042, "step": 1716 }, { "epoch": 0.25895482995249225, "grad_norm": 0.2593833044846852, "learning_rate": 4.295507881729269e-07, "loss": 1.5535, "step": 1717 }, { "epoch": 0.2591056481411658, "grad_norm": 0.25348258747119135, "learning_rate": 4.2947328730161674e-07, "loss": 1.5699, "step": 1718 }, { "epoch": 0.2592564663298394, "grad_norm": 0.2515133377116682, "learning_rate": 4.2939575174655484e-07, "loss": 1.621, "step": 1719 }, { "epoch": 0.2594072845185129, "grad_norm": 0.30768284674184804, "learning_rate": 4.2931818152515016e-07, "loss": 1.5851, "step": 1720 }, { "epoch": 0.2595581027071865, "grad_norm": 0.24333286855773978, "learning_rate": 4.292405766548195e-07, "loss": 1.6054, "step": 1721 }, { "epoch": 0.25970892089586006, "grad_norm": 0.2450978395943358, "learning_rate": 4.291629371529874e-07, "loss": 1.602, "step": 1722 }, { "epoch": 0.2598597390845336, "grad_norm": 0.3485855472756898, "learning_rate": 4.2908526303708613e-07, "loss": 1.5424, "step": 1723 }, { "epoch": 0.26001055727320715, "grad_norm": 0.24658756574185361, "learning_rate": 4.29007554324556e-07, "loss": 1.5697, "step": 1724 }, { "epoch": 0.26016137546188073, "grad_norm": 0.24562117457221536, "learning_rate": 4.2892981103284457e-07, "loss": 1.4849, "step": 1725 }, { "epoch": 0.26031219365055425, "grad_norm": 0.2919432368155901, "learning_rate": 4.2885203317940766e-07, "loss": 1.6167, "step": 1726 }, { "epoch": 0.2604630118392278, "grad_norm": 0.2379643579981179, "learning_rate": 4.287742207817087e-07, "loss": 1.5943, "step": 1727 }, { "epoch": 0.26061383002790134, "grad_norm": 0.27234287100086396, "learning_rate": 4.286963738572187e-07, "loss": 1.5307, "step": 1728 }, { "epoch": 0.2607646482165749, "grad_norm": 0.23794033215323887, "learning_rate": 4.286184924234167e-07, "loss": 1.5633, "step": 1729 }, { "epoch": 0.2609154664052485, "grad_norm": 0.24534584613674915, "learning_rate": 4.285405764977892e-07, "loss": 1.5393, "step": 1730 }, { "epoch": 0.261066284593922, "grad_norm": 0.24622183805830222, "learning_rate": 4.2846262609783083e-07, "loss": 1.5555, "step": 1731 }, { "epoch": 0.2612171027825956, "grad_norm": 0.2473139121117716, "learning_rate": 4.283846412410435e-07, "loss": 1.558, "step": 1732 }, { "epoch": 0.26136792097126915, "grad_norm": 0.2503546090239365, "learning_rate": 4.2830662194493707e-07, "loss": 1.5865, "step": 1733 }, { "epoch": 0.26151873915994267, "grad_norm": 0.24329780657648137, "learning_rate": 4.2822856822702935e-07, "loss": 1.5058, "step": 1734 }, { "epoch": 0.26166955734861624, "grad_norm": 4.367126003159523, "learning_rate": 4.281504801048455e-07, "loss": 1.5286, "step": 1735 }, { "epoch": 0.2618203755372898, "grad_norm": 0.24525797410772415, "learning_rate": 4.2807235759591867e-07, "loss": 1.5335, "step": 1736 }, { "epoch": 0.26197119372596334, "grad_norm": 0.25112320477786404, "learning_rate": 4.279942007177896e-07, "loss": 1.5799, "step": 1737 }, { "epoch": 0.2621220119146369, "grad_norm": 0.25664626588007966, "learning_rate": 4.279160094880067e-07, "loss": 1.5918, "step": 1738 }, { "epoch": 0.2622728301033105, "grad_norm": 0.24580238927716838, "learning_rate": 4.278377839241263e-07, "loss": 1.5439, "step": 1739 }, { "epoch": 0.262423648291984, "grad_norm": 0.24261438363071613, "learning_rate": 4.2775952404371223e-07, "loss": 1.5971, "step": 1740 }, { "epoch": 0.2625744664806576, "grad_norm": 0.2479096132206566, "learning_rate": 4.276812298643363e-07, "loss": 1.5357, "step": 1741 }, { "epoch": 0.2627252846693311, "grad_norm": 0.24731093696997355, "learning_rate": 4.276029014035776e-07, "loss": 1.5494, "step": 1742 }, { "epoch": 0.26287610285800467, "grad_norm": 0.2493656978258735, "learning_rate": 4.275245386790233e-07, "loss": 1.567, "step": 1743 }, { "epoch": 0.26302692104667824, "grad_norm": 0.2580901877251635, "learning_rate": 4.27446141708268e-07, "loss": 1.6034, "step": 1744 }, { "epoch": 0.26317773923535176, "grad_norm": 0.2507352232466474, "learning_rate": 4.273677105089143e-07, "loss": 1.6311, "step": 1745 }, { "epoch": 0.26332855742402533, "grad_norm": 0.2702052480530884, "learning_rate": 4.27289245098572e-07, "loss": 1.5518, "step": 1746 }, { "epoch": 0.2634793756126989, "grad_norm": 0.23892961225211057, "learning_rate": 4.2721074549485907e-07, "loss": 1.5522, "step": 1747 }, { "epoch": 0.2636301938013724, "grad_norm": 2.167013539159077, "learning_rate": 4.271322117154009e-07, "loss": 1.5226, "step": 1748 }, { "epoch": 0.263781011990046, "grad_norm": 0.28077461392710434, "learning_rate": 4.2705364377783066e-07, "loss": 1.5568, "step": 1749 }, { "epoch": 0.2639318301787196, "grad_norm": 0.24046928533669656, "learning_rate": 4.26975041699789e-07, "loss": 1.5687, "step": 1750 }, { "epoch": 0.2640826483673931, "grad_norm": 0.25338651586610195, "learning_rate": 4.268964054989246e-07, "loss": 1.5549, "step": 1751 }, { "epoch": 0.26423346655606667, "grad_norm": 0.2631495053728446, "learning_rate": 4.2681773519289343e-07, "loss": 1.5547, "step": 1752 }, { "epoch": 0.26438428474474024, "grad_norm": 0.4099460775581195, "learning_rate": 4.267390307993592e-07, "loss": 1.5794, "step": 1753 }, { "epoch": 0.26453510293341376, "grad_norm": 0.2541266674653664, "learning_rate": 4.2666029233599355e-07, "loss": 1.5892, "step": 1754 }, { "epoch": 0.26468592112208733, "grad_norm": 0.25026334883655993, "learning_rate": 4.2658151982047534e-07, "loss": 1.5842, "step": 1755 }, { "epoch": 0.2648367393107609, "grad_norm": 0.2397883491629975, "learning_rate": 4.2650271327049146e-07, "loss": 1.589, "step": 1756 }, { "epoch": 0.2649875574994344, "grad_norm": 0.27441100966125137, "learning_rate": 4.2642387270373614e-07, "loss": 1.617, "step": 1757 }, { "epoch": 0.265138375688108, "grad_norm": 0.23639191026103049, "learning_rate": 4.263449981379115e-07, "loss": 1.517, "step": 1758 }, { "epoch": 0.2652891938767815, "grad_norm": 0.2699309144334069, "learning_rate": 4.262660895907271e-07, "loss": 1.6319, "step": 1759 }, { "epoch": 0.2654400120654551, "grad_norm": 0.2445276874634692, "learning_rate": 4.2618714707990015e-07, "loss": 1.5247, "step": 1760 }, { "epoch": 0.26559083025412866, "grad_norm": 0.25528482755111215, "learning_rate": 4.2610817062315576e-07, "loss": 1.5657, "step": 1761 }, { "epoch": 0.2657416484428022, "grad_norm": 0.27608493129247075, "learning_rate": 4.2602916023822625e-07, "loss": 1.5573, "step": 1762 }, { "epoch": 0.26589246663147575, "grad_norm": 0.2757447732664978, "learning_rate": 4.259501159428518e-07, "loss": 1.5814, "step": 1763 }, { "epoch": 0.26604328482014933, "grad_norm": 0.260561028735259, "learning_rate": 4.2587103775478025e-07, "loss": 1.641, "step": 1764 }, { "epoch": 0.26619410300882285, "grad_norm": 0.26360915418753517, "learning_rate": 4.257919256917668e-07, "loss": 1.5619, "step": 1765 }, { "epoch": 0.2663449211974964, "grad_norm": 0.2590646826792949, "learning_rate": 4.257127797715744e-07, "loss": 1.5823, "step": 1766 }, { "epoch": 0.26649573938617, "grad_norm": 0.2832432560993926, "learning_rate": 4.256336000119738e-07, "loss": 1.5921, "step": 1767 }, { "epoch": 0.2666465575748435, "grad_norm": 0.2638014243597412, "learning_rate": 4.255543864307431e-07, "loss": 1.5678, "step": 1768 }, { "epoch": 0.2667973757635171, "grad_norm": 0.5282768986208761, "learning_rate": 4.2547513904566796e-07, "loss": 1.676, "step": 1769 }, { "epoch": 0.26694819395219066, "grad_norm": 0.2510764884728056, "learning_rate": 4.253958578745418e-07, "loss": 1.5789, "step": 1770 }, { "epoch": 0.2670990121408642, "grad_norm": 0.25717667833243285, "learning_rate": 4.2531654293516556e-07, "loss": 1.5463, "step": 1771 }, { "epoch": 0.26724983032953775, "grad_norm": 0.28771594463963907, "learning_rate": 4.2523719424534765e-07, "loss": 1.5332, "step": 1772 }, { "epoch": 0.26740064851821127, "grad_norm": 0.2608554228521943, "learning_rate": 4.251578118229043e-07, "loss": 1.6115, "step": 1773 }, { "epoch": 0.26755146670688484, "grad_norm": 0.26476231982666265, "learning_rate": 4.2507839568565914e-07, "loss": 1.6156, "step": 1774 }, { "epoch": 0.2677022848955584, "grad_norm": 0.2543292846190255, "learning_rate": 4.249989458514433e-07, "loss": 1.545, "step": 1775 }, { "epoch": 0.26785310308423194, "grad_norm": 0.24208442426546845, "learning_rate": 4.249194623380957e-07, "loss": 1.5592, "step": 1776 }, { "epoch": 0.2680039212729055, "grad_norm": 0.24415812731000183, "learning_rate": 4.248399451634627e-07, "loss": 1.5207, "step": 1777 }, { "epoch": 0.2681547394615791, "grad_norm": 0.3047787053782849, "learning_rate": 4.247603943453982e-07, "loss": 1.5716, "step": 1778 }, { "epoch": 0.2683055576502526, "grad_norm": 0.24046780380561628, "learning_rate": 4.246808099017636e-07, "loss": 1.5271, "step": 1779 }, { "epoch": 0.2684563758389262, "grad_norm": 0.24969244713149716, "learning_rate": 4.246011918504281e-07, "loss": 1.55, "step": 1780 }, { "epoch": 0.26860719402759975, "grad_norm": 0.2506095835537955, "learning_rate": 4.24521540209268e-07, "loss": 1.5318, "step": 1781 }, { "epoch": 0.26875801221627327, "grad_norm": 0.2591988512992042, "learning_rate": 4.2444185499616777e-07, "loss": 1.5703, "step": 1782 }, { "epoch": 0.26890883040494684, "grad_norm": 0.25705381864614046, "learning_rate": 4.243621362290187e-07, "loss": 1.5615, "step": 1783 }, { "epoch": 0.2690596485936204, "grad_norm": 0.3141365686214411, "learning_rate": 4.2428238392572014e-07, "loss": 1.6309, "step": 1784 }, { "epoch": 0.26921046678229393, "grad_norm": 0.272157584434689, "learning_rate": 4.242025981041789e-07, "loss": 1.5744, "step": 1785 }, { "epoch": 0.2693612849709675, "grad_norm": 0.5842370568199183, "learning_rate": 4.2412277878230913e-07, "loss": 1.5694, "step": 1786 }, { "epoch": 0.269512103159641, "grad_norm": 0.23365515570093115, "learning_rate": 4.240429259780325e-07, "loss": 1.5745, "step": 1787 }, { "epoch": 0.2696629213483146, "grad_norm": 0.25109599306290425, "learning_rate": 4.2396303970927845e-07, "loss": 1.5455, "step": 1788 }, { "epoch": 0.2698137395369882, "grad_norm": 0.2501155339934986, "learning_rate": 4.238831199939837e-07, "loss": 1.5548, "step": 1789 }, { "epoch": 0.2699645577256617, "grad_norm": 0.2648447708666371, "learning_rate": 4.2380316685009256e-07, "loss": 1.6197, "step": 1790 }, { "epoch": 0.27011537591433527, "grad_norm": 0.2644931125098561, "learning_rate": 4.237231802955568e-07, "loss": 1.5647, "step": 1791 }, { "epoch": 0.27026619410300884, "grad_norm": 0.30971024477150005, "learning_rate": 4.236431603483357e-07, "loss": 1.6217, "step": 1792 }, { "epoch": 0.27041701229168236, "grad_norm": 0.26442207873245427, "learning_rate": 4.2356310702639626e-07, "loss": 1.5302, "step": 1793 }, { "epoch": 0.27056783048035593, "grad_norm": 0.2529394164738092, "learning_rate": 4.234830203477126e-07, "loss": 1.6298, "step": 1794 }, { "epoch": 0.2707186486690295, "grad_norm": 0.24635054290731914, "learning_rate": 4.2340290033026654e-07, "loss": 1.5682, "step": 1795 }, { "epoch": 0.270869466857703, "grad_norm": 0.2509760402983561, "learning_rate": 4.233227469920474e-07, "loss": 1.5543, "step": 1796 }, { "epoch": 0.2710202850463766, "grad_norm": 0.24652129894552294, "learning_rate": 4.23242560351052e-07, "loss": 1.5764, "step": 1797 }, { "epoch": 0.27117110323505017, "grad_norm": 0.24762894141823835, "learning_rate": 4.231623404252845e-07, "loss": 1.5256, "step": 1798 }, { "epoch": 0.2713219214237237, "grad_norm": 0.3218613310706804, "learning_rate": 4.230820872327565e-07, "loss": 1.5912, "step": 1799 }, { "epoch": 0.27147273961239726, "grad_norm": 0.25064303095633134, "learning_rate": 4.2300180079148726e-07, "loss": 1.5687, "step": 1800 }, { "epoch": 0.27162355780107084, "grad_norm": 0.2535822235378952, "learning_rate": 4.229214811195036e-07, "loss": 1.5286, "step": 1801 }, { "epoch": 0.27177437598974435, "grad_norm": 0.4217560376619295, "learning_rate": 4.228411282348394e-07, "loss": 1.526, "step": 1802 }, { "epoch": 0.27192519417841793, "grad_norm": 0.246226672236237, "learning_rate": 4.227607421555363e-07, "loss": 1.6019, "step": 1803 }, { "epoch": 0.27207601236709145, "grad_norm": 0.2568678331029727, "learning_rate": 4.2268032289964336e-07, "loss": 1.5312, "step": 1804 }, { "epoch": 0.272226830555765, "grad_norm": 0.24918046149982423, "learning_rate": 4.22599870485217e-07, "loss": 1.5058, "step": 1805 }, { "epoch": 0.2723776487444386, "grad_norm": 0.24894166724012773, "learning_rate": 4.2251938493032105e-07, "loss": 1.5814, "step": 1806 }, { "epoch": 0.2725284669331121, "grad_norm": 0.25993011394727705, "learning_rate": 4.2243886625302704e-07, "loss": 1.5733, "step": 1807 }, { "epoch": 0.2726792851217857, "grad_norm": 0.27306733568102737, "learning_rate": 4.2235831447141365e-07, "loss": 1.5702, "step": 1808 }, { "epoch": 0.27283010331045926, "grad_norm": 0.25989108875404365, "learning_rate": 4.2227772960356705e-07, "loss": 1.5981, "step": 1809 }, { "epoch": 0.2729809214991328, "grad_norm": 0.26149823478332435, "learning_rate": 4.2219711166758103e-07, "loss": 1.5908, "step": 1810 }, { "epoch": 0.27313173968780635, "grad_norm": 0.26026579015433193, "learning_rate": 4.2211646068155653e-07, "loss": 1.5659, "step": 1811 }, { "epoch": 0.2732825578764799, "grad_norm": 0.2786267119059909, "learning_rate": 4.220357766636021e-07, "loss": 1.4934, "step": 1812 }, { "epoch": 0.27343337606515344, "grad_norm": 0.24873854300957907, "learning_rate": 4.2195505963183366e-07, "loss": 1.5366, "step": 1813 }, { "epoch": 0.273584194253827, "grad_norm": 0.2450054811517773, "learning_rate": 4.218743096043746e-07, "loss": 1.5071, "step": 1814 }, { "epoch": 0.2737350124425006, "grad_norm": 0.24452395711635164, "learning_rate": 4.217935265993555e-07, "loss": 1.635, "step": 1815 }, { "epoch": 0.2738858306311741, "grad_norm": 0.26799091446406315, "learning_rate": 4.217127106349146e-07, "loss": 1.5429, "step": 1816 }, { "epoch": 0.2740366488198477, "grad_norm": 0.2529320540608072, "learning_rate": 4.216318617291974e-07, "loss": 1.5548, "step": 1817 }, { "epoch": 0.2741874670085212, "grad_norm": 0.3240342155926816, "learning_rate": 4.2155097990035686e-07, "loss": 1.4814, "step": 1818 }, { "epoch": 0.2743382851971948, "grad_norm": 0.25928057871799937, "learning_rate": 4.2147006516655336e-07, "loss": 1.6202, "step": 1819 }, { "epoch": 0.27448910338586835, "grad_norm": 0.2637460831085882, "learning_rate": 4.2138911754595447e-07, "loss": 1.5331, "step": 1820 }, { "epoch": 0.27463992157454187, "grad_norm": 0.40213844229973383, "learning_rate": 4.213081370567354e-07, "loss": 1.5896, "step": 1821 }, { "epoch": 0.27479073976321544, "grad_norm": 0.2635155410550008, "learning_rate": 4.2122712371707857e-07, "loss": 1.5273, "step": 1822 }, { "epoch": 0.274941557951889, "grad_norm": 0.251759146208383, "learning_rate": 4.2114607754517385e-07, "loss": 1.5721, "step": 1823 }, { "epoch": 0.27509237614056253, "grad_norm": 0.2390934984695804, "learning_rate": 4.210649985592186e-07, "loss": 1.5836, "step": 1824 }, { "epoch": 0.2752431943292361, "grad_norm": 0.24859484293238596, "learning_rate": 4.2098388677741707e-07, "loss": 1.6303, "step": 1825 }, { "epoch": 0.2753940125179097, "grad_norm": 0.3789595946124548, "learning_rate": 4.2090274221798166e-07, "loss": 1.7072, "step": 1826 }, { "epoch": 0.2755448307065832, "grad_norm": 0.2602972456576551, "learning_rate": 4.2082156489913134e-07, "loss": 1.6016, "step": 1827 }, { "epoch": 0.2756956488952568, "grad_norm": 0.2697359521626207, "learning_rate": 4.207403548390929e-07, "loss": 1.5317, "step": 1828 }, { "epoch": 0.27584646708393035, "grad_norm": 0.2568251490608846, "learning_rate": 4.2065911205610044e-07, "loss": 1.4874, "step": 1829 }, { "epoch": 0.27599728527260387, "grad_norm": 0.2511200614946822, "learning_rate": 4.2057783656839517e-07, "loss": 1.6139, "step": 1830 }, { "epoch": 0.27614810346127744, "grad_norm": 0.24320173272634724, "learning_rate": 4.20496528394226e-07, "loss": 1.6247, "step": 1831 }, { "epoch": 0.27629892164995096, "grad_norm": 0.28999189566860223, "learning_rate": 4.2041518755184886e-07, "loss": 1.5557, "step": 1832 }, { "epoch": 0.27644973983862453, "grad_norm": 0.24945551894216397, "learning_rate": 4.203338140595271e-07, "loss": 1.6079, "step": 1833 }, { "epoch": 0.2766005580272981, "grad_norm": 0.24195153170569705, "learning_rate": 4.2025240793553155e-07, "loss": 1.5163, "step": 1834 }, { "epoch": 0.2767513762159716, "grad_norm": 0.258263074146948, "learning_rate": 4.201709691981402e-07, "loss": 1.4975, "step": 1835 }, { "epoch": 0.2769021944046452, "grad_norm": 0.3224877359796225, "learning_rate": 4.2008949786563837e-07, "loss": 1.5915, "step": 1836 }, { "epoch": 0.27705301259331877, "grad_norm": 0.24634862739392877, "learning_rate": 4.200079939563188e-07, "loss": 1.5729, "step": 1837 }, { "epoch": 0.2772038307819923, "grad_norm": 0.25703866871533637, "learning_rate": 4.1992645748848156e-07, "loss": 1.5443, "step": 1838 }, { "epoch": 0.27735464897066586, "grad_norm": 0.9585138467259852, "learning_rate": 4.198448884804338e-07, "loss": 1.5301, "step": 1839 }, { "epoch": 0.27750546715933944, "grad_norm": 0.26208245329281477, "learning_rate": 4.197632869504903e-07, "loss": 1.569, "step": 1840 }, { "epoch": 0.27765628534801295, "grad_norm": 0.2513114884655622, "learning_rate": 4.196816529169729e-07, "loss": 1.605, "step": 1841 }, { "epoch": 0.27780710353668653, "grad_norm": 0.2530136474115613, "learning_rate": 4.1959998639821076e-07, "loss": 1.5764, "step": 1842 }, { "epoch": 0.2779579217253601, "grad_norm": 0.2757870606707001, "learning_rate": 4.195182874125405e-07, "loss": 1.6069, "step": 1843 }, { "epoch": 0.2781087399140336, "grad_norm": 0.25075678759563796, "learning_rate": 4.1943655597830585e-07, "loss": 1.4765, "step": 1844 }, { "epoch": 0.2782595581027072, "grad_norm": 0.24819323725669667, "learning_rate": 4.1935479211385793e-07, "loss": 1.5187, "step": 1845 }, { "epoch": 0.27841037629138077, "grad_norm": 0.2539600806747054, "learning_rate": 4.192729958375551e-07, "loss": 1.525, "step": 1846 }, { "epoch": 0.2785611944800543, "grad_norm": 0.2500165991563776, "learning_rate": 4.1919116716776306e-07, "loss": 1.5998, "step": 1847 }, { "epoch": 0.27871201266872786, "grad_norm": 0.25093956805624584, "learning_rate": 4.1910930612285463e-07, "loss": 1.5428, "step": 1848 }, { "epoch": 0.2788628308574014, "grad_norm": 0.24599969788501466, "learning_rate": 4.190274127212101e-07, "loss": 1.5203, "step": 1849 }, { "epoch": 0.27901364904607495, "grad_norm": 0.24701840386671767, "learning_rate": 4.189454869812168e-07, "loss": 1.5905, "step": 1850 }, { "epoch": 0.2791644672347485, "grad_norm": 0.2593434020338551, "learning_rate": 4.188635289212696e-07, "loss": 1.5726, "step": 1851 }, { "epoch": 0.27931528542342204, "grad_norm": 0.2433982875221031, "learning_rate": 4.1878153855977026e-07, "loss": 1.5422, "step": 1852 }, { "epoch": 0.2794661036120956, "grad_norm": 0.25178640775843025, "learning_rate": 4.1869951591512825e-07, "loss": 1.6041, "step": 1853 }, { "epoch": 0.2796169218007692, "grad_norm": 0.3112800150444886, "learning_rate": 4.186174610057599e-07, "loss": 1.6311, "step": 1854 }, { "epoch": 0.2797677399894427, "grad_norm": 0.34722227334108646, "learning_rate": 4.18535373850089e-07, "loss": 1.5713, "step": 1855 }, { "epoch": 0.2799185581781163, "grad_norm": 0.2556966025664008, "learning_rate": 4.184532544665463e-07, "loss": 1.6148, "step": 1856 }, { "epoch": 0.28006937636678986, "grad_norm": 0.25148191572401546, "learning_rate": 4.1837110287357036e-07, "loss": 1.5831, "step": 1857 }, { "epoch": 0.2802201945554634, "grad_norm": 0.3077566897785846, "learning_rate": 4.182889190896063e-07, "loss": 1.6374, "step": 1858 }, { "epoch": 0.28037101274413695, "grad_norm": 0.26809303377651905, "learning_rate": 4.182067031331068e-07, "loss": 1.5719, "step": 1859 }, { "epoch": 0.2805218309328105, "grad_norm": 0.2788609639999816, "learning_rate": 4.1812445502253187e-07, "loss": 1.6558, "step": 1860 }, { "epoch": 0.28067264912148404, "grad_norm": 0.2508605671959314, "learning_rate": 4.180421747763485e-07, "loss": 1.5351, "step": 1861 }, { "epoch": 0.2808234673101576, "grad_norm": 0.2575116015673467, "learning_rate": 4.179598624130311e-07, "loss": 1.5704, "step": 1862 }, { "epoch": 0.28097428549883113, "grad_norm": 0.26051246201496414, "learning_rate": 4.178775179510611e-07, "loss": 1.5248, "step": 1863 }, { "epoch": 0.2811251036875047, "grad_norm": 0.3062813945889822, "learning_rate": 4.177951414089273e-07, "loss": 1.55, "step": 1864 }, { "epoch": 0.2812759218761783, "grad_norm": 0.2452833635928116, "learning_rate": 4.1771273280512563e-07, "loss": 1.5735, "step": 1865 }, { "epoch": 0.2814267400648518, "grad_norm": 0.3323851621699343, "learning_rate": 4.1763029215815917e-07, "loss": 1.6476, "step": 1866 }, { "epoch": 0.2815775582535254, "grad_norm": 0.3002016764143178, "learning_rate": 4.175478194865382e-07, "loss": 1.5434, "step": 1867 }, { "epoch": 0.28172837644219895, "grad_norm": 0.24134420662883305, "learning_rate": 4.1746531480878034e-07, "loss": 1.5084, "step": 1868 }, { "epoch": 0.28187919463087246, "grad_norm": 0.25200348166629133, "learning_rate": 4.173827781434103e-07, "loss": 1.5788, "step": 1869 }, { "epoch": 0.28203001281954604, "grad_norm": 0.8112500246985829, "learning_rate": 4.1730020950895984e-07, "loss": 1.5803, "step": 1870 }, { "epoch": 0.2821808310082196, "grad_norm": 0.25242595654431865, "learning_rate": 4.172176089239682e-07, "loss": 1.5233, "step": 1871 }, { "epoch": 0.28233164919689313, "grad_norm": 0.2477708621373277, "learning_rate": 4.171349764069814e-07, "loss": 1.6228, "step": 1872 }, { "epoch": 0.2824824673855667, "grad_norm": 0.25166770520066656, "learning_rate": 4.170523119765531e-07, "loss": 1.5883, "step": 1873 }, { "epoch": 0.2826332855742403, "grad_norm": 0.2488566651756397, "learning_rate": 4.1696961565124365e-07, "loss": 1.6172, "step": 1874 }, { "epoch": 0.2827841037629138, "grad_norm": 0.26082171280059746, "learning_rate": 4.168868874496209e-07, "loss": 1.6535, "step": 1875 }, { "epoch": 0.28293492195158737, "grad_norm": 0.2574831982077194, "learning_rate": 4.1680412739025975e-07, "loss": 1.5699, "step": 1876 }, { "epoch": 0.2830857401402609, "grad_norm": 0.25686836612569874, "learning_rate": 4.1672133549174213e-07, "loss": 1.5909, "step": 1877 }, { "epoch": 0.28323655832893446, "grad_norm": 0.2626699964340152, "learning_rate": 4.166385117726574e-07, "loss": 1.5446, "step": 1878 }, { "epoch": 0.28338737651760804, "grad_norm": 0.2457572940259408, "learning_rate": 4.1655565625160175e-07, "loss": 1.6223, "step": 1879 }, { "epoch": 0.28353819470628155, "grad_norm": 0.2591728510568822, "learning_rate": 4.1647276894717877e-07, "loss": 1.5866, "step": 1880 }, { "epoch": 0.28368901289495513, "grad_norm": 0.2525946883907776, "learning_rate": 4.1638984987799896e-07, "loss": 1.6028, "step": 1881 }, { "epoch": 0.2838398310836287, "grad_norm": 0.26481841535374673, "learning_rate": 4.1630689906268015e-07, "loss": 1.5451, "step": 1882 }, { "epoch": 0.2839906492723022, "grad_norm": 0.26108338852677637, "learning_rate": 4.162239165198472e-07, "loss": 1.5494, "step": 1883 }, { "epoch": 0.2841414674609758, "grad_norm": 0.28379144963495145, "learning_rate": 4.161409022681321e-07, "loss": 1.5438, "step": 1884 }, { "epoch": 0.28429228564964937, "grad_norm": 0.24147918067444937, "learning_rate": 4.1605785632617395e-07, "loss": 1.5234, "step": 1885 }, { "epoch": 0.2844431038383229, "grad_norm": 0.2558554466858706, "learning_rate": 4.1597477871261894e-07, "loss": 1.5412, "step": 1886 }, { "epoch": 0.28459392202699646, "grad_norm": 0.24699696333450885, "learning_rate": 4.1589166944612057e-07, "loss": 1.5232, "step": 1887 }, { "epoch": 0.28474474021567003, "grad_norm": 0.253191918475815, "learning_rate": 4.1580852854533916e-07, "loss": 1.5336, "step": 1888 }, { "epoch": 0.28489555840434355, "grad_norm": 0.26486007911434506, "learning_rate": 4.157253560289423e-07, "loss": 1.4881, "step": 1889 }, { "epoch": 0.2850463765930171, "grad_norm": 0.24381429165526342, "learning_rate": 4.156421519156046e-07, "loss": 1.5644, "step": 1890 }, { "epoch": 0.2851971947816907, "grad_norm": 0.24278796646850606, "learning_rate": 4.155589162240078e-07, "loss": 1.5579, "step": 1891 }, { "epoch": 0.2853480129703642, "grad_norm": 0.27047678681380877, "learning_rate": 4.154756489728409e-07, "loss": 1.5339, "step": 1892 }, { "epoch": 0.2854988311590378, "grad_norm": 0.32759906068096833, "learning_rate": 4.153923501807996e-07, "loss": 1.6226, "step": 1893 }, { "epoch": 0.2856496493477113, "grad_norm": 0.850154673090816, "learning_rate": 4.1530901986658704e-07, "loss": 1.5314, "step": 1894 }, { "epoch": 0.2858004675363849, "grad_norm": 0.2445143331468664, "learning_rate": 4.1522565804891326e-07, "loss": 1.5958, "step": 1895 }, { "epoch": 0.28595128572505846, "grad_norm": 0.25907546127233794, "learning_rate": 4.1514226474649543e-07, "loss": 1.643, "step": 1896 }, { "epoch": 0.286102103913732, "grad_norm": 0.24135999646434758, "learning_rate": 4.150588399780578e-07, "loss": 1.5438, "step": 1897 }, { "epoch": 0.28625292210240555, "grad_norm": 0.247240957153842, "learning_rate": 4.1497538376233167e-07, "loss": 1.5548, "step": 1898 }, { "epoch": 0.2864037402910791, "grad_norm": 0.2702835083554822, "learning_rate": 4.1489189611805524e-07, "loss": 1.6004, "step": 1899 }, { "epoch": 0.28655455847975264, "grad_norm": 0.2459064321844834, "learning_rate": 4.148083770639741e-07, "loss": 1.6031, "step": 1900 }, { "epoch": 0.2867053766684262, "grad_norm": 0.3773518913970108, "learning_rate": 4.147248266188406e-07, "loss": 1.5511, "step": 1901 }, { "epoch": 0.2868561948570998, "grad_norm": 0.2619832946401977, "learning_rate": 4.146412448014143e-07, "loss": 1.5707, "step": 1902 }, { "epoch": 0.2870070130457733, "grad_norm": 0.25795252598404206, "learning_rate": 4.145576316304618e-07, "loss": 1.5677, "step": 1903 }, { "epoch": 0.2871578312344469, "grad_norm": 0.2598821459333334, "learning_rate": 4.1447398712475656e-07, "loss": 1.6072, "step": 1904 }, { "epoch": 0.28730864942312045, "grad_norm": 0.31528686521750887, "learning_rate": 4.1439031130307925e-07, "loss": 1.5934, "step": 1905 }, { "epoch": 0.287459467611794, "grad_norm": 0.33197145914876147, "learning_rate": 4.143066041842176e-07, "loss": 1.615, "step": 1906 }, { "epoch": 0.28761028580046755, "grad_norm": 0.2571601768464355, "learning_rate": 4.142228657869663e-07, "loss": 1.5988, "step": 1907 }, { "epoch": 0.28776110398914106, "grad_norm": 0.24326782318283038, "learning_rate": 4.14139096130127e-07, "loss": 1.5164, "step": 1908 }, { "epoch": 0.28791192217781464, "grad_norm": 0.25010534683464847, "learning_rate": 4.140552952325084e-07, "loss": 1.585, "step": 1909 }, { "epoch": 0.2880627403664882, "grad_norm": 0.2484450481343041, "learning_rate": 4.139714631129263e-07, "loss": 1.5448, "step": 1910 }, { "epoch": 0.28821355855516173, "grad_norm": 0.26683794510788705, "learning_rate": 4.1388759979020337e-07, "loss": 1.6287, "step": 1911 }, { "epoch": 0.2883643767438353, "grad_norm": 0.2549367342857066, "learning_rate": 4.1380370528316947e-07, "loss": 1.655, "step": 1912 }, { "epoch": 0.2885151949325089, "grad_norm": 0.2628214303418074, "learning_rate": 4.137197796106614e-07, "loss": 1.5859, "step": 1913 }, { "epoch": 0.2886660131211824, "grad_norm": 0.26071551769767043, "learning_rate": 4.1363582279152283e-07, "loss": 1.5769, "step": 1914 }, { "epoch": 0.28881683130985597, "grad_norm": 0.26176747168082637, "learning_rate": 4.1355183484460443e-07, "loss": 1.5321, "step": 1915 }, { "epoch": 0.28896764949852954, "grad_norm": 0.24734604568247337, "learning_rate": 4.134678157887641e-07, "loss": 1.5853, "step": 1916 }, { "epoch": 0.28911846768720306, "grad_norm": 0.259978618658322, "learning_rate": 4.1338376564286647e-07, "loss": 1.5809, "step": 1917 }, { "epoch": 0.28926928587587664, "grad_norm": 0.25786027074792967, "learning_rate": 4.132996844257833e-07, "loss": 1.5367, "step": 1918 }, { "epoch": 0.2894201040645502, "grad_norm": 0.2571241034906806, "learning_rate": 4.1321557215639317e-07, "loss": 1.5649, "step": 1919 }, { "epoch": 0.28957092225322373, "grad_norm": 0.2507150683383269, "learning_rate": 4.131314288535819e-07, "loss": 1.5159, "step": 1920 }, { "epoch": 0.2897217404418973, "grad_norm": 0.27552243362352963, "learning_rate": 4.1304725453624195e-07, "loss": 1.6037, "step": 1921 }, { "epoch": 0.2898725586305708, "grad_norm": 0.24515585991702715, "learning_rate": 4.12963049223273e-07, "loss": 1.5499, "step": 1922 }, { "epoch": 0.2900233768192444, "grad_norm": 0.26782050776616445, "learning_rate": 4.128788129335816e-07, "loss": 1.5409, "step": 1923 }, { "epoch": 0.29017419500791797, "grad_norm": 0.25978871302610407, "learning_rate": 4.1279454568608123e-07, "loss": 1.5393, "step": 1924 }, { "epoch": 0.2903250131965915, "grad_norm": 0.25351359568716264, "learning_rate": 4.127102474996923e-07, "loss": 1.5486, "step": 1925 }, { "epoch": 0.29047583138526506, "grad_norm": 0.24293422687620772, "learning_rate": 4.126259183933423e-07, "loss": 1.5937, "step": 1926 }, { "epoch": 0.29062664957393863, "grad_norm": 0.245471616869013, "learning_rate": 4.1254155838596544e-07, "loss": 1.5299, "step": 1927 }, { "epoch": 0.29077746776261215, "grad_norm": 0.25437820552331764, "learning_rate": 4.1245716749650305e-07, "loss": 1.5279, "step": 1928 }, { "epoch": 0.2909282859512857, "grad_norm": 0.2718417064207998, "learning_rate": 4.123727457439034e-07, "loss": 1.5289, "step": 1929 }, { "epoch": 0.2910791041399593, "grad_norm": 0.2415724984252487, "learning_rate": 4.122882931471216e-07, "loss": 1.563, "step": 1930 }, { "epoch": 0.2912299223286328, "grad_norm": 0.2927444262782482, "learning_rate": 4.122038097251197e-07, "loss": 1.5284, "step": 1931 }, { "epoch": 0.2913807405173064, "grad_norm": 0.27657048401816287, "learning_rate": 4.121192954968667e-07, "loss": 1.5759, "step": 1932 }, { "epoch": 0.29153155870597997, "grad_norm": 0.2630675188418442, "learning_rate": 4.1203475048133853e-07, "loss": 1.5694, "step": 1933 }, { "epoch": 0.2916823768946535, "grad_norm": 0.25207784287561097, "learning_rate": 4.119501746975179e-07, "loss": 1.5946, "step": 1934 }, { "epoch": 0.29183319508332706, "grad_norm": 0.29143907484127923, "learning_rate": 4.1186556816439476e-07, "loss": 1.5423, "step": 1935 }, { "epoch": 0.29198401327200063, "grad_norm": 0.25399531184844054, "learning_rate": 4.117809309009655e-07, "loss": 1.5576, "step": 1936 }, { "epoch": 0.29213483146067415, "grad_norm": 0.24292551587894717, "learning_rate": 4.116962629262338e-07, "loss": 1.5561, "step": 1937 }, { "epoch": 0.2922856496493477, "grad_norm": 0.26846283060694975, "learning_rate": 4.1161156425921e-07, "loss": 1.536, "step": 1938 }, { "epoch": 0.29243646783802124, "grad_norm": 0.25830246639286103, "learning_rate": 4.1152683491891146e-07, "loss": 1.5553, "step": 1939 }, { "epoch": 0.2925872860266948, "grad_norm": 0.4761048260200553, "learning_rate": 4.114420749243624e-07, "loss": 1.5477, "step": 1940 }, { "epoch": 0.2927381042153684, "grad_norm": 0.2632060076635442, "learning_rate": 4.1135728429459387e-07, "loss": 1.5913, "step": 1941 }, { "epoch": 0.2928889224040419, "grad_norm": 0.2607154117850426, "learning_rate": 4.112724630486439e-07, "loss": 1.5463, "step": 1942 }, { "epoch": 0.2930397405927155, "grad_norm": 0.24111840360689804, "learning_rate": 4.1118761120555725e-07, "loss": 1.5928, "step": 1943 }, { "epoch": 0.29319055878138905, "grad_norm": 0.2416204003895274, "learning_rate": 4.1110272878438567e-07, "loss": 1.5688, "step": 1944 }, { "epoch": 0.2933413769700626, "grad_norm": 0.27524436757956083, "learning_rate": 4.1101781580418764e-07, "loss": 1.5121, "step": 1945 }, { "epoch": 0.29349219515873615, "grad_norm": 0.24805067548995707, "learning_rate": 4.1093287228402886e-07, "loss": 1.5406, "step": 1946 }, { "epoch": 0.2936430133474097, "grad_norm": 0.2575731295687873, "learning_rate": 4.1084789824298125e-07, "loss": 1.5712, "step": 1947 }, { "epoch": 0.29379383153608324, "grad_norm": 0.6450012859290429, "learning_rate": 4.107628937001243e-07, "loss": 1.5549, "step": 1948 }, { "epoch": 0.2939446497247568, "grad_norm": 0.2611294321594528, "learning_rate": 4.106778586745438e-07, "loss": 1.6068, "step": 1949 }, { "epoch": 0.2940954679134304, "grad_norm": 0.47404066070592726, "learning_rate": 4.105927931853327e-07, "loss": 1.5262, "step": 1950 }, { "epoch": 0.2942462861021039, "grad_norm": 0.26325757681741707, "learning_rate": 4.1050769725159054e-07, "loss": 1.5558, "step": 1951 }, { "epoch": 0.2943971042907775, "grad_norm": 0.26210215015949584, "learning_rate": 4.10422570892424e-07, "loss": 1.5707, "step": 1952 }, { "epoch": 0.294547922479451, "grad_norm": 0.23889687523013875, "learning_rate": 4.1033741412694636e-07, "loss": 1.5696, "step": 1953 }, { "epoch": 0.29469874066812457, "grad_norm": 0.2864911607553577, "learning_rate": 4.1025222697427777e-07, "loss": 1.5221, "step": 1954 }, { "epoch": 0.29484955885679814, "grad_norm": 0.2507908685608037, "learning_rate": 4.1016700945354525e-07, "loss": 1.4778, "step": 1955 }, { "epoch": 0.29500037704547166, "grad_norm": 0.2679153163035175, "learning_rate": 4.100817615838826e-07, "loss": 1.5648, "step": 1956 }, { "epoch": 0.29515119523414524, "grad_norm": 0.24669025937295905, "learning_rate": 4.0999648338443045e-07, "loss": 1.5723, "step": 1957 }, { "epoch": 0.2953020134228188, "grad_norm": 0.26473189279096443, "learning_rate": 4.0991117487433623e-07, "loss": 1.5536, "step": 1958 }, { "epoch": 0.29545283161149233, "grad_norm": 0.2487310850412394, "learning_rate": 4.098258360727543e-07, "loss": 1.5793, "step": 1959 }, { "epoch": 0.2956036498001659, "grad_norm": 0.2587133676238718, "learning_rate": 4.097404669988455e-07, "loss": 1.4535, "step": 1960 }, { "epoch": 0.2957544679888395, "grad_norm": 0.2536359469005928, "learning_rate": 4.0965506767177784e-07, "loss": 1.5054, "step": 1961 }, { "epoch": 0.295905286177513, "grad_norm": 0.262082928095278, "learning_rate": 4.0956963811072583e-07, "loss": 1.598, "step": 1962 }, { "epoch": 0.29605610436618657, "grad_norm": 0.25440730089743857, "learning_rate": 4.0948417833487103e-07, "loss": 1.5339, "step": 1963 }, { "epoch": 0.29620692255486014, "grad_norm": 0.25822983100141933, "learning_rate": 4.093986883634016e-07, "loss": 1.6163, "step": 1964 }, { "epoch": 0.29635774074353366, "grad_norm": 0.24813816449687914, "learning_rate": 4.093131682155125e-07, "loss": 1.5453, "step": 1965 }, { "epoch": 0.29650855893220723, "grad_norm": 0.2548251273125417, "learning_rate": 4.092276179104055e-07, "loss": 1.5398, "step": 1966 }, { "epoch": 0.29665937712088075, "grad_norm": 0.3985691681710239, "learning_rate": 4.09142037467289e-07, "loss": 1.5861, "step": 1967 }, { "epoch": 0.2968101953095543, "grad_norm": 0.26433297032732417, "learning_rate": 4.090564269053787e-07, "loss": 1.5728, "step": 1968 }, { "epoch": 0.2969610134982279, "grad_norm": 0.23422264251292965, "learning_rate": 4.089707862438962e-07, "loss": 1.6059, "step": 1969 }, { "epoch": 0.2971118316869014, "grad_norm": 0.27084535372529556, "learning_rate": 4.088851155020706e-07, "loss": 1.6073, "step": 1970 }, { "epoch": 0.297262649875575, "grad_norm": 0.2821936345624914, "learning_rate": 4.087994146991374e-07, "loss": 1.518, "step": 1971 }, { "epoch": 0.29741346806424857, "grad_norm": 0.26212810945747056, "learning_rate": 4.087136838543389e-07, "loss": 1.5569, "step": 1972 }, { "epoch": 0.2975642862529221, "grad_norm": 0.2447975691364571, "learning_rate": 4.086279229869242e-07, "loss": 1.5593, "step": 1973 }, { "epoch": 0.29771510444159566, "grad_norm": 0.2419722057739497, "learning_rate": 4.0854213211614916e-07, "loss": 1.5567, "step": 1974 }, { "epoch": 0.29786592263026923, "grad_norm": 0.24653505228114833, "learning_rate": 4.084563112612763e-07, "loss": 1.5541, "step": 1975 }, { "epoch": 0.29801674081894275, "grad_norm": 0.2689128543871328, "learning_rate": 4.083704604415748e-07, "loss": 1.6205, "step": 1976 }, { "epoch": 0.2981675590076163, "grad_norm": 0.27997129697482137, "learning_rate": 4.0828457967632086e-07, "loss": 1.641, "step": 1977 }, { "epoch": 0.2983183771962899, "grad_norm": 0.25499025032094236, "learning_rate": 4.0819866898479703e-07, "loss": 1.5725, "step": 1978 }, { "epoch": 0.2984691953849634, "grad_norm": 0.5480214467709781, "learning_rate": 4.081127283862929e-07, "loss": 1.5784, "step": 1979 }, { "epoch": 0.298620013573637, "grad_norm": 0.2620739620572641, "learning_rate": 4.0802675790010456e-07, "loss": 1.4975, "step": 1980 }, { "epoch": 0.2987708317623105, "grad_norm": 0.25424429682397565, "learning_rate": 4.0794075754553483e-07, "loss": 1.5506, "step": 1981 }, { "epoch": 0.2989216499509841, "grad_norm": 0.5682722291288377, "learning_rate": 4.0785472734189345e-07, "loss": 1.548, "step": 1982 }, { "epoch": 0.29907246813965765, "grad_norm": 0.37582259605338, "learning_rate": 4.077686673084966e-07, "loss": 1.5605, "step": 1983 }, { "epoch": 0.2992232863283312, "grad_norm": 0.34117334933146093, "learning_rate": 4.076825774646672e-07, "loss": 1.5692, "step": 1984 }, { "epoch": 0.29937410451700475, "grad_norm": 0.24169476981631696, "learning_rate": 4.075964578297352e-07, "loss": 1.5554, "step": 1985 }, { "epoch": 0.2995249227056783, "grad_norm": 0.3749710955356763, "learning_rate": 4.0751030842303656e-07, "loss": 1.5426, "step": 1986 }, { "epoch": 0.29967574089435184, "grad_norm": 0.4484413453699465, "learning_rate": 4.0742412926391474e-07, "loss": 1.5726, "step": 1987 }, { "epoch": 0.2998265590830254, "grad_norm": 0.24642273347537877, "learning_rate": 4.0733792037171916e-07, "loss": 1.5898, "step": 1988 }, { "epoch": 0.299977377271699, "grad_norm": 0.29618837310677704, "learning_rate": 4.072516817658064e-07, "loss": 1.6294, "step": 1989 }, { "epoch": 0.3001281954603725, "grad_norm": 0.25474726896157884, "learning_rate": 4.071654134655395e-07, "loss": 1.5493, "step": 1990 }, { "epoch": 0.3002790136490461, "grad_norm": 0.7666324706571088, "learning_rate": 4.070791154902882e-07, "loss": 1.5685, "step": 1991 }, { "epoch": 0.30042983183771965, "grad_norm": 0.23909485923559612, "learning_rate": 4.069927878594289e-07, "loss": 1.5954, "step": 1992 }, { "epoch": 0.30058065002639317, "grad_norm": 0.25181643764237355, "learning_rate": 4.069064305923447e-07, "loss": 1.5721, "step": 1993 }, { "epoch": 0.30073146821506674, "grad_norm": 0.24995411296454376, "learning_rate": 4.068200437084252e-07, "loss": 1.5806, "step": 1994 }, { "epoch": 0.3008822864037403, "grad_norm": 0.2492558803326442, "learning_rate": 4.0673362722706695e-07, "loss": 1.6338, "step": 1995 }, { "epoch": 0.30103310459241384, "grad_norm": 0.24359118017630724, "learning_rate": 4.0664718116767287e-07, "loss": 1.6746, "step": 1996 }, { "epoch": 0.3011839227810874, "grad_norm": 0.25822056709052726, "learning_rate": 4.0656070554965257e-07, "loss": 1.5229, "step": 1997 }, { "epoch": 0.30133474096976093, "grad_norm": 0.25630204248834293, "learning_rate": 4.064742003924224e-07, "loss": 1.5693, "step": 1998 }, { "epoch": 0.3014855591584345, "grad_norm": 0.2815196438644947, "learning_rate": 4.063876657154054e-07, "loss": 1.5625, "step": 1999 }, { "epoch": 0.3016363773471081, "grad_norm": 0.24786544945527367, "learning_rate": 4.0630110153803094e-07, "loss": 1.5591, "step": 2000 }, { "epoch": 0.3017871955357816, "grad_norm": 0.24701583438940208, "learning_rate": 4.062145078797353e-07, "loss": 1.549, "step": 2001 }, { "epoch": 0.30193801372445517, "grad_norm": 0.25457518078584596, "learning_rate": 4.061278847599612e-07, "loss": 1.6315, "step": 2002 }, { "epoch": 0.30208883191312874, "grad_norm": 0.2360049490832213, "learning_rate": 4.060412321981582e-07, "loss": 1.5689, "step": 2003 }, { "epoch": 0.30223965010180226, "grad_norm": 0.2538901784411201, "learning_rate": 4.059545502137821e-07, "loss": 1.6141, "step": 2004 }, { "epoch": 0.30239046829047583, "grad_norm": 0.2611059042662772, "learning_rate": 4.0586783882629574e-07, "loss": 1.4949, "step": 2005 }, { "epoch": 0.3025412864791494, "grad_norm": 0.24314716429863004, "learning_rate": 4.0578109805516825e-07, "loss": 1.5134, "step": 2006 }, { "epoch": 0.3026921046678229, "grad_norm": 0.2744908505423212, "learning_rate": 4.0569432791987545e-07, "loss": 1.5875, "step": 2007 }, { "epoch": 0.3028429228564965, "grad_norm": 0.265288441657907, "learning_rate": 4.056075284398998e-07, "loss": 1.544, "step": 2008 }, { "epoch": 0.3029937410451701, "grad_norm": 0.24689543993443636, "learning_rate": 4.055206996347304e-07, "loss": 1.563, "step": 2009 }, { "epoch": 0.3031445592338436, "grad_norm": 0.2846226929353735, "learning_rate": 4.054338415238627e-07, "loss": 1.538, "step": 2010 }, { "epoch": 0.30329537742251717, "grad_norm": 0.25529054190064804, "learning_rate": 4.0534695412679885e-07, "loss": 1.5938, "step": 2011 }, { "epoch": 0.3034461956111907, "grad_norm": 0.26141624179768436, "learning_rate": 4.0526003746304776e-07, "loss": 1.6299, "step": 2012 }, { "epoch": 0.30359701379986426, "grad_norm": 0.2509275510566078, "learning_rate": 4.051730915521246e-07, "loss": 1.5485, "step": 2013 }, { "epoch": 0.30374783198853783, "grad_norm": 0.25921193285116084, "learning_rate": 4.0508611641355137e-07, "loss": 1.6706, "step": 2014 }, { "epoch": 0.30389865017721135, "grad_norm": 0.29053096237224685, "learning_rate": 4.049991120668565e-07, "loss": 1.5731, "step": 2015 }, { "epoch": 0.3040494683658849, "grad_norm": 0.2403332440983861, "learning_rate": 4.04912078531575e-07, "loss": 1.5128, "step": 2016 }, { "epoch": 0.3042002865545585, "grad_norm": 0.2429201229927698, "learning_rate": 4.0482501582724824e-07, "loss": 1.5768, "step": 2017 }, { "epoch": 0.304351104743232, "grad_norm": 0.2496444300054758, "learning_rate": 4.047379239734247e-07, "loss": 1.5181, "step": 2018 }, { "epoch": 0.3045019229319056, "grad_norm": 0.2467065446436571, "learning_rate": 4.046508029896588e-07, "loss": 1.6075, "step": 2019 }, { "epoch": 0.30465274112057916, "grad_norm": 0.24456129277039818, "learning_rate": 4.045636528955117e-07, "loss": 1.5415, "step": 2020 }, { "epoch": 0.3048035593092527, "grad_norm": 0.2641929208072259, "learning_rate": 4.0447647371055127e-07, "loss": 1.5642, "step": 2021 }, { "epoch": 0.30495437749792625, "grad_norm": 0.253542202164034, "learning_rate": 4.043892654543517e-07, "loss": 1.535, "step": 2022 }, { "epoch": 0.30510519568659983, "grad_norm": 0.24314338746301464, "learning_rate": 4.0430202814649386e-07, "loss": 1.6212, "step": 2023 }, { "epoch": 0.30525601387527335, "grad_norm": 0.25716390681239226, "learning_rate": 4.0421476180656495e-07, "loss": 1.5737, "step": 2024 }, { "epoch": 0.3054068320639469, "grad_norm": 0.2569826323216223, "learning_rate": 4.041274664541589e-07, "loss": 1.5949, "step": 2025 }, { "epoch": 0.30555765025262044, "grad_norm": 0.29526969958564175, "learning_rate": 4.0404014210887597e-07, "loss": 1.5407, "step": 2026 }, { "epoch": 0.305708468441294, "grad_norm": 0.2548618078832389, "learning_rate": 4.0395278879032315e-07, "loss": 1.5513, "step": 2027 }, { "epoch": 0.3058592866299676, "grad_norm": 0.25121971550161243, "learning_rate": 4.0386540651811363e-07, "loss": 1.5606, "step": 2028 }, { "epoch": 0.3060101048186411, "grad_norm": 0.248833321551354, "learning_rate": 4.037779953118674e-07, "loss": 1.5918, "step": 2029 }, { "epoch": 0.3061609230073147, "grad_norm": 0.2538335656892254, "learning_rate": 4.036905551912108e-07, "loss": 1.556, "step": 2030 }, { "epoch": 0.30631174119598825, "grad_norm": 0.26766284188651057, "learning_rate": 4.0360308617577677e-07, "loss": 1.5752, "step": 2031 }, { "epoch": 0.30646255938466177, "grad_norm": 0.2617566660075584, "learning_rate": 4.035155882852045e-07, "loss": 1.5394, "step": 2032 }, { "epoch": 0.30661337757333534, "grad_norm": 0.2499477755959539, "learning_rate": 4.034280615391398e-07, "loss": 1.5377, "step": 2033 }, { "epoch": 0.3067641957620089, "grad_norm": 0.26571280247917245, "learning_rate": 4.033405059572351e-07, "loss": 1.5705, "step": 2034 }, { "epoch": 0.30691501395068244, "grad_norm": 0.23425416184187495, "learning_rate": 4.0325292155914913e-07, "loss": 1.5085, "step": 2035 }, { "epoch": 0.307065832139356, "grad_norm": 0.24820658850688326, "learning_rate": 4.031653083645472e-07, "loss": 1.4914, "step": 2036 }, { "epoch": 0.3072166503280296, "grad_norm": 0.4945570448028612, "learning_rate": 4.0307766639310083e-07, "loss": 1.6127, "step": 2037 }, { "epoch": 0.3073674685167031, "grad_norm": 0.25618881711795755, "learning_rate": 4.0298999566448837e-07, "loss": 1.5884, "step": 2038 }, { "epoch": 0.3075182867053767, "grad_norm": 0.2516212800103507, "learning_rate": 4.029022961983944e-07, "loss": 1.5638, "step": 2039 }, { "epoch": 0.30766910489405025, "grad_norm": 0.26196457733833556, "learning_rate": 4.0281456801451004e-07, "loss": 1.5156, "step": 2040 }, { "epoch": 0.30781992308272377, "grad_norm": 0.2576699667694645, "learning_rate": 4.0272681113253274e-07, "loss": 1.5367, "step": 2041 }, { "epoch": 0.30797074127139734, "grad_norm": 0.2427529859890157, "learning_rate": 4.0263902557216654e-07, "loss": 1.5214, "step": 2042 }, { "epoch": 0.30812155946007086, "grad_norm": 0.27396337621628536, "learning_rate": 4.0255121135312186e-07, "loss": 1.6634, "step": 2043 }, { "epoch": 0.30827237764874443, "grad_norm": 0.24670362168152069, "learning_rate": 4.0246336849511554e-07, "loss": 1.5854, "step": 2044 }, { "epoch": 0.308423195837418, "grad_norm": 0.2509527913346677, "learning_rate": 4.023754970178708e-07, "loss": 1.5787, "step": 2045 }, { "epoch": 0.3085740140260915, "grad_norm": 0.27565341178868197, "learning_rate": 4.0228759694111745e-07, "loss": 1.574, "step": 2046 }, { "epoch": 0.3087248322147651, "grad_norm": 0.25797778584819087, "learning_rate": 4.0219966828459153e-07, "loss": 1.6037, "step": 2047 }, { "epoch": 0.3088756504034387, "grad_norm": 0.23845818730109467, "learning_rate": 4.0211171106803565e-07, "loss": 1.5155, "step": 2048 }, { "epoch": 0.3090264685921122, "grad_norm": 0.24152957942868847, "learning_rate": 4.0202372531119864e-07, "loss": 1.5317, "step": 2049 }, { "epoch": 0.30917728678078576, "grad_norm": 0.24716342526147034, "learning_rate": 4.0193571103383607e-07, "loss": 1.5569, "step": 2050 }, { "epoch": 0.30932810496945934, "grad_norm": 0.2939391755532749, "learning_rate": 4.0184766825570953e-07, "loss": 1.626, "step": 2051 }, { "epoch": 0.30947892315813286, "grad_norm": 0.23417825092694933, "learning_rate": 4.0175959699658724e-07, "loss": 1.5612, "step": 2052 }, { "epoch": 0.30962974134680643, "grad_norm": 0.2490779999885615, "learning_rate": 4.016714972762438e-07, "loss": 1.5613, "step": 2053 }, { "epoch": 0.30978055953548, "grad_norm": 0.2748134255196443, "learning_rate": 4.015833691144602e-07, "loss": 1.5536, "step": 2054 }, { "epoch": 0.3099313777241535, "grad_norm": 0.2582447556287364, "learning_rate": 4.0149521253102363e-07, "loss": 1.5974, "step": 2055 }, { "epoch": 0.3100821959128271, "grad_norm": 0.2458853398411171, "learning_rate": 4.01407027545728e-07, "loss": 1.5975, "step": 2056 }, { "epoch": 0.3102330141015006, "grad_norm": 0.243255176864826, "learning_rate": 4.013188141783732e-07, "loss": 1.598, "step": 2057 }, { "epoch": 0.3103838322901742, "grad_norm": 0.2612334752736981, "learning_rate": 4.0123057244876585e-07, "loss": 1.5072, "step": 2058 }, { "epoch": 0.31053465047884776, "grad_norm": 0.2533063131475423, "learning_rate": 4.0114230237671874e-07, "loss": 1.5773, "step": 2059 }, { "epoch": 0.3106854686675213, "grad_norm": 0.24648584229395598, "learning_rate": 4.010540039820511e-07, "loss": 1.5581, "step": 2060 }, { "epoch": 0.31083628685619485, "grad_norm": 0.24392715663996672, "learning_rate": 4.009656772845884e-07, "loss": 1.6086, "step": 2061 }, { "epoch": 0.31098710504486843, "grad_norm": 0.24558646280686036, "learning_rate": 4.008773223041627e-07, "loss": 1.6147, "step": 2062 }, { "epoch": 0.31113792323354195, "grad_norm": 0.5569634572330247, "learning_rate": 4.007889390606122e-07, "loss": 1.5862, "step": 2063 }, { "epoch": 0.3112887414222155, "grad_norm": 0.2558174149083045, "learning_rate": 4.007005275737814e-07, "loss": 1.6275, "step": 2064 }, { "epoch": 0.3114395596108891, "grad_norm": 0.2481834056158772, "learning_rate": 4.006120878635214e-07, "loss": 1.5096, "step": 2065 }, { "epoch": 0.3115903777995626, "grad_norm": 0.38793992659885385, "learning_rate": 4.0052361994968946e-07, "loss": 1.6465, "step": 2066 }, { "epoch": 0.3117411959882362, "grad_norm": 0.2628176803158427, "learning_rate": 4.0043512385214923e-07, "loss": 1.5849, "step": 2067 }, { "epoch": 0.31189201417690976, "grad_norm": 0.4098456469378603, "learning_rate": 4.003465995907706e-07, "loss": 1.549, "step": 2068 }, { "epoch": 0.3120428323655833, "grad_norm": 0.5901459740996087, "learning_rate": 4.0025804718542983e-07, "loss": 1.6553, "step": 2069 }, { "epoch": 0.31219365055425685, "grad_norm": 0.27798836334717175, "learning_rate": 4.001694666560096e-07, "loss": 1.608, "step": 2070 }, { "epoch": 0.31234446874293037, "grad_norm": 0.25224554357567286, "learning_rate": 4.0008085802239874e-07, "loss": 1.611, "step": 2071 }, { "epoch": 0.31249528693160394, "grad_norm": 0.25170188689624073, "learning_rate": 3.9999222130449244e-07, "loss": 1.6008, "step": 2072 }, { "epoch": 0.3126461051202775, "grad_norm": 0.31290105462564416, "learning_rate": 3.999035565221924e-07, "loss": 1.7323, "step": 2073 }, { "epoch": 0.31279692330895104, "grad_norm": 0.3964292109873429, "learning_rate": 3.9981486369540626e-07, "loss": 1.6061, "step": 2074 }, { "epoch": 0.3129477414976246, "grad_norm": 0.24857853557946208, "learning_rate": 3.9972614284404814e-07, "loss": 1.5927, "step": 2075 }, { "epoch": 0.3130985596862982, "grad_norm": 0.3851454200852006, "learning_rate": 3.9963739398803864e-07, "loss": 1.5901, "step": 2076 }, { "epoch": 0.3132493778749717, "grad_norm": 0.2628770261936668, "learning_rate": 3.9954861714730434e-07, "loss": 1.6159, "step": 2077 }, { "epoch": 0.3134001960636453, "grad_norm": 0.2677944478927853, "learning_rate": 3.994598123417781e-07, "loss": 1.5402, "step": 2078 }, { "epoch": 0.31355101425231885, "grad_norm": 0.27617794676009344, "learning_rate": 3.993709795913995e-07, "loss": 1.6657, "step": 2079 }, { "epoch": 0.31370183244099237, "grad_norm": 0.2456539890488146, "learning_rate": 3.992821189161138e-07, "loss": 1.6086, "step": 2080 }, { "epoch": 0.31385265062966594, "grad_norm": 0.24892899783486858, "learning_rate": 3.9919323033587295e-07, "loss": 1.5301, "step": 2081 }, { "epoch": 0.3140034688183395, "grad_norm": 0.2393655906478922, "learning_rate": 3.9910431387063493e-07, "loss": 1.5066, "step": 2082 }, { "epoch": 0.31415428700701303, "grad_norm": 0.2498382602800907, "learning_rate": 3.9901536954036416e-07, "loss": 1.5667, "step": 2083 }, { "epoch": 0.3143051051956866, "grad_norm": 0.2493389529791471, "learning_rate": 3.989263973650312e-07, "loss": 1.6019, "step": 2084 }, { "epoch": 0.3144559233843602, "grad_norm": 0.24762083529534487, "learning_rate": 3.988373973646129e-07, "loss": 1.5889, "step": 2085 }, { "epoch": 0.3146067415730337, "grad_norm": 0.6543181533090748, "learning_rate": 3.9874836955909236e-07, "loss": 1.5563, "step": 2086 }, { "epoch": 0.3147575597617073, "grad_norm": 0.2502822106752469, "learning_rate": 3.9865931396845887e-07, "loss": 1.5951, "step": 2087 }, { "epoch": 0.3149083779503808, "grad_norm": 0.27690010796350406, "learning_rate": 3.9857023061270805e-07, "loss": 1.5315, "step": 2088 }, { "epoch": 0.31505919613905436, "grad_norm": 0.24290586659146246, "learning_rate": 3.984811195118417e-07, "loss": 1.5492, "step": 2089 }, { "epoch": 0.31521001432772794, "grad_norm": 0.24725428291645074, "learning_rate": 3.983919806858678e-07, "loss": 1.5565, "step": 2090 }, { "epoch": 0.31536083251640146, "grad_norm": 0.25750934092248584, "learning_rate": 3.983028141548008e-07, "loss": 1.5927, "step": 2091 }, { "epoch": 0.31551165070507503, "grad_norm": 0.29618494596929745, "learning_rate": 3.982136199386609e-07, "loss": 1.4966, "step": 2092 }, { "epoch": 0.3156624688937486, "grad_norm": 0.2486932990700529, "learning_rate": 3.98124398057475e-07, "loss": 1.6357, "step": 2093 }, { "epoch": 0.3158132870824221, "grad_norm": 0.2533954281924328, "learning_rate": 3.9803514853127596e-07, "loss": 1.5909, "step": 2094 }, { "epoch": 0.3159641052710957, "grad_norm": 0.2589368625928245, "learning_rate": 3.9794587138010296e-07, "loss": 1.531, "step": 2095 }, { "epoch": 0.31611492345976927, "grad_norm": 0.2542643224228135, "learning_rate": 3.978565666240011e-07, "loss": 1.6488, "step": 2096 }, { "epoch": 0.3162657416484428, "grad_norm": 0.244359072570323, "learning_rate": 3.9776723428302216e-07, "loss": 1.5402, "step": 2097 }, { "epoch": 0.31641655983711636, "grad_norm": 0.25460464026757734, "learning_rate": 3.976778743772237e-07, "loss": 1.5894, "step": 2098 }, { "epoch": 0.31656737802578994, "grad_norm": 0.2868104823843204, "learning_rate": 3.975884869266698e-07, "loss": 1.5637, "step": 2099 }, { "epoch": 0.31671819621446345, "grad_norm": 0.31017877646583497, "learning_rate": 3.974990719514302e-07, "loss": 1.5839, "step": 2100 }, { "epoch": 0.31686901440313703, "grad_norm": 0.26396866795190366, "learning_rate": 3.974096294715816e-07, "loss": 1.5178, "step": 2101 }, { "epoch": 0.31701983259181055, "grad_norm": 0.30168437991242275, "learning_rate": 3.973201595072061e-07, "loss": 1.6485, "step": 2102 }, { "epoch": 0.3171706507804841, "grad_norm": 0.24033094292592433, "learning_rate": 3.972306620783925e-07, "loss": 1.5973, "step": 2103 }, { "epoch": 0.3173214689691577, "grad_norm": 0.24507913145198376, "learning_rate": 3.971411372052354e-07, "loss": 1.5712, "step": 2104 }, { "epoch": 0.3174722871578312, "grad_norm": 0.23871265245180445, "learning_rate": 3.9705158490783594e-07, "loss": 1.5227, "step": 2105 }, { "epoch": 0.3176231053465048, "grad_norm": 0.3965284184062262, "learning_rate": 3.969620052063011e-07, "loss": 1.6573, "step": 2106 }, { "epoch": 0.31777392353517836, "grad_norm": 0.25745473428479865, "learning_rate": 3.9687239812074417e-07, "loss": 1.6004, "step": 2107 }, { "epoch": 0.3179247417238519, "grad_norm": 0.2983580390523171, "learning_rate": 3.9678276367128457e-07, "loss": 1.633, "step": 2108 }, { "epoch": 0.31807555991252545, "grad_norm": 0.25214947570308643, "learning_rate": 3.9669310187804776e-07, "loss": 1.5447, "step": 2109 }, { "epoch": 0.318226378101199, "grad_norm": 0.7384214341008223, "learning_rate": 3.9660341276116554e-07, "loss": 1.4902, "step": 2110 }, { "epoch": 0.31837719628987254, "grad_norm": 0.25732836197018455, "learning_rate": 3.9651369634077566e-07, "loss": 1.6187, "step": 2111 }, { "epoch": 0.3185280144785461, "grad_norm": 0.24660030206232522, "learning_rate": 3.9642395263702213e-07, "loss": 1.5436, "step": 2112 }, { "epoch": 0.3186788326672197, "grad_norm": 0.24938974250593018, "learning_rate": 3.9633418167005486e-07, "loss": 1.5876, "step": 2113 }, { "epoch": 0.3188296508558932, "grad_norm": 0.32231112612116103, "learning_rate": 3.9624438346003023e-07, "loss": 1.5242, "step": 2114 }, { "epoch": 0.3189804690445668, "grad_norm": 0.25084366668647723, "learning_rate": 3.961545580271106e-07, "loss": 1.5895, "step": 2115 }, { "epoch": 0.3191312872332403, "grad_norm": 0.23999056686624912, "learning_rate": 3.960647053914641e-07, "loss": 1.5713, "step": 2116 }, { "epoch": 0.3192821054219139, "grad_norm": 0.23925194136729683, "learning_rate": 3.959748255732655e-07, "loss": 1.5073, "step": 2117 }, { "epoch": 0.31943292361058745, "grad_norm": 0.27256469617442297, "learning_rate": 3.9588491859269534e-07, "loss": 1.5992, "step": 2118 }, { "epoch": 0.31958374179926097, "grad_norm": 0.23824644344923226, "learning_rate": 3.9579498446994055e-07, "loss": 1.5236, "step": 2119 }, { "epoch": 0.31973455998793454, "grad_norm": 0.2652599022290534, "learning_rate": 3.9570502322519365e-07, "loss": 1.5538, "step": 2120 }, { "epoch": 0.3198853781766081, "grad_norm": 0.24330049060046072, "learning_rate": 3.956150348786538e-07, "loss": 1.6044, "step": 2121 }, { "epoch": 0.32003619636528163, "grad_norm": 0.2738743828606264, "learning_rate": 3.955250194505259e-07, "loss": 1.6004, "step": 2122 }, { "epoch": 0.3201870145539552, "grad_norm": 0.2574178597933717, "learning_rate": 3.9543497696102107e-07, "loss": 1.5936, "step": 2123 }, { "epoch": 0.3203378327426288, "grad_norm": 0.2950917159578343, "learning_rate": 3.9534490743035643e-07, "loss": 1.5641, "step": 2124 }, { "epoch": 0.3204886509313023, "grad_norm": 0.4756485614317174, "learning_rate": 3.952548108787552e-07, "loss": 1.5103, "step": 2125 }, { "epoch": 0.3206394691199759, "grad_norm": 0.2615377016755134, "learning_rate": 3.951646873264468e-07, "loss": 1.6246, "step": 2126 }, { "epoch": 0.32079028730864945, "grad_norm": 0.34577547477197323, "learning_rate": 3.950745367936665e-07, "loss": 1.5757, "step": 2127 }, { "epoch": 0.32094110549732296, "grad_norm": 0.254203670031749, "learning_rate": 3.9498435930065566e-07, "loss": 1.5617, "step": 2128 }, { "epoch": 0.32109192368599654, "grad_norm": 0.24546737378883726, "learning_rate": 3.94894154867662e-07, "loss": 1.5633, "step": 2129 }, { "epoch": 0.3212427418746701, "grad_norm": 0.2515906858162734, "learning_rate": 3.948039235149387e-07, "loss": 1.5815, "step": 2130 }, { "epoch": 0.32139356006334363, "grad_norm": 0.24451920390423948, "learning_rate": 3.947136652627455e-07, "loss": 1.6121, "step": 2131 }, { "epoch": 0.3215443782520172, "grad_norm": 0.27481738304353953, "learning_rate": 3.9462338013134813e-07, "loss": 1.5426, "step": 2132 }, { "epoch": 0.3216951964406907, "grad_norm": 0.2965333454714891, "learning_rate": 3.9453306814101795e-07, "loss": 1.6291, "step": 2133 }, { "epoch": 0.3218460146293643, "grad_norm": 0.24792020164578948, "learning_rate": 3.9444272931203294e-07, "loss": 1.499, "step": 2134 }, { "epoch": 0.32199683281803787, "grad_norm": 0.24446936022136556, "learning_rate": 3.943523636646765e-07, "loss": 1.6617, "step": 2135 }, { "epoch": 0.3221476510067114, "grad_norm": 0.24328300281267154, "learning_rate": 3.9426197121923853e-07, "loss": 1.5624, "step": 2136 }, { "epoch": 0.32229846919538496, "grad_norm": 0.26383286243955617, "learning_rate": 3.941715519960148e-07, "loss": 1.6099, "step": 2137 }, { "epoch": 0.32244928738405854, "grad_norm": 0.26621979937034657, "learning_rate": 3.94081106015307e-07, "loss": 1.5884, "step": 2138 }, { "epoch": 0.32260010557273205, "grad_norm": 0.24855711191351867, "learning_rate": 3.9399063329742277e-07, "loss": 1.4988, "step": 2139 }, { "epoch": 0.32275092376140563, "grad_norm": 0.26128115604409363, "learning_rate": 3.93900133862676e-07, "loss": 1.6908, "step": 2140 }, { "epoch": 0.3229017419500792, "grad_norm": 0.8190000147879138, "learning_rate": 3.9380960773138643e-07, "loss": 1.5914, "step": 2141 }, { "epoch": 0.3230525601387527, "grad_norm": 0.24931712688641844, "learning_rate": 3.9371905492387987e-07, "loss": 1.5765, "step": 2142 }, { "epoch": 0.3232033783274263, "grad_norm": 0.2604485651872273, "learning_rate": 3.9362847546048796e-07, "loss": 1.5407, "step": 2143 }, { "epoch": 0.32335419651609987, "grad_norm": 0.3841819530609337, "learning_rate": 3.935378693615484e-07, "loss": 1.6656, "step": 2144 }, { "epoch": 0.3235050147047734, "grad_norm": 0.25042361022528925, "learning_rate": 3.93447236647405e-07, "loss": 1.6574, "step": 2145 }, { "epoch": 0.32365583289344696, "grad_norm": 0.2479456279202682, "learning_rate": 3.9335657733840744e-07, "loss": 1.5793, "step": 2146 }, { "epoch": 0.3238066510821205, "grad_norm": 0.26260084267739775, "learning_rate": 3.932658914549113e-07, "loss": 1.6222, "step": 2147 }, { "epoch": 0.32395746927079405, "grad_norm": 0.24831072353040998, "learning_rate": 3.931751790172783e-07, "loss": 1.5327, "step": 2148 }, { "epoch": 0.3241082874594676, "grad_norm": 0.2357508409240409, "learning_rate": 3.930844400458759e-07, "loss": 1.548, "step": 2149 }, { "epoch": 0.32425910564814114, "grad_norm": 0.3313130342092389, "learning_rate": 3.9299367456107787e-07, "loss": 1.5344, "step": 2150 }, { "epoch": 0.3244099238368147, "grad_norm": 0.2655658423317557, "learning_rate": 3.929028825832634e-07, "loss": 1.5548, "step": 2151 }, { "epoch": 0.3245607420254883, "grad_norm": 0.27936045500581236, "learning_rate": 3.928120641328181e-07, "loss": 1.5474, "step": 2152 }, { "epoch": 0.3247115602141618, "grad_norm": 0.2562919828901846, "learning_rate": 3.9272121923013336e-07, "loss": 1.6042, "step": 2153 }, { "epoch": 0.3248623784028354, "grad_norm": 0.24525026196111174, "learning_rate": 3.9263034789560645e-07, "loss": 1.537, "step": 2154 }, { "epoch": 0.32501319659150896, "grad_norm": 0.2601161416687034, "learning_rate": 3.9253945014964066e-07, "loss": 1.6371, "step": 2155 }, { "epoch": 0.3251640147801825, "grad_norm": 0.2594960670160656, "learning_rate": 3.9244852601264525e-07, "loss": 1.5335, "step": 2156 }, { "epoch": 0.32531483296885605, "grad_norm": 0.2625263636856618, "learning_rate": 3.923575755050352e-07, "loss": 1.5754, "step": 2157 }, { "epoch": 0.3254656511575296, "grad_norm": 0.2463256200074851, "learning_rate": 3.922665986472316e-07, "loss": 1.5661, "step": 2158 }, { "epoch": 0.32561646934620314, "grad_norm": 0.2577915088506624, "learning_rate": 3.921755954596615e-07, "loss": 1.5505, "step": 2159 }, { "epoch": 0.3257672875348767, "grad_norm": 0.2509018610982779, "learning_rate": 3.920845659627577e-07, "loss": 1.6161, "step": 2160 }, { "epoch": 0.32591810572355023, "grad_norm": 0.25334757643146116, "learning_rate": 3.9199351017695887e-07, "loss": 1.551, "step": 2161 }, { "epoch": 0.3260689239122238, "grad_norm": 0.27512796587292415, "learning_rate": 3.9190242812270985e-07, "loss": 1.6038, "step": 2162 }, { "epoch": 0.3262197421008974, "grad_norm": 0.28985312260866014, "learning_rate": 3.918113198204611e-07, "loss": 1.5417, "step": 2163 }, { "epoch": 0.3263705602895709, "grad_norm": 0.2503779552056232, "learning_rate": 3.917201852906691e-07, "loss": 1.5842, "step": 2164 }, { "epoch": 0.3265213784782445, "grad_norm": 0.24484423110593662, "learning_rate": 3.9162902455379636e-07, "loss": 1.5275, "step": 2165 }, { "epoch": 0.32667219666691805, "grad_norm": 0.2890156902730706, "learning_rate": 3.9153783763031085e-07, "loss": 1.5492, "step": 2166 }, { "epoch": 0.32682301485559156, "grad_norm": 0.24639661617104186, "learning_rate": 3.914466245406869e-07, "loss": 1.5571, "step": 2167 }, { "epoch": 0.32697383304426514, "grad_norm": 0.2450312290995254, "learning_rate": 3.9135538530540447e-07, "loss": 1.623, "step": 2168 }, { "epoch": 0.3271246512329387, "grad_norm": 0.25377142288752186, "learning_rate": 3.9126411994494936e-07, "loss": 1.6188, "step": 2169 }, { "epoch": 0.32727546942161223, "grad_norm": 0.2547601828093652, "learning_rate": 3.911728284798133e-07, "loss": 1.5359, "step": 2170 }, { "epoch": 0.3274262876102858, "grad_norm": 0.25001982542415085, "learning_rate": 3.9108151093049397e-07, "loss": 1.5617, "step": 2171 }, { "epoch": 0.3275771057989594, "grad_norm": 0.25181077387583223, "learning_rate": 3.909901673174948e-07, "loss": 1.5743, "step": 2172 }, { "epoch": 0.3277279239876329, "grad_norm": 0.24853020045973234, "learning_rate": 3.90898797661325e-07, "loss": 1.5056, "step": 2173 }, { "epoch": 0.32787874217630647, "grad_norm": 0.24903623322804977, "learning_rate": 3.9080740198249976e-07, "loss": 1.5015, "step": 2174 }, { "epoch": 0.32802956036498004, "grad_norm": 0.2715084471234611, "learning_rate": 3.9071598030154015e-07, "loss": 1.568, "step": 2175 }, { "epoch": 0.32818037855365356, "grad_norm": 0.24719338424330595, "learning_rate": 3.906245326389729e-07, "loss": 1.4931, "step": 2176 }, { "epoch": 0.32833119674232714, "grad_norm": 0.24057406371883366, "learning_rate": 3.905330590153308e-07, "loss": 1.6049, "step": 2177 }, { "epoch": 0.32848201493100065, "grad_norm": 0.2577269643414046, "learning_rate": 3.904415594511521e-07, "loss": 1.5177, "step": 2178 }, { "epoch": 0.32863283311967423, "grad_norm": 0.2277806229734619, "learning_rate": 3.903500339669814e-07, "loss": 1.5321, "step": 2179 }, { "epoch": 0.3287836513083478, "grad_norm": 0.25168229116355645, "learning_rate": 3.902584825833687e-07, "loss": 1.573, "step": 2180 }, { "epoch": 0.3289344694970213, "grad_norm": 0.25178755697604427, "learning_rate": 3.9016690532086995e-07, "loss": 1.475, "step": 2181 }, { "epoch": 0.3290852876856949, "grad_norm": 0.23538365702618483, "learning_rate": 3.9007530220004686e-07, "loss": 1.5286, "step": 2182 }, { "epoch": 0.32923610587436847, "grad_norm": 0.2477937784114404, "learning_rate": 3.8998367324146716e-07, "loss": 1.6322, "step": 2183 }, { "epoch": 0.329386924063042, "grad_norm": 0.2545925937803815, "learning_rate": 3.8989201846570403e-07, "loss": 1.6231, "step": 2184 }, { "epoch": 0.32953774225171556, "grad_norm": 0.26975812222452605, "learning_rate": 3.8980033789333676e-07, "loss": 1.5295, "step": 2185 }, { "epoch": 0.32968856044038913, "grad_norm": 0.24686203970847262, "learning_rate": 3.897086315449503e-07, "loss": 1.4873, "step": 2186 }, { "epoch": 0.32983937862906265, "grad_norm": 0.24428165251845188, "learning_rate": 3.896168994411354e-07, "loss": 1.5535, "step": 2187 }, { "epoch": 0.3299901968177362, "grad_norm": 0.25997229380516884, "learning_rate": 3.895251416024885e-07, "loss": 1.5302, "step": 2188 }, { "epoch": 0.3301410150064098, "grad_norm": 0.24197139910273505, "learning_rate": 3.89433358049612e-07, "loss": 1.5913, "step": 2189 }, { "epoch": 0.3302918331950833, "grad_norm": 0.3740910107363862, "learning_rate": 3.89341548803114e-07, "loss": 1.5811, "step": 2190 }, { "epoch": 0.3304426513837569, "grad_norm": 0.27649441189430246, "learning_rate": 3.892497138836083e-07, "loss": 1.5538, "step": 2191 }, { "epoch": 0.3305934695724304, "grad_norm": 0.2725749848661994, "learning_rate": 3.8915785331171434e-07, "loss": 1.6107, "step": 2192 }, { "epoch": 0.330744287761104, "grad_norm": 0.25734228685195976, "learning_rate": 3.8906596710805786e-07, "loss": 1.5388, "step": 2193 }, { "epoch": 0.33089510594977756, "grad_norm": 0.2754469554795411, "learning_rate": 3.889740552932697e-07, "loss": 1.5455, "step": 2194 }, { "epoch": 0.3310459241384511, "grad_norm": 0.40458062480498685, "learning_rate": 3.8888211788798686e-07, "loss": 1.6405, "step": 2195 }, { "epoch": 0.33119674232712465, "grad_norm": 0.26011942351190315, "learning_rate": 3.88790154912852e-07, "loss": 1.5743, "step": 2196 }, { "epoch": 0.3313475605157982, "grad_norm": 0.3383973302983143, "learning_rate": 3.886981663885133e-07, "loss": 1.5769, "step": 2197 }, { "epoch": 0.33149837870447174, "grad_norm": 0.2796548395836774, "learning_rate": 3.8860615233562505e-07, "loss": 1.5225, "step": 2198 }, { "epoch": 0.3316491968931453, "grad_norm": 0.25733138375962183, "learning_rate": 3.8851411277484707e-07, "loss": 1.53, "step": 2199 }, { "epoch": 0.3318000150818189, "grad_norm": 0.25688616343477166, "learning_rate": 3.884220477268448e-07, "loss": 1.5664, "step": 2200 }, { "epoch": 0.3319508332704924, "grad_norm": 0.2503649611643573, "learning_rate": 3.883299572122897e-07, "loss": 1.5365, "step": 2201 }, { "epoch": 0.332101651459166, "grad_norm": 0.24801357715162026, "learning_rate": 3.882378412518586e-07, "loss": 1.6041, "step": 2202 }, { "epoch": 0.33225246964783955, "grad_norm": 0.25363808699274276, "learning_rate": 3.881456998662343e-07, "loss": 1.5687, "step": 2203 }, { "epoch": 0.3324032878365131, "grad_norm": 0.2784222811332939, "learning_rate": 3.8805353307610516e-07, "loss": 1.5556, "step": 2204 }, { "epoch": 0.33255410602518665, "grad_norm": 0.2468012831812265, "learning_rate": 3.879613409021654e-07, "loss": 1.5134, "step": 2205 }, { "epoch": 0.33270492421386016, "grad_norm": 0.26575926203468225, "learning_rate": 3.8786912336511486e-07, "loss": 1.6228, "step": 2206 }, { "epoch": 0.33285574240253374, "grad_norm": 0.2587565684416558, "learning_rate": 3.87776880485659e-07, "loss": 1.4733, "step": 2207 }, { "epoch": 0.3330065605912073, "grad_norm": 0.2437834927101869, "learning_rate": 3.87684612284509e-07, "loss": 1.5645, "step": 2208 }, { "epoch": 0.33315737877988083, "grad_norm": 0.26723277558649283, "learning_rate": 3.875923187823818e-07, "loss": 1.5352, "step": 2209 }, { "epoch": 0.3333081969685544, "grad_norm": 0.2504192784843705, "learning_rate": 3.875e-07, "loss": 1.5551, "step": 2210 }, { "epoch": 0.333459015157228, "grad_norm": 0.24068172574496213, "learning_rate": 3.8740765595809185e-07, "loss": 1.5108, "step": 2211 }, { "epoch": 0.3336098333459015, "grad_norm": 0.2829678841255783, "learning_rate": 3.8731528667739126e-07, "loss": 1.652, "step": 2212 }, { "epoch": 0.33376065153457507, "grad_norm": 0.24627534576714177, "learning_rate": 3.872228921786378e-07, "loss": 1.5755, "step": 2213 }, { "epoch": 0.33391146972324864, "grad_norm": 0.24708427462899366, "learning_rate": 3.8713047248257676e-07, "loss": 1.5418, "step": 2214 }, { "epoch": 0.33406228791192216, "grad_norm": 0.2679802446868686, "learning_rate": 3.870380276099591e-07, "loss": 1.5228, "step": 2215 }, { "epoch": 0.33421310610059574, "grad_norm": 0.2696195676939389, "learning_rate": 3.869455575815413e-07, "loss": 1.5744, "step": 2216 }, { "epoch": 0.3343639242892693, "grad_norm": 0.24986388598147388, "learning_rate": 3.868530624180856e-07, "loss": 1.5323, "step": 2217 }, { "epoch": 0.33451474247794283, "grad_norm": 0.27737913998484093, "learning_rate": 3.867605421403599e-07, "loss": 1.5496, "step": 2218 }, { "epoch": 0.3346655606666164, "grad_norm": 0.3270339884312583, "learning_rate": 3.8666799676913756e-07, "loss": 1.5286, "step": 2219 }, { "epoch": 0.33481637885529, "grad_norm": 0.2479114748899475, "learning_rate": 3.8657542632519796e-07, "loss": 1.5699, "step": 2220 }, { "epoch": 0.3349671970439635, "grad_norm": 0.23357546957906214, "learning_rate": 3.8648283082932567e-07, "loss": 1.5406, "step": 2221 }, { "epoch": 0.33511801523263707, "grad_norm": 0.44813768609402127, "learning_rate": 3.8639021030231106e-07, "loss": 1.6016, "step": 2222 }, { "epoch": 0.3352688334213106, "grad_norm": 0.24020122561270563, "learning_rate": 3.8629756476495023e-07, "loss": 1.5096, "step": 2223 }, { "epoch": 0.33541965160998416, "grad_norm": 0.24188367358672805, "learning_rate": 3.8620489423804476e-07, "loss": 1.5451, "step": 2224 }, { "epoch": 0.33557046979865773, "grad_norm": 0.24781994031545823, "learning_rate": 3.861121987424019e-07, "loss": 1.5975, "step": 2225 }, { "epoch": 0.33572128798733125, "grad_norm": 0.24221766652606178, "learning_rate": 3.860194782988345e-07, "loss": 1.5831, "step": 2226 }, { "epoch": 0.3358721061760048, "grad_norm": 0.24125890233354633, "learning_rate": 3.8592673292816094e-07, "loss": 1.5697, "step": 2227 }, { "epoch": 0.3360229243646784, "grad_norm": 0.2468993827799427, "learning_rate": 3.858339626512053e-07, "loss": 1.6049, "step": 2228 }, { "epoch": 0.3361737425533519, "grad_norm": 0.25379662786044455, "learning_rate": 3.857411674887972e-07, "loss": 1.5157, "step": 2229 }, { "epoch": 0.3363245607420255, "grad_norm": 1.124038381470129, "learning_rate": 3.856483474617718e-07, "loss": 1.5529, "step": 2230 }, { "epoch": 0.33647537893069907, "grad_norm": 0.26299087033602114, "learning_rate": 3.855555025909701e-07, "loss": 1.5535, "step": 2231 }, { "epoch": 0.3366261971193726, "grad_norm": 0.2474710227528756, "learning_rate": 3.8546263289723826e-07, "loss": 1.5763, "step": 2232 }, { "epoch": 0.33677701530804616, "grad_norm": 0.325380394521446, "learning_rate": 3.8536973840142836e-07, "loss": 1.544, "step": 2233 }, { "epoch": 0.33692783349671973, "grad_norm": 0.2543536772971605, "learning_rate": 3.852768191243977e-07, "loss": 1.542, "step": 2234 }, { "epoch": 0.33707865168539325, "grad_norm": 0.2777082163185314, "learning_rate": 3.8518387508700965e-07, "loss": 1.5565, "step": 2235 }, { "epoch": 0.3372294698740668, "grad_norm": 0.2348126950607934, "learning_rate": 3.8509090631013277e-07, "loss": 1.6255, "step": 2236 }, { "epoch": 0.33738028806274034, "grad_norm": 0.2588743938062479, "learning_rate": 3.849979128146412e-07, "loss": 1.5567, "step": 2237 }, { "epoch": 0.3375311062514139, "grad_norm": 0.24878932088777453, "learning_rate": 3.8490489462141474e-07, "loss": 1.6238, "step": 2238 }, { "epoch": 0.3376819244400875, "grad_norm": 0.24395580668015335, "learning_rate": 3.8481185175133856e-07, "loss": 1.5998, "step": 2239 }, { "epoch": 0.337832742628761, "grad_norm": 0.23968055186264078, "learning_rate": 3.847187842253036e-07, "loss": 1.6052, "step": 2240 }, { "epoch": 0.3379835608174346, "grad_norm": 0.2571877019236364, "learning_rate": 3.846256920642063e-07, "loss": 1.548, "step": 2241 }, { "epoch": 0.33813437900610815, "grad_norm": 0.2605293877175822, "learning_rate": 3.8453257528894844e-07, "loss": 1.639, "step": 2242 }, { "epoch": 0.3382851971947817, "grad_norm": 0.2505420542873784, "learning_rate": 3.8443943392043743e-07, "loss": 1.553, "step": 2243 }, { "epoch": 0.33843601538345525, "grad_norm": 0.27697603176255636, "learning_rate": 3.843462679795863e-07, "loss": 1.6327, "step": 2244 }, { "epoch": 0.3385868335721288, "grad_norm": 0.248589658769722, "learning_rate": 3.842530774873136e-07, "loss": 1.4545, "step": 2245 }, { "epoch": 0.33873765176080234, "grad_norm": 0.2972959581016365, "learning_rate": 3.841598624645431e-07, "loss": 1.6338, "step": 2246 }, { "epoch": 0.3388884699494759, "grad_norm": 0.25531625243830247, "learning_rate": 3.840666229322044e-07, "loss": 1.52, "step": 2247 }, { "epoch": 0.3390392881381495, "grad_norm": 0.3986180218938449, "learning_rate": 3.839733589112325e-07, "loss": 1.5638, "step": 2248 }, { "epoch": 0.339190106326823, "grad_norm": 0.2416717535849375, "learning_rate": 3.838800704225678e-07, "loss": 1.5555, "step": 2249 }, { "epoch": 0.3393409245154966, "grad_norm": 0.2445233088574405, "learning_rate": 3.837867574871565e-07, "loss": 1.5463, "step": 2250 }, { "epoch": 0.3394917427041701, "grad_norm": 0.25238831100568343, "learning_rate": 3.836934201259498e-07, "loss": 1.5774, "step": 2251 }, { "epoch": 0.33964256089284367, "grad_norm": 0.2525198477527483, "learning_rate": 3.836000583599048e-07, "loss": 1.5103, "step": 2252 }, { "epoch": 0.33979337908151724, "grad_norm": 0.2567497176908914, "learning_rate": 3.835066722099839e-07, "loss": 1.5488, "step": 2253 }, { "epoch": 0.33994419727019076, "grad_norm": 0.25762041730322705, "learning_rate": 3.834132616971551e-07, "loss": 1.5655, "step": 2254 }, { "epoch": 0.34009501545886434, "grad_norm": 0.28672621826467015, "learning_rate": 3.8331982684239165e-07, "loss": 1.5366, "step": 2255 }, { "epoch": 0.3402458336475379, "grad_norm": 0.2425603370892537, "learning_rate": 3.832263676666725e-07, "loss": 1.4937, "step": 2256 }, { "epoch": 0.34039665183621143, "grad_norm": 0.2441307058970796, "learning_rate": 3.8313288419098184e-07, "loss": 1.571, "step": 2257 }, { "epoch": 0.340547470024885, "grad_norm": 0.25371961723176456, "learning_rate": 3.8303937643630956e-07, "loss": 1.5487, "step": 2258 }, { "epoch": 0.3406982882135586, "grad_norm": 0.24999275713882987, "learning_rate": 3.8294584442365076e-07, "loss": 1.5377, "step": 2259 }, { "epoch": 0.3408491064022321, "grad_norm": 0.2901363335818867, "learning_rate": 3.8285228817400614e-07, "loss": 1.5308, "step": 2260 }, { "epoch": 0.34099992459090567, "grad_norm": 0.246290205087347, "learning_rate": 3.8275870770838185e-07, "loss": 1.5847, "step": 2261 }, { "epoch": 0.34115074277957924, "grad_norm": 0.2493151932192866, "learning_rate": 3.8266510304778944e-07, "loss": 1.6125, "step": 2262 }, { "epoch": 0.34130156096825276, "grad_norm": 0.2547615126894274, "learning_rate": 3.825714742132458e-07, "loss": 1.593, "step": 2263 }, { "epoch": 0.34145237915692633, "grad_norm": 0.25709531771387, "learning_rate": 3.824778212257734e-07, "loss": 1.5697, "step": 2264 }, { "epoch": 0.34160319734559985, "grad_norm": 0.2512506159715361, "learning_rate": 3.823841441064e-07, "loss": 1.6255, "step": 2265 }, { "epoch": 0.3417540155342734, "grad_norm": 0.2507657433009384, "learning_rate": 3.8229044287615895e-07, "loss": 1.5278, "step": 2266 }, { "epoch": 0.341904833722947, "grad_norm": 0.27216665659354644, "learning_rate": 3.8219671755608875e-07, "loss": 1.5638, "step": 2267 }, { "epoch": 0.3420556519116205, "grad_norm": 0.26557635715854694, "learning_rate": 3.8210296816723356e-07, "loss": 1.5815, "step": 2268 }, { "epoch": 0.3422064701002941, "grad_norm": 0.24469797082153816, "learning_rate": 3.820091947306429e-07, "loss": 1.4977, "step": 2269 }, { "epoch": 0.34235728828896766, "grad_norm": 0.2497279085693623, "learning_rate": 3.819153972673715e-07, "loss": 1.5477, "step": 2270 }, { "epoch": 0.3425081064776412, "grad_norm": 0.2527735661968631, "learning_rate": 3.8182157579847965e-07, "loss": 1.5595, "step": 2271 }, { "epoch": 0.34265892466631476, "grad_norm": 0.2540785759331565, "learning_rate": 3.8172773034503314e-07, "loss": 1.5908, "step": 2272 }, { "epoch": 0.34280974285498833, "grad_norm": 0.28467700544081004, "learning_rate": 3.816338609281028e-07, "loss": 1.5355, "step": 2273 }, { "epoch": 0.34296056104366185, "grad_norm": 0.2462037570531213, "learning_rate": 3.815399675687653e-07, "loss": 1.5683, "step": 2274 }, { "epoch": 0.3431113792323354, "grad_norm": 0.25120363303396226, "learning_rate": 3.814460502881023e-07, "loss": 1.5563, "step": 2275 }, { "epoch": 0.343262197421009, "grad_norm": 0.24889628742764763, "learning_rate": 3.8135210910720084e-07, "loss": 1.5814, "step": 2276 }, { "epoch": 0.3434130156096825, "grad_norm": 0.2502347607258946, "learning_rate": 3.8125814404715364e-07, "loss": 1.5306, "step": 2277 }, { "epoch": 0.3435638337983561, "grad_norm": 1.4362376842967715, "learning_rate": 3.8116415512905843e-07, "loss": 1.5799, "step": 2278 }, { "epoch": 0.34371465198702966, "grad_norm": 0.24982516403294563, "learning_rate": 3.8107014237401867e-07, "loss": 1.5381, "step": 2279 }, { "epoch": 0.3438654701757032, "grad_norm": 0.3185115617545277, "learning_rate": 3.8097610580314284e-07, "loss": 1.526, "step": 2280 }, { "epoch": 0.34401628836437675, "grad_norm": 0.33250873403709974, "learning_rate": 3.8088204543754483e-07, "loss": 1.6106, "step": 2281 }, { "epoch": 0.3441671065530503, "grad_norm": 0.24036323630812328, "learning_rate": 3.80787961298344e-07, "loss": 1.5334, "step": 2282 }, { "epoch": 0.34431792474172385, "grad_norm": 0.24417765915610368, "learning_rate": 3.8069385340666497e-07, "loss": 1.5566, "step": 2283 }, { "epoch": 0.3444687429303974, "grad_norm": 0.25273304777011013, "learning_rate": 3.8059972178363777e-07, "loss": 1.578, "step": 2284 }, { "epoch": 0.34461956111907094, "grad_norm": 0.24623320287757744, "learning_rate": 3.805055664503975e-07, "loss": 1.5188, "step": 2285 }, { "epoch": 0.3447703793077445, "grad_norm": 0.23877957030498415, "learning_rate": 3.8041138742808506e-07, "loss": 1.5832, "step": 2286 }, { "epoch": 0.3449211974964181, "grad_norm": 0.25406449718842056, "learning_rate": 3.803171847378461e-07, "loss": 1.5424, "step": 2287 }, { "epoch": 0.3450720156850916, "grad_norm": 0.23913358715230615, "learning_rate": 3.80222958400832e-07, "loss": 1.5886, "step": 2288 }, { "epoch": 0.3452228338737652, "grad_norm": 0.6101911876858062, "learning_rate": 3.8012870843819934e-07, "loss": 1.6251, "step": 2289 }, { "epoch": 0.34537365206243875, "grad_norm": 0.2509776593810166, "learning_rate": 3.8003443487110995e-07, "loss": 1.6073, "step": 2290 }, { "epoch": 0.34552447025111227, "grad_norm": 0.2422242713596001, "learning_rate": 3.79940137720731e-07, "loss": 1.5951, "step": 2291 }, { "epoch": 0.34567528843978584, "grad_norm": 0.24824865731058396, "learning_rate": 3.79845817008235e-07, "loss": 1.5508, "step": 2292 }, { "epoch": 0.3458261066284594, "grad_norm": 0.25626805095048233, "learning_rate": 3.7975147275479947e-07, "loss": 1.6062, "step": 2293 }, { "epoch": 0.34597692481713294, "grad_norm": 0.2942650248769778, "learning_rate": 3.7965710498160767e-07, "loss": 1.6115, "step": 2294 }, { "epoch": 0.3461277430058065, "grad_norm": 0.24907806981702166, "learning_rate": 3.795627137098479e-07, "loss": 1.5798, "step": 2295 }, { "epoch": 0.34627856119448003, "grad_norm": 0.2536491718380689, "learning_rate": 3.794682989607137e-07, "loss": 1.5955, "step": 2296 }, { "epoch": 0.3464293793831536, "grad_norm": 0.2536962046438531, "learning_rate": 3.793738607554039e-07, "loss": 1.5237, "step": 2297 }, { "epoch": 0.3465801975718272, "grad_norm": 0.27702294905095187, "learning_rate": 3.792793991151226e-07, "loss": 1.5761, "step": 2298 }, { "epoch": 0.3467310157605007, "grad_norm": 0.2552120558949902, "learning_rate": 3.791849140610794e-07, "loss": 1.5531, "step": 2299 }, { "epoch": 0.34688183394917427, "grad_norm": 0.2844932996298561, "learning_rate": 3.790904056144887e-07, "loss": 1.5634, "step": 2300 }, { "epoch": 0.34703265213784784, "grad_norm": 0.31350790888658125, "learning_rate": 3.7899587379657043e-07, "loss": 1.6519, "step": 2301 }, { "epoch": 0.34718347032652136, "grad_norm": 0.327166975783665, "learning_rate": 3.789013186285499e-07, "loss": 1.5463, "step": 2302 }, { "epoch": 0.34733428851519493, "grad_norm": 0.3671001662868849, "learning_rate": 3.7880674013165737e-07, "loss": 1.4951, "step": 2303 }, { "epoch": 0.3474851067038685, "grad_norm": 0.25791788975509533, "learning_rate": 3.7871213832712847e-07, "loss": 1.5153, "step": 2304 }, { "epoch": 0.347635924892542, "grad_norm": 0.25043027941632756, "learning_rate": 3.7861751323620405e-07, "loss": 1.5284, "step": 2305 }, { "epoch": 0.3477867430812156, "grad_norm": 0.27288469174574215, "learning_rate": 3.785228648801304e-07, "loss": 1.5774, "step": 2306 }, { "epoch": 0.3479375612698892, "grad_norm": 0.24291337383099834, "learning_rate": 3.7842819328015847e-07, "loss": 1.5544, "step": 2307 }, { "epoch": 0.3480883794585627, "grad_norm": 0.2462920734808384, "learning_rate": 3.783334984575451e-07, "loss": 1.4761, "step": 2308 }, { "epoch": 0.34823919764723626, "grad_norm": 0.24337486105201564, "learning_rate": 3.7823878043355185e-07, "loss": 1.5438, "step": 2309 }, { "epoch": 0.3483900158359098, "grad_norm": 0.24901269879840904, "learning_rate": 3.781440392294458e-07, "loss": 1.5417, "step": 2310 }, { "epoch": 0.34854083402458336, "grad_norm": 0.2574773833389837, "learning_rate": 3.78049274866499e-07, "loss": 1.5779, "step": 2311 }, { "epoch": 0.34869165221325693, "grad_norm": 0.27317633569887817, "learning_rate": 3.7795448736598895e-07, "loss": 1.6136, "step": 2312 }, { "epoch": 0.34884247040193045, "grad_norm": 0.24881592357265409, "learning_rate": 3.7785967674919806e-07, "loss": 1.6093, "step": 2313 }, { "epoch": 0.348993288590604, "grad_norm": 0.24956684849307417, "learning_rate": 3.7776484303741414e-07, "loss": 1.5973, "step": 2314 }, { "epoch": 0.3491441067792776, "grad_norm": 0.2464129365453703, "learning_rate": 3.776699862519301e-07, "loss": 1.5477, "step": 2315 }, { "epoch": 0.3492949249679511, "grad_norm": 0.25191976238902325, "learning_rate": 3.775751064140441e-07, "loss": 1.5582, "step": 2316 }, { "epoch": 0.3494457431566247, "grad_norm": 0.264147892478328, "learning_rate": 3.7748020354505936e-07, "loss": 1.6021, "step": 2317 }, { "epoch": 0.34959656134529826, "grad_norm": 0.2757351180919783, "learning_rate": 3.773852776662844e-07, "loss": 1.5334, "step": 2318 }, { "epoch": 0.3497473795339718, "grad_norm": 0.24862941758346846, "learning_rate": 3.7729032879903285e-07, "loss": 1.5957, "step": 2319 }, { "epoch": 0.34989819772264535, "grad_norm": 0.24897857861626696, "learning_rate": 3.771953569646234e-07, "loss": 1.5855, "step": 2320 }, { "epoch": 0.35004901591131893, "grad_norm": 0.2517463859076541, "learning_rate": 3.7710036218438007e-07, "loss": 1.5371, "step": 2321 }, { "epoch": 0.35019983409999245, "grad_norm": 0.268956424751186, "learning_rate": 3.7700534447963194e-07, "loss": 1.4844, "step": 2322 }, { "epoch": 0.350350652288666, "grad_norm": 0.2500277125108567, "learning_rate": 3.769103038717133e-07, "loss": 1.52, "step": 2323 }, { "epoch": 0.3505014704773396, "grad_norm": 0.27279449038724674, "learning_rate": 3.7681524038196337e-07, "loss": 1.5496, "step": 2324 }, { "epoch": 0.3506522886660131, "grad_norm": 0.2521931291040799, "learning_rate": 3.7672015403172687e-07, "loss": 1.5905, "step": 2325 }, { "epoch": 0.3508031068546867, "grad_norm": 0.24490012882151652, "learning_rate": 3.766250448423534e-07, "loss": 1.5401, "step": 2326 }, { "epoch": 0.3509539250433602, "grad_norm": 0.2924535818177954, "learning_rate": 3.765299128351976e-07, "loss": 1.5453, "step": 2327 }, { "epoch": 0.3511047432320338, "grad_norm": 0.29600395237439076, "learning_rate": 3.764347580316196e-07, "loss": 1.5717, "step": 2328 }, { "epoch": 0.35125556142070735, "grad_norm": 0.26532230372293647, "learning_rate": 3.7633958045298433e-07, "loss": 1.5291, "step": 2329 }, { "epoch": 0.35140637960938087, "grad_norm": 0.24916894584154997, "learning_rate": 3.762443801206618e-07, "loss": 1.5405, "step": 2330 }, { "epoch": 0.35155719779805444, "grad_norm": 0.2581134867558839, "learning_rate": 3.761491570560274e-07, "loss": 1.5962, "step": 2331 }, { "epoch": 0.351708015986728, "grad_norm": 0.2438935147855879, "learning_rate": 3.760539112804615e-07, "loss": 1.58, "step": 2332 }, { "epoch": 0.35185883417540154, "grad_norm": 0.37578295769326836, "learning_rate": 3.7595864281534945e-07, "loss": 1.5589, "step": 2333 }, { "epoch": 0.3520096523640751, "grad_norm": 0.24486877017889822, "learning_rate": 3.7586335168208187e-07, "loss": 1.5367, "step": 2334 }, { "epoch": 0.3521604705527487, "grad_norm": 0.2429857832769579, "learning_rate": 3.7576803790205447e-07, "loss": 1.4925, "step": 2335 }, { "epoch": 0.3523112887414222, "grad_norm": 0.2397245100035915, "learning_rate": 3.7567270149666775e-07, "loss": 1.5823, "step": 2336 }, { "epoch": 0.3524621069300958, "grad_norm": 0.2500161975176447, "learning_rate": 3.7557734248732757e-07, "loss": 1.6248, "step": 2337 }, { "epoch": 0.35261292511876935, "grad_norm": 0.28561743382712795, "learning_rate": 3.7548196089544505e-07, "loss": 1.5107, "step": 2338 }, { "epoch": 0.35276374330744287, "grad_norm": 0.26571887001887556, "learning_rate": 3.753865567424358e-07, "loss": 1.5116, "step": 2339 }, { "epoch": 0.35291456149611644, "grad_norm": 0.25473540068810213, "learning_rate": 3.7529113004972114e-07, "loss": 1.5812, "step": 2340 }, { "epoch": 0.35306537968478996, "grad_norm": 0.2555471401859078, "learning_rate": 3.751956808387268e-07, "loss": 1.4831, "step": 2341 }, { "epoch": 0.35321619787346353, "grad_norm": 0.27184839392756677, "learning_rate": 3.751002091308842e-07, "loss": 1.5669, "step": 2342 }, { "epoch": 0.3533670160621371, "grad_norm": 0.2515942203557161, "learning_rate": 3.750047149476294e-07, "loss": 1.6251, "step": 2343 }, { "epoch": 0.3535178342508106, "grad_norm": 0.26112307976157956, "learning_rate": 3.749091983104036e-07, "loss": 1.5943, "step": 2344 }, { "epoch": 0.3536686524394842, "grad_norm": 0.2501421913523755, "learning_rate": 3.748136592406531e-07, "loss": 1.5759, "step": 2345 }, { "epoch": 0.3538194706281578, "grad_norm": 0.24752102036976434, "learning_rate": 3.747180977598293e-07, "loss": 1.5804, "step": 2346 }, { "epoch": 0.3539702888168313, "grad_norm": 0.2455951689312032, "learning_rate": 3.746225138893883e-07, "loss": 1.5124, "step": 2347 }, { "epoch": 0.35412110700550486, "grad_norm": 0.24339549004166428, "learning_rate": 3.745269076507918e-07, "loss": 1.5778, "step": 2348 }, { "epoch": 0.35427192519417844, "grad_norm": 0.7443699191473532, "learning_rate": 3.744312790655058e-07, "loss": 1.5718, "step": 2349 }, { "epoch": 0.35442274338285196, "grad_norm": 0.24946128343868357, "learning_rate": 3.743356281550019e-07, "loss": 1.5991, "step": 2350 }, { "epoch": 0.35457356157152553, "grad_norm": 0.24247758143910678, "learning_rate": 3.742399549407566e-07, "loss": 1.4819, "step": 2351 }, { "epoch": 0.3547243797601991, "grad_norm": 0.2386100112502874, "learning_rate": 3.741442594442512e-07, "loss": 1.5642, "step": 2352 }, { "epoch": 0.3548751979488726, "grad_norm": 0.2505357416538226, "learning_rate": 3.7404854168697214e-07, "loss": 1.5825, "step": 2353 }, { "epoch": 0.3550260161375462, "grad_norm": 0.28867608528120875, "learning_rate": 3.7395280169041087e-07, "loss": 1.5876, "step": 2354 }, { "epoch": 0.3551768343262197, "grad_norm": 0.241730587623635, "learning_rate": 3.7385703947606374e-07, "loss": 1.5736, "step": 2355 }, { "epoch": 0.3553276525148933, "grad_norm": 0.24831574140721346, "learning_rate": 3.7376125506543227e-07, "loss": 1.6067, "step": 2356 }, { "epoch": 0.35547847070356686, "grad_norm": 0.2590154730802224, "learning_rate": 3.7366544848002277e-07, "loss": 1.5333, "step": 2357 }, { "epoch": 0.3556292888922404, "grad_norm": 0.31178147754240004, "learning_rate": 3.735696197413467e-07, "loss": 1.4965, "step": 2358 }, { "epoch": 0.35578010708091395, "grad_norm": 1.3552928491220932, "learning_rate": 3.7347376887092026e-07, "loss": 1.5818, "step": 2359 }, { "epoch": 0.35593092526958753, "grad_norm": 0.24500608110449468, "learning_rate": 3.733778958902649e-07, "loss": 1.5467, "step": 2360 }, { "epoch": 0.35608174345826105, "grad_norm": 0.25089900091150935, "learning_rate": 3.7328200082090677e-07, "loss": 1.5542, "step": 2361 }, { "epoch": 0.3562325616469346, "grad_norm": 0.24158656678132237, "learning_rate": 3.7318608368437716e-07, "loss": 1.5095, "step": 2362 }, { "epoch": 0.3563833798356082, "grad_norm": 0.2534149802126099, "learning_rate": 3.7309014450221225e-07, "loss": 1.5902, "step": 2363 }, { "epoch": 0.3565341980242817, "grad_norm": 0.24798687422411522, "learning_rate": 3.729941832959533e-07, "loss": 1.5484, "step": 2364 }, { "epoch": 0.3566850162129553, "grad_norm": 0.25046258736392985, "learning_rate": 3.728982000871462e-07, "loss": 1.5405, "step": 2365 }, { "epoch": 0.35683583440162886, "grad_norm": 0.2440667878039132, "learning_rate": 3.728021948973421e-07, "loss": 1.6005, "step": 2366 }, { "epoch": 0.3569866525903024, "grad_norm": 0.24610788855981183, "learning_rate": 3.7270616774809693e-07, "loss": 1.5601, "step": 2367 }, { "epoch": 0.35713747077897595, "grad_norm": 0.9454335873908799, "learning_rate": 3.726101186609715e-07, "loss": 1.5024, "step": 2368 }, { "epoch": 0.3572882889676495, "grad_norm": 0.28378028432667163, "learning_rate": 3.7251404765753194e-07, "loss": 1.557, "step": 2369 }, { "epoch": 0.35743910715632304, "grad_norm": 0.2557935616464503, "learning_rate": 3.7241795475934856e-07, "loss": 1.5365, "step": 2370 }, { "epoch": 0.3575899253449966, "grad_norm": 0.2549332722675419, "learning_rate": 3.7232183998799727e-07, "loss": 1.4674, "step": 2371 }, { "epoch": 0.35774074353367014, "grad_norm": 1.1517601933256894, "learning_rate": 3.722257033650586e-07, "loss": 1.5681, "step": 2372 }, { "epoch": 0.3578915617223437, "grad_norm": 0.272656754166095, "learning_rate": 3.72129544912118e-07, "loss": 1.5631, "step": 2373 }, { "epoch": 0.3580423799110173, "grad_norm": 0.29022175862146116, "learning_rate": 3.7203336465076573e-07, "loss": 1.5758, "step": 2374 }, { "epoch": 0.3581931980996908, "grad_norm": 0.24817613832794372, "learning_rate": 3.719371626025973e-07, "loss": 1.6106, "step": 2375 }, { "epoch": 0.3583440162883644, "grad_norm": 0.24399194599218416, "learning_rate": 3.7184093878921267e-07, "loss": 1.4977, "step": 2376 }, { "epoch": 0.35849483447703795, "grad_norm": 0.25767051635975635, "learning_rate": 3.7174469323221704e-07, "loss": 1.5812, "step": 2377 }, { "epoch": 0.35864565266571147, "grad_norm": 0.2523174711087523, "learning_rate": 3.7164842595322015e-07, "loss": 1.5518, "step": 2378 }, { "epoch": 0.35879647085438504, "grad_norm": 0.26428032115396755, "learning_rate": 3.71552136973837e-07, "loss": 1.6004, "step": 2379 }, { "epoch": 0.3589472890430586, "grad_norm": 0.2598334701344459, "learning_rate": 3.714558263156872e-07, "loss": 1.5377, "step": 2380 }, { "epoch": 0.35909810723173213, "grad_norm": 0.24231961396705243, "learning_rate": 3.713594940003952e-07, "loss": 1.5659, "step": 2381 }, { "epoch": 0.3592489254204057, "grad_norm": 0.2557782748611694, "learning_rate": 3.7126314004959056e-07, "loss": 1.4764, "step": 2382 }, { "epoch": 0.3593997436090793, "grad_norm": 0.32643135012971164, "learning_rate": 3.7116676448490754e-07, "loss": 1.5139, "step": 2383 }, { "epoch": 0.3595505617977528, "grad_norm": 0.24383151316441518, "learning_rate": 3.710703673279852e-07, "loss": 1.537, "step": 2384 }, { "epoch": 0.3597013799864264, "grad_norm": 0.2505535549620646, "learning_rate": 3.709739486004674e-07, "loss": 1.5429, "step": 2385 }, { "epoch": 0.3598521981750999, "grad_norm": 0.26628913276403715, "learning_rate": 3.708775083240031e-07, "loss": 1.6109, "step": 2386 }, { "epoch": 0.36000301636377346, "grad_norm": 0.2653073300714376, "learning_rate": 3.70781046520246e-07, "loss": 1.6284, "step": 2387 }, { "epoch": 0.36015383455244704, "grad_norm": 0.3151596090823107, "learning_rate": 3.706845632108545e-07, "loss": 1.6701, "step": 2388 }, { "epoch": 0.36030465274112056, "grad_norm": 0.2580701573700956, "learning_rate": 3.705880584174919e-07, "loss": 1.5415, "step": 2389 }, { "epoch": 0.36045547092979413, "grad_norm": 0.45731481557385795, "learning_rate": 3.704915321618263e-07, "loss": 1.5663, "step": 2390 }, { "epoch": 0.3606062891184677, "grad_norm": 0.29332627821859003, "learning_rate": 3.7039498446553073e-07, "loss": 1.541, "step": 2391 }, { "epoch": 0.3607571073071412, "grad_norm": 0.2539419562347637, "learning_rate": 3.7029841535028295e-07, "loss": 1.5933, "step": 2392 }, { "epoch": 0.3609079254958148, "grad_norm": 0.2487819911491069, "learning_rate": 3.702018248377656e-07, "loss": 1.5854, "step": 2393 }, { "epoch": 0.36105874368448837, "grad_norm": 0.2540973512020146, "learning_rate": 3.7010521294966583e-07, "loss": 1.5584, "step": 2394 }, { "epoch": 0.3612095618731619, "grad_norm": 0.276230147107862, "learning_rate": 3.700085797076761e-07, "loss": 1.5397, "step": 2395 }, { "epoch": 0.36136038006183546, "grad_norm": 0.25044667399851733, "learning_rate": 3.6991192513349324e-07, "loss": 1.5411, "step": 2396 }, { "epoch": 0.36151119825050904, "grad_norm": 0.27066315255091705, "learning_rate": 3.698152492488191e-07, "loss": 1.5477, "step": 2397 }, { "epoch": 0.36166201643918255, "grad_norm": 0.24415988508449232, "learning_rate": 3.697185520753601e-07, "loss": 1.5201, "step": 2398 }, { "epoch": 0.36181283462785613, "grad_norm": 0.24378718076324166, "learning_rate": 3.696218336348276e-07, "loss": 1.5561, "step": 2399 }, { "epoch": 0.36196365281652965, "grad_norm": 0.24117983955024197, "learning_rate": 3.695250939489379e-07, "loss": 1.5658, "step": 2400 }, { "epoch": 0.3621144710052032, "grad_norm": 0.2494661857308847, "learning_rate": 3.6942833303941163e-07, "loss": 1.596, "step": 2401 }, { "epoch": 0.3622652891938768, "grad_norm": 0.2497563413431003, "learning_rate": 3.6933155092797457e-07, "loss": 1.506, "step": 2402 }, { "epoch": 0.3624161073825503, "grad_norm": 0.23857213517373116, "learning_rate": 3.6923474763635716e-07, "loss": 1.5657, "step": 2403 }, { "epoch": 0.3625669255712239, "grad_norm": 0.30240876054872073, "learning_rate": 3.6913792318629433e-07, "loss": 1.5096, "step": 2404 }, { "epoch": 0.36271774375989746, "grad_norm": 0.25689137093919195, "learning_rate": 3.6904107759952623e-07, "loss": 1.574, "step": 2405 }, { "epoch": 0.362868561948571, "grad_norm": 0.24164303407146231, "learning_rate": 3.689442108977974e-07, "loss": 1.599, "step": 2406 }, { "epoch": 0.36301938013724455, "grad_norm": 0.2397255897218177, "learning_rate": 3.688473231028574e-07, "loss": 1.5725, "step": 2407 }, { "epoch": 0.3631701983259181, "grad_norm": 0.2946173688800962, "learning_rate": 3.687504142364601e-07, "loss": 1.5733, "step": 2408 }, { "epoch": 0.36332101651459164, "grad_norm": 0.257563218080772, "learning_rate": 3.6865348432036453e-07, "loss": 1.5593, "step": 2409 }, { "epoch": 0.3634718347032652, "grad_norm": 0.2525257965158378, "learning_rate": 3.6855653337633423e-07, "loss": 1.5775, "step": 2410 }, { "epoch": 0.3636226528919388, "grad_norm": 0.2549787899101132, "learning_rate": 3.684595614261375e-07, "loss": 1.5236, "step": 2411 }, { "epoch": 0.3637734710806123, "grad_norm": 0.25792198225445445, "learning_rate": 3.6836256849154735e-07, "loss": 1.5561, "step": 2412 }, { "epoch": 0.3639242892692859, "grad_norm": 0.24801130882772623, "learning_rate": 3.6826555459434165e-07, "loss": 1.4832, "step": 2413 }, { "epoch": 0.36407510745795946, "grad_norm": 0.2441895056224972, "learning_rate": 3.681685197563027e-07, "loss": 1.5012, "step": 2414 }, { "epoch": 0.364225925646633, "grad_norm": 0.24712875621506114, "learning_rate": 3.680714639992177e-07, "loss": 1.5336, "step": 2415 }, { "epoch": 0.36437674383530655, "grad_norm": 0.2522388109306788, "learning_rate": 3.679743873448785e-07, "loss": 1.6083, "step": 2416 }, { "epoch": 0.36452756202398007, "grad_norm": 0.5903504095147515, "learning_rate": 3.678772898150816e-07, "loss": 1.6368, "step": 2417 }, { "epoch": 0.36467838021265364, "grad_norm": 0.2597251356207736, "learning_rate": 3.6778017143162826e-07, "loss": 1.5642, "step": 2418 }, { "epoch": 0.3648291984013272, "grad_norm": 0.2531960272531877, "learning_rate": 3.676830322163244e-07, "loss": 1.5711, "step": 2419 }, { "epoch": 0.36498001659000073, "grad_norm": 0.7327336650049859, "learning_rate": 3.6758587219098055e-07, "loss": 1.58, "step": 2420 }, { "epoch": 0.3651308347786743, "grad_norm": 0.7568978409238439, "learning_rate": 3.674886913774121e-07, "loss": 1.5805, "step": 2421 }, { "epoch": 0.3652816529673479, "grad_norm": 0.2606031974117596, "learning_rate": 3.673914897974387e-07, "loss": 1.6141, "step": 2422 }, { "epoch": 0.3654324711560214, "grad_norm": 0.25452997507792435, "learning_rate": 3.6729426747288523e-07, "loss": 1.66, "step": 2423 }, { "epoch": 0.365583289344695, "grad_norm": 0.25396229477622617, "learning_rate": 3.671970244255807e-07, "loss": 1.5081, "step": 2424 }, { "epoch": 0.36573410753336855, "grad_norm": 0.25362236963149887, "learning_rate": 3.670997606773592e-07, "loss": 1.5591, "step": 2425 }, { "epoch": 0.36588492572204206, "grad_norm": 0.24657346521114726, "learning_rate": 3.670024762500592e-07, "loss": 1.565, "step": 2426 }, { "epoch": 0.36603574391071564, "grad_norm": 0.25369751050593276, "learning_rate": 3.6690517116552384e-07, "loss": 1.6045, "step": 2427 }, { "epoch": 0.3661865620993892, "grad_norm": 0.2416199710906676, "learning_rate": 3.6680784544560096e-07, "loss": 1.5997, "step": 2428 }, { "epoch": 0.36633738028806273, "grad_norm": 0.25616058190831803, "learning_rate": 3.66710499112143e-07, "loss": 1.6184, "step": 2429 }, { "epoch": 0.3664881984767363, "grad_norm": 0.2598470043980163, "learning_rate": 3.666131321870072e-07, "loss": 1.5097, "step": 2430 }, { "epoch": 0.3666390166654098, "grad_norm": 0.25397390756446714, "learning_rate": 3.6651574469205503e-07, "loss": 1.4104, "step": 2431 }, { "epoch": 0.3667898348540834, "grad_norm": 0.2532940923495218, "learning_rate": 3.66418336649153e-07, "loss": 1.5688, "step": 2432 }, { "epoch": 0.36694065304275697, "grad_norm": 0.26372885031044346, "learning_rate": 3.6632090808017196e-07, "loss": 1.5981, "step": 2433 }, { "epoch": 0.3670914712314305, "grad_norm": 0.736836117933467, "learning_rate": 3.662234590069876e-07, "loss": 1.5987, "step": 2434 }, { "epoch": 0.36724228942010406, "grad_norm": 0.25180599095166145, "learning_rate": 3.661259894514798e-07, "loss": 1.572, "step": 2435 }, { "epoch": 0.36739310760877764, "grad_norm": 0.2522775024815644, "learning_rate": 3.6602849943553366e-07, "loss": 1.5828, "step": 2436 }, { "epoch": 0.36754392579745115, "grad_norm": 0.2879773326396739, "learning_rate": 3.6593098898103825e-07, "loss": 1.5937, "step": 2437 }, { "epoch": 0.36769474398612473, "grad_norm": 0.24090414223098844, "learning_rate": 3.658334581098877e-07, "loss": 1.6163, "step": 2438 }, { "epoch": 0.3678455621747983, "grad_norm": 0.24308021934072732, "learning_rate": 3.6573590684398036e-07, "loss": 1.5329, "step": 2439 }, { "epoch": 0.3679963803634718, "grad_norm": 0.254692886434861, "learning_rate": 3.6563833520521944e-07, "loss": 1.5369, "step": 2440 }, { "epoch": 0.3681471985521454, "grad_norm": 0.2737286974799631, "learning_rate": 3.6554074321551263e-07, "loss": 1.5459, "step": 2441 }, { "epoch": 0.36829801674081897, "grad_norm": 0.23623753229922384, "learning_rate": 3.6544313089677204e-07, "loss": 1.57, "step": 2442 }, { "epoch": 0.3684488349294925, "grad_norm": 0.2555802056096531, "learning_rate": 3.6534549827091465e-07, "loss": 1.4908, "step": 2443 }, { "epoch": 0.36859965311816606, "grad_norm": 0.26079223078952796, "learning_rate": 3.652478453598617e-07, "loss": 1.5763, "step": 2444 }, { "epoch": 0.3687504713068396, "grad_norm": 0.25604931144429594, "learning_rate": 3.651501721855392e-07, "loss": 1.5196, "step": 2445 }, { "epoch": 0.36890128949551315, "grad_norm": 0.29826940759380655, "learning_rate": 3.6505247876987756e-07, "loss": 1.5709, "step": 2446 }, { "epoch": 0.3690521076841867, "grad_norm": 0.25470009222890144, "learning_rate": 3.649547651348118e-07, "loss": 1.5033, "step": 2447 }, { "epoch": 0.36920292587286024, "grad_norm": 0.24832644332181872, "learning_rate": 3.6485703130228147e-07, "loss": 1.5298, "step": 2448 }, { "epoch": 0.3693537440615338, "grad_norm": 0.23751675097100958, "learning_rate": 3.647592772942308e-07, "loss": 1.5558, "step": 2449 }, { "epoch": 0.3695045622502074, "grad_norm": 0.3061235443052442, "learning_rate": 3.6466150313260813e-07, "loss": 1.5705, "step": 2450 }, { "epoch": 0.3696553804388809, "grad_norm": 0.2420150027388054, "learning_rate": 3.6456370883936686e-07, "loss": 1.5607, "step": 2451 }, { "epoch": 0.3698061986275545, "grad_norm": 0.24859998354389287, "learning_rate": 3.644658944364646e-07, "loss": 1.5607, "step": 2452 }, { "epoch": 0.36995701681622806, "grad_norm": 0.24962571236153827, "learning_rate": 3.643680599458635e-07, "loss": 1.5867, "step": 2453 }, { "epoch": 0.3701078350049016, "grad_norm": 0.31443800935488536, "learning_rate": 3.6427020538953025e-07, "loss": 1.5697, "step": 2454 }, { "epoch": 0.37025865319357515, "grad_norm": 0.24373642702895276, "learning_rate": 3.64172330789436e-07, "loss": 1.5622, "step": 2455 }, { "epoch": 0.3704094713822487, "grad_norm": 0.25621149690462364, "learning_rate": 3.6407443616755657e-07, "loss": 1.5073, "step": 2456 }, { "epoch": 0.37056028957092224, "grad_norm": 0.25762597568130835, "learning_rate": 3.6397652154587206e-07, "loss": 1.6092, "step": 2457 }, { "epoch": 0.3707111077595958, "grad_norm": 1.1919798777471118, "learning_rate": 3.6387858694636715e-07, "loss": 1.5425, "step": 2458 }, { "epoch": 0.3708619259482694, "grad_norm": 0.251421077271863, "learning_rate": 3.6378063239103113e-07, "loss": 1.4724, "step": 2459 }, { "epoch": 0.3710127441369429, "grad_norm": 0.25366741666087783, "learning_rate": 3.6368265790185746e-07, "loss": 1.5215, "step": 2460 }, { "epoch": 0.3711635623256165, "grad_norm": 0.4207178094706561, "learning_rate": 3.635846635008444e-07, "loss": 1.5114, "step": 2461 }, { "epoch": 0.37131438051429, "grad_norm": 0.32196750384289907, "learning_rate": 3.6348664920999456e-07, "loss": 1.6209, "step": 2462 }, { "epoch": 0.3714651987029636, "grad_norm": 0.2495336315511228, "learning_rate": 3.633886150513149e-07, "loss": 1.5458, "step": 2463 }, { "epoch": 0.37161601689163715, "grad_norm": 0.7472867025315102, "learning_rate": 3.6329056104681704e-07, "loss": 1.563, "step": 2464 }, { "epoch": 0.37176683508031066, "grad_norm": 0.29741446110595743, "learning_rate": 3.6319248721851685e-07, "loss": 1.5654, "step": 2465 }, { "epoch": 0.37191765326898424, "grad_norm": 0.3236843198755218, "learning_rate": 3.630943935884349e-07, "loss": 1.4952, "step": 2466 }, { "epoch": 0.3720684714576578, "grad_norm": 0.2594476741448998, "learning_rate": 3.62996280178596e-07, "loss": 1.5008, "step": 2467 }, { "epoch": 0.37221928964633133, "grad_norm": 0.2538384281299253, "learning_rate": 3.6289814701102943e-07, "loss": 1.6065, "step": 2468 }, { "epoch": 0.3723701078350049, "grad_norm": 0.27122348792809003, "learning_rate": 3.62799994107769e-07, "loss": 1.5326, "step": 2469 }, { "epoch": 0.3725209260236785, "grad_norm": 0.25404092193633077, "learning_rate": 3.627018214908528e-07, "loss": 1.5207, "step": 2470 }, { "epoch": 0.372671744212352, "grad_norm": 0.24982996965510668, "learning_rate": 3.626036291823236e-07, "loss": 1.566, "step": 2471 }, { "epoch": 0.37282256240102557, "grad_norm": 0.24340133877324788, "learning_rate": 3.625054172042284e-07, "loss": 1.5153, "step": 2472 }, { "epoch": 0.37297338058969914, "grad_norm": 0.2449532469636368, "learning_rate": 3.624071855786185e-07, "loss": 1.5486, "step": 2473 }, { "epoch": 0.37312419877837266, "grad_norm": 0.24393436396790796, "learning_rate": 3.623089343275499e-07, "loss": 1.5293, "step": 2474 }, { "epoch": 0.37327501696704624, "grad_norm": 0.2439837149359486, "learning_rate": 3.6221066347308276e-07, "loss": 1.5383, "step": 2475 }, { "epoch": 0.37342583515571975, "grad_norm": 0.24727496115441294, "learning_rate": 3.621123730372819e-07, "loss": 1.5189, "step": 2476 }, { "epoch": 0.37357665334439333, "grad_norm": 0.2502123111049387, "learning_rate": 3.6201406304221625e-07, "loss": 1.6118, "step": 2477 }, { "epoch": 0.3737274715330669, "grad_norm": 0.2442722196888879, "learning_rate": 3.619157335099593e-07, "loss": 1.6215, "step": 2478 }, { "epoch": 0.3738782897217404, "grad_norm": 0.2798737546309143, "learning_rate": 3.6181738446258897e-07, "loss": 1.5433, "step": 2479 }, { "epoch": 0.374029107910414, "grad_norm": 0.2702349908306434, "learning_rate": 3.617190159221874e-07, "loss": 1.6105, "step": 2480 }, { "epoch": 0.37417992609908757, "grad_norm": 0.25956006639582185, "learning_rate": 3.6162062791084124e-07, "loss": 1.4806, "step": 2481 }, { "epoch": 0.3743307442877611, "grad_norm": 0.25139763886130506, "learning_rate": 3.6152222045064146e-07, "loss": 1.5881, "step": 2482 }, { "epoch": 0.37448156247643466, "grad_norm": 0.277809605479455, "learning_rate": 3.614237935636833e-07, "loss": 1.6212, "step": 2483 }, { "epoch": 0.37463238066510823, "grad_norm": 0.273163332381341, "learning_rate": 3.613253472720666e-07, "loss": 1.5924, "step": 2484 }, { "epoch": 0.37478319885378175, "grad_norm": 0.23879197574640515, "learning_rate": 3.6122688159789533e-07, "loss": 1.6423, "step": 2485 }, { "epoch": 0.3749340170424553, "grad_norm": 0.2546038431049256, "learning_rate": 3.61128396563278e-07, "loss": 1.5171, "step": 2486 }, { "epoch": 0.3750848352311289, "grad_norm": 0.23953276895403006, "learning_rate": 3.6102989219032733e-07, "loss": 1.5643, "step": 2487 }, { "epoch": 0.3752356534198024, "grad_norm": 0.2906806335181811, "learning_rate": 3.609313685011603e-07, "loss": 1.5522, "step": 2488 }, { "epoch": 0.375386471608476, "grad_norm": 0.2750965974767434, "learning_rate": 3.608328255178984e-07, "loss": 1.5947, "step": 2489 }, { "epoch": 0.3755372897971495, "grad_norm": 0.24602389200087268, "learning_rate": 3.6073426326266754e-07, "loss": 1.5711, "step": 2490 }, { "epoch": 0.3756881079858231, "grad_norm": 0.25933028795860896, "learning_rate": 3.6063568175759774e-07, "loss": 1.5236, "step": 2491 }, { "epoch": 0.37583892617449666, "grad_norm": 0.2505269949410393, "learning_rate": 3.6053708102482333e-07, "loss": 1.5529, "step": 2492 }, { "epoch": 0.3759897443631702, "grad_norm": 0.2430829129132987, "learning_rate": 3.6043846108648317e-07, "loss": 1.4762, "step": 2493 }, { "epoch": 0.37614056255184375, "grad_norm": 0.2449505059322527, "learning_rate": 3.6033982196472023e-07, "loss": 1.5179, "step": 2494 }, { "epoch": 0.3762913807405173, "grad_norm": 0.2507260037548037, "learning_rate": 3.602411636816819e-07, "loss": 1.5196, "step": 2495 }, { "epoch": 0.37644219892919084, "grad_norm": 0.37584163526536496, "learning_rate": 3.6014248625951984e-07, "loss": 1.5285, "step": 2496 }, { "epoch": 0.3765930171178644, "grad_norm": 0.25765837448196127, "learning_rate": 3.6004378972038995e-07, "loss": 1.5765, "step": 2497 }, { "epoch": 0.376743835306538, "grad_norm": 0.2524347758209021, "learning_rate": 3.5994507408645254e-07, "loss": 1.5726, "step": 2498 }, { "epoch": 0.3768946534952115, "grad_norm": 0.2464378769806035, "learning_rate": 3.598463393798721e-07, "loss": 1.5018, "step": 2499 }, { "epoch": 0.3770454716838851, "grad_norm": 0.3205722010062426, "learning_rate": 3.597475856228175e-07, "loss": 1.5475, "step": 2500 }, { "epoch": 0.37719628987255865, "grad_norm": 0.2973979291512468, "learning_rate": 3.596488128374618e-07, "loss": 1.6129, "step": 2501 }, { "epoch": 0.3773471080612322, "grad_norm": 0.25193464512230107, "learning_rate": 3.5955002104598233e-07, "loss": 1.639, "step": 2502 }, { "epoch": 0.37749792624990575, "grad_norm": 0.2564177040565832, "learning_rate": 3.5945121027056085e-07, "loss": 1.5603, "step": 2503 }, { "epoch": 0.3776487444385793, "grad_norm": 0.25736901468499984, "learning_rate": 3.5935238053338306e-07, "loss": 1.5685, "step": 2504 }, { "epoch": 0.37779956262725284, "grad_norm": 0.3265279800860292, "learning_rate": 3.592535318566393e-07, "loss": 1.5626, "step": 2505 }, { "epoch": 0.3779503808159264, "grad_norm": 0.2624954697548865, "learning_rate": 3.5915466426252395e-07, "loss": 1.4911, "step": 2506 }, { "epoch": 0.37810119900459993, "grad_norm": 0.6135301735853039, "learning_rate": 3.590557777732355e-07, "loss": 1.5582, "step": 2507 }, { "epoch": 0.3782520171932735, "grad_norm": 0.4120016764911682, "learning_rate": 3.589568724109771e-07, "loss": 1.5042, "step": 2508 }, { "epoch": 0.3784028353819471, "grad_norm": 0.2421724367048175, "learning_rate": 3.588579481979556e-07, "loss": 1.5401, "step": 2509 }, { "epoch": 0.3785536535706206, "grad_norm": 0.2530420392567595, "learning_rate": 3.587590051563826e-07, "loss": 1.5677, "step": 2510 }, { "epoch": 0.37870447175929417, "grad_norm": 0.2468715043787992, "learning_rate": 3.586600433084737e-07, "loss": 1.5502, "step": 2511 }, { "epoch": 0.37885528994796774, "grad_norm": 0.2524770444053603, "learning_rate": 3.585610626764485e-07, "loss": 1.6097, "step": 2512 }, { "epoch": 0.37900610813664126, "grad_norm": 0.2620919997505972, "learning_rate": 3.584620632825312e-07, "loss": 1.6128, "step": 2513 }, { "epoch": 0.37915692632531484, "grad_norm": 0.2658547516077316, "learning_rate": 3.5836304514894997e-07, "loss": 1.5252, "step": 2514 }, { "epoch": 0.3793077445139884, "grad_norm": 0.24404822219882544, "learning_rate": 3.5826400829793734e-07, "loss": 1.6027, "step": 2515 }, { "epoch": 0.37945856270266193, "grad_norm": 0.2468154377798453, "learning_rate": 3.5816495275173e-07, "loss": 1.5466, "step": 2516 }, { "epoch": 0.3796093808913355, "grad_norm": 0.24188988756601712, "learning_rate": 3.580658785325686e-07, "loss": 1.5999, "step": 2517 }, { "epoch": 0.3797601990800091, "grad_norm": 0.23888994835645322, "learning_rate": 3.5796678566269847e-07, "loss": 1.5828, "step": 2518 }, { "epoch": 0.3799110172686826, "grad_norm": 0.24089657083130359, "learning_rate": 3.578676741643686e-07, "loss": 1.5927, "step": 2519 }, { "epoch": 0.38006183545735617, "grad_norm": 0.2424084665110092, "learning_rate": 3.5776854405983247e-07, "loss": 1.5275, "step": 2520 }, { "epoch": 0.3802126536460297, "grad_norm": 0.2433942345662491, "learning_rate": 3.5766939537134785e-07, "loss": 1.529, "step": 2521 }, { "epoch": 0.38036347183470326, "grad_norm": 0.25291436213073637, "learning_rate": 3.575702281211762e-07, "loss": 1.5307, "step": 2522 }, { "epoch": 0.38051429002337683, "grad_norm": 0.2539334006876831, "learning_rate": 3.5747104233158376e-07, "loss": 1.5848, "step": 2523 }, { "epoch": 0.38066510821205035, "grad_norm": 0.27029092109111064, "learning_rate": 3.573718380248404e-07, "loss": 1.5669, "step": 2524 }, { "epoch": 0.3808159264007239, "grad_norm": 0.25850559989477917, "learning_rate": 3.5727261522322053e-07, "loss": 1.5996, "step": 2525 }, { "epoch": 0.3809667445893975, "grad_norm": 0.2671086272117505, "learning_rate": 3.571733739490025e-07, "loss": 1.5646, "step": 2526 }, { "epoch": 0.381117562778071, "grad_norm": 0.2408624437543867, "learning_rate": 3.5707411422446887e-07, "loss": 1.5752, "step": 2527 }, { "epoch": 0.3812683809667446, "grad_norm": 0.25504735550342, "learning_rate": 3.5697483607190634e-07, "loss": 1.5252, "step": 2528 }, { "epoch": 0.38141919915541816, "grad_norm": 0.3044244213239282, "learning_rate": 3.5687553951360565e-07, "loss": 1.5546, "step": 2529 }, { "epoch": 0.3815700173440917, "grad_norm": 0.9044492121709756, "learning_rate": 3.5677622457186197e-07, "loss": 1.5406, "step": 2530 }, { "epoch": 0.38172083553276526, "grad_norm": 0.25451969623397536, "learning_rate": 3.566768912689743e-07, "loss": 1.5538, "step": 2531 }, { "epoch": 0.38187165372143883, "grad_norm": 0.24781650874974845, "learning_rate": 3.5657753962724574e-07, "loss": 1.6394, "step": 2532 }, { "epoch": 0.38202247191011235, "grad_norm": 0.2556923716105965, "learning_rate": 3.5647816966898376e-07, "loss": 1.5072, "step": 2533 }, { "epoch": 0.3821732900987859, "grad_norm": 0.25536826723602823, "learning_rate": 3.5637878141649986e-07, "loss": 1.587, "step": 2534 }, { "epoch": 0.38232410828745944, "grad_norm": 0.35107493199094714, "learning_rate": 3.5627937489210946e-07, "loss": 1.5287, "step": 2535 }, { "epoch": 0.382474926476133, "grad_norm": 0.25482272417125473, "learning_rate": 3.5617995011813217e-07, "loss": 1.5592, "step": 2536 }, { "epoch": 0.3826257446648066, "grad_norm": 0.23779771356649057, "learning_rate": 3.5608050711689206e-07, "loss": 1.5347, "step": 2537 }, { "epoch": 0.3827765628534801, "grad_norm": 0.2536661448143563, "learning_rate": 3.5598104591071666e-07, "loss": 1.6121, "step": 2538 }, { "epoch": 0.3829273810421537, "grad_norm": 0.2743867604526638, "learning_rate": 3.55881566521938e-07, "loss": 1.6331, "step": 2539 }, { "epoch": 0.38307819923082725, "grad_norm": 0.28125217492724036, "learning_rate": 3.557820689728922e-07, "loss": 1.5647, "step": 2540 }, { "epoch": 0.3832290174195008, "grad_norm": 0.29985129969621616, "learning_rate": 3.5568255328591927e-07, "loss": 1.5693, "step": 2541 }, { "epoch": 0.38337983560817435, "grad_norm": 0.2575088246208123, "learning_rate": 3.555830194833634e-07, "loss": 1.5679, "step": 2542 }, { "epoch": 0.3835306537968479, "grad_norm": 0.24227748590037862, "learning_rate": 3.5548346758757276e-07, "loss": 1.5884, "step": 2543 }, { "epoch": 0.38368147198552144, "grad_norm": 0.25730342177076415, "learning_rate": 3.553838976208998e-07, "loss": 1.5064, "step": 2544 }, { "epoch": 0.383832290174195, "grad_norm": 0.2518233947961992, "learning_rate": 3.552843096057007e-07, "loss": 1.5662, "step": 2545 }, { "epoch": 0.3839831083628686, "grad_norm": 0.24662947641756058, "learning_rate": 3.5518470356433607e-07, "loss": 1.533, "step": 2546 }, { "epoch": 0.3841339265515421, "grad_norm": 0.2707362685244374, "learning_rate": 3.550850795191701e-07, "loss": 1.5642, "step": 2547 }, { "epoch": 0.3842847447402157, "grad_norm": 0.24828838160982408, "learning_rate": 3.549854374925715e-07, "loss": 1.6159, "step": 2548 }, { "epoch": 0.38443556292888925, "grad_norm": 0.32263784919365934, "learning_rate": 3.548857775069128e-07, "loss": 1.5652, "step": 2549 }, { "epoch": 0.38458638111756277, "grad_norm": 0.24798883657905776, "learning_rate": 3.547860995845705e-07, "loss": 1.5941, "step": 2550 }, { "epoch": 0.38473719930623634, "grad_norm": 0.24788484749743359, "learning_rate": 3.546864037479252e-07, "loss": 1.575, "step": 2551 }, { "epoch": 0.38488801749490986, "grad_norm": 0.2679950197180121, "learning_rate": 3.5458669001936154e-07, "loss": 1.5955, "step": 2552 }, { "epoch": 0.38503883568358344, "grad_norm": 0.27920578096628507, "learning_rate": 3.5448695842126815e-07, "loss": 1.5511, "step": 2553 }, { "epoch": 0.385189653872257, "grad_norm": 0.26415238272720953, "learning_rate": 3.543872089760376e-07, "loss": 1.5579, "step": 2554 }, { "epoch": 0.38534047206093053, "grad_norm": 0.27149305935859813, "learning_rate": 3.542874417060667e-07, "loss": 1.4529, "step": 2555 }, { "epoch": 0.3854912902496041, "grad_norm": 0.250792164223544, "learning_rate": 3.5418765663375613e-07, "loss": 1.4984, "step": 2556 }, { "epoch": 0.3856421084382777, "grad_norm": 0.2616412145153655, "learning_rate": 3.540878537815103e-07, "loss": 1.5504, "step": 2557 }, { "epoch": 0.3857929266269512, "grad_norm": 0.2538882269923289, "learning_rate": 3.53988033171738e-07, "loss": 1.6398, "step": 2558 }, { "epoch": 0.38594374481562477, "grad_norm": 0.2474357832914362, "learning_rate": 3.53888194826852e-07, "loss": 1.6093, "step": 2559 }, { "epoch": 0.38609456300429834, "grad_norm": 0.2960300497422995, "learning_rate": 3.5378833876926865e-07, "loss": 1.5457, "step": 2560 }, { "epoch": 0.38624538119297186, "grad_norm": 0.25897439161674723, "learning_rate": 3.536884650214087e-07, "loss": 1.5842, "step": 2561 }, { "epoch": 0.38639619938164543, "grad_norm": 0.25717842800991464, "learning_rate": 3.535885736056968e-07, "loss": 1.6571, "step": 2562 }, { "epoch": 0.386547017570319, "grad_norm": 0.24206278661173006, "learning_rate": 3.534886645445613e-07, "loss": 1.5238, "step": 2563 }, { "epoch": 0.3866978357589925, "grad_norm": 0.25262257528774823, "learning_rate": 3.5338873786043464e-07, "loss": 1.5816, "step": 2564 }, { "epoch": 0.3868486539476661, "grad_norm": 0.26635351438110183, "learning_rate": 3.5328879357575357e-07, "loss": 1.5669, "step": 2565 }, { "epoch": 0.3869994721363396, "grad_norm": 0.23778541687692611, "learning_rate": 3.5318883171295827e-07, "loss": 1.6074, "step": 2566 }, { "epoch": 0.3871502903250132, "grad_norm": 0.3082502795672136, "learning_rate": 3.5308885229449314e-07, "loss": 1.5698, "step": 2567 }, { "epoch": 0.38730110851368676, "grad_norm": 0.2716911724218832, "learning_rate": 3.529888553428065e-07, "loss": 1.6068, "step": 2568 }, { "epoch": 0.3874519267023603, "grad_norm": 0.24057573264881182, "learning_rate": 3.5288884088035047e-07, "loss": 1.5755, "step": 2569 }, { "epoch": 0.38760274489103386, "grad_norm": 0.3001263217639516, "learning_rate": 3.527888089295813e-07, "loss": 1.5804, "step": 2570 }, { "epoch": 0.38775356307970743, "grad_norm": 0.2849484147258995, "learning_rate": 3.526887595129592e-07, "loss": 1.6406, "step": 2571 }, { "epoch": 0.38790438126838095, "grad_norm": 0.45158680187400113, "learning_rate": 3.525886926529479e-07, "loss": 1.5028, "step": 2572 }, { "epoch": 0.3880551994570545, "grad_norm": 0.23841489018388096, "learning_rate": 3.524886083720155e-07, "loss": 1.5066, "step": 2573 }, { "epoch": 0.3882060176457281, "grad_norm": 0.2505767757173705, "learning_rate": 3.523885066926338e-07, "loss": 1.5941, "step": 2574 }, { "epoch": 0.3883568358344016, "grad_norm": 0.24869451937061052, "learning_rate": 3.522883876372786e-07, "loss": 1.5655, "step": 2575 }, { "epoch": 0.3885076540230752, "grad_norm": 0.2506663800167466, "learning_rate": 3.521882512284294e-07, "loss": 1.5076, "step": 2576 }, { "epoch": 0.38865847221174876, "grad_norm": 0.27321653566711673, "learning_rate": 3.5208809748856993e-07, "loss": 1.5595, "step": 2577 }, { "epoch": 0.3888092904004223, "grad_norm": 0.27974757704273684, "learning_rate": 3.519879264401874e-07, "loss": 1.5089, "step": 2578 }, { "epoch": 0.38896010858909585, "grad_norm": 0.46818369455980297, "learning_rate": 3.518877381057733e-07, "loss": 1.6114, "step": 2579 }, { "epoch": 0.3891109267777694, "grad_norm": 0.25803071382785336, "learning_rate": 3.517875325078229e-07, "loss": 1.5559, "step": 2580 }, { "epoch": 0.38926174496644295, "grad_norm": 0.2583288630929102, "learning_rate": 3.51687309668835e-07, "loss": 1.5375, "step": 2581 }, { "epoch": 0.3894125631551165, "grad_norm": 0.2640446446214051, "learning_rate": 3.515870696113127e-07, "loss": 1.6234, "step": 2582 }, { "epoch": 0.38956338134379004, "grad_norm": 0.27696063835751633, "learning_rate": 3.5148681235776285e-07, "loss": 1.5855, "step": 2583 }, { "epoch": 0.3897141995324636, "grad_norm": 0.2573817113265274, "learning_rate": 3.513865379306961e-07, "loss": 1.5196, "step": 2584 }, { "epoch": 0.3898650177211372, "grad_norm": 0.2423886129051474, "learning_rate": 3.512862463526269e-07, "loss": 1.5065, "step": 2585 }, { "epoch": 0.3900158359098107, "grad_norm": 4.06785605153624, "learning_rate": 3.511859376460738e-07, "loss": 1.5648, "step": 2586 }, { "epoch": 0.3901666540984843, "grad_norm": 0.2635253748519167, "learning_rate": 3.5108561183355884e-07, "loss": 1.6157, "step": 2587 }, { "epoch": 0.39031747228715785, "grad_norm": 0.2505502056943804, "learning_rate": 3.5098526893760823e-07, "loss": 1.6346, "step": 2588 }, { "epoch": 0.39046829047583137, "grad_norm": 0.2530678378552461, "learning_rate": 3.508849089807517e-07, "loss": 1.5542, "step": 2589 }, { "epoch": 0.39061910866450494, "grad_norm": 0.26292279162317655, "learning_rate": 3.507845319855232e-07, "loss": 1.56, "step": 2590 }, { "epoch": 0.3907699268531785, "grad_norm": 0.29078243973984197, "learning_rate": 3.506841379744601e-07, "loss": 1.5713, "step": 2591 }, { "epoch": 0.39092074504185204, "grad_norm": 0.32868406059323346, "learning_rate": 3.5058372697010384e-07, "loss": 1.6039, "step": 2592 }, { "epoch": 0.3910715632305256, "grad_norm": 0.2525036536065519, "learning_rate": 3.5048329899499975e-07, "loss": 1.5555, "step": 2593 }, { "epoch": 0.39122238141919913, "grad_norm": 0.2630087537592004, "learning_rate": 3.5038285407169665e-07, "loss": 1.5709, "step": 2594 }, { "epoch": 0.3913731996078727, "grad_norm": 0.28143126812196245, "learning_rate": 3.502823922227474e-07, "loss": 1.5727, "step": 2595 }, { "epoch": 0.3915240177965463, "grad_norm": 0.24490217881819218, "learning_rate": 3.501819134707087e-07, "loss": 1.6159, "step": 2596 }, { "epoch": 0.3916748359852198, "grad_norm": 0.7937926014143071, "learning_rate": 3.5008141783814077e-07, "loss": 1.5408, "step": 2597 }, { "epoch": 0.39182565417389337, "grad_norm": 0.24505278919691315, "learning_rate": 3.4998090534760804e-07, "loss": 1.6168, "step": 2598 }, { "epoch": 0.39197647236256694, "grad_norm": 0.2536043806196611, "learning_rate": 3.498803760216783e-07, "loss": 1.6077, "step": 2599 }, { "epoch": 0.39212729055124046, "grad_norm": 0.40477068068604105, "learning_rate": 3.497798298829234e-07, "loss": 1.5067, "step": 2600 }, { "epoch": 0.39227810873991403, "grad_norm": 0.24658759789845558, "learning_rate": 3.4967926695391885e-07, "loss": 1.5888, "step": 2601 }, { "epoch": 0.3924289269285876, "grad_norm": 0.250593063485952, "learning_rate": 3.49578687257244e-07, "loss": 1.6084, "step": 2602 }, { "epoch": 0.3925797451172611, "grad_norm": 0.24253691878675834, "learning_rate": 3.494780908154819e-07, "loss": 1.6294, "step": 2603 }, { "epoch": 0.3927305633059347, "grad_norm": 0.24103654342297792, "learning_rate": 3.4937747765121934e-07, "loss": 1.5471, "step": 2604 }, { "epoch": 0.3928813814946083, "grad_norm": 0.2728927050656147, "learning_rate": 3.4927684778704693e-07, "loss": 1.6348, "step": 2605 }, { "epoch": 0.3930321996832818, "grad_norm": 0.24506085358646462, "learning_rate": 3.491762012455591e-07, "loss": 1.4937, "step": 2606 }, { "epoch": 0.39318301787195536, "grad_norm": 0.24014219757507166, "learning_rate": 3.4907553804935376e-07, "loss": 1.538, "step": 2607 }, { "epoch": 0.39333383606062894, "grad_norm": 0.24429115753577182, "learning_rate": 3.489748582210328e-07, "loss": 1.5113, "step": 2608 }, { "epoch": 0.39348465424930246, "grad_norm": 0.28537403136894296, "learning_rate": 3.4887416178320185e-07, "loss": 1.568, "step": 2609 }, { "epoch": 0.39363547243797603, "grad_norm": 0.23825991470195018, "learning_rate": 3.4877344875847e-07, "loss": 1.5273, "step": 2610 }, { "epoch": 0.39378629062664955, "grad_norm": 0.2576386407608002, "learning_rate": 3.486727191694505e-07, "loss": 1.5482, "step": 2611 }, { "epoch": 0.3939371088153231, "grad_norm": 0.23839533111212974, "learning_rate": 3.485719730387599e-07, "loss": 1.5535, "step": 2612 }, { "epoch": 0.3940879270039967, "grad_norm": 0.2576875948496164, "learning_rate": 3.484712103890187e-07, "loss": 1.5805, "step": 2613 }, { "epoch": 0.3942387451926702, "grad_norm": 0.2578129139016205, "learning_rate": 3.483704312428511e-07, "loss": 1.5268, "step": 2614 }, { "epoch": 0.3943895633813438, "grad_norm": 0.25788059541015274, "learning_rate": 3.4826963562288493e-07, "loss": 1.5542, "step": 2615 }, { "epoch": 0.39454038157001736, "grad_norm": 0.2755588250827172, "learning_rate": 3.4816882355175164e-07, "loss": 1.5969, "step": 2616 }, { "epoch": 0.3946911997586909, "grad_norm": 0.25099898704009094, "learning_rate": 3.4806799505208654e-07, "loss": 1.5318, "step": 2617 }, { "epoch": 0.39484201794736445, "grad_norm": 0.24912268915215166, "learning_rate": 3.479671501465286e-07, "loss": 1.6251, "step": 2618 }, { "epoch": 0.39499283613603803, "grad_norm": 0.33127514454847884, "learning_rate": 3.478662888577205e-07, "loss": 1.5788, "step": 2619 }, { "epoch": 0.39514365432471155, "grad_norm": 0.25157259363870266, "learning_rate": 3.4776541120830824e-07, "loss": 1.5696, "step": 2620 }, { "epoch": 0.3952944725133851, "grad_norm": 0.2738464876476835, "learning_rate": 3.476645172209422e-07, "loss": 1.5516, "step": 2621 }, { "epoch": 0.3954452907020587, "grad_norm": 0.23811298747874665, "learning_rate": 3.475636069182758e-07, "loss": 1.6193, "step": 2622 }, { "epoch": 0.3955961088907322, "grad_norm": 0.25875497952173504, "learning_rate": 3.474626803229662e-07, "loss": 1.5521, "step": 2623 }, { "epoch": 0.3957469270794058, "grad_norm": 0.28847182070562355, "learning_rate": 3.473617374576747e-07, "loss": 1.5747, "step": 2624 }, { "epoch": 0.3958977452680793, "grad_norm": 0.23824845500467842, "learning_rate": 3.472607783450657e-07, "loss": 1.5624, "step": 2625 }, { "epoch": 0.3960485634567529, "grad_norm": 0.2542487631269686, "learning_rate": 3.471598030078074e-07, "loss": 1.5358, "step": 2626 }, { "epoch": 0.39619938164542645, "grad_norm": 0.2554835540513806, "learning_rate": 3.4705881146857186e-07, "loss": 1.6046, "step": 2627 }, { "epoch": 0.39635019983409997, "grad_norm": 0.23936580356092402, "learning_rate": 3.469578037500346e-07, "loss": 1.5496, "step": 2628 }, { "epoch": 0.39650101802277354, "grad_norm": 0.24235177378643663, "learning_rate": 3.4685677987487464e-07, "loss": 1.6188, "step": 2629 }, { "epoch": 0.3966518362114471, "grad_norm": 0.24893632276790695, "learning_rate": 3.4675573986577505e-07, "loss": 1.5966, "step": 2630 }, { "epoch": 0.39680265440012064, "grad_norm": 0.2629841664263705, "learning_rate": 3.4665468374542197e-07, "loss": 1.5352, "step": 2631 }, { "epoch": 0.3969534725887942, "grad_norm": 0.25793233524671794, "learning_rate": 3.4655361153650565e-07, "loss": 1.5239, "step": 2632 }, { "epoch": 0.3971042907774678, "grad_norm": 0.24429982363546654, "learning_rate": 3.4645252326171966e-07, "loss": 1.6371, "step": 2633 }, { "epoch": 0.3972551089661413, "grad_norm": 0.2573467243824734, "learning_rate": 3.463514189437613e-07, "loss": 1.5225, "step": 2634 }, { "epoch": 0.3974059271548149, "grad_norm": 0.679063985170996, "learning_rate": 3.4625029860533134e-07, "loss": 1.5862, "step": 2635 }, { "epoch": 0.39755674534348845, "grad_norm": 0.25171052854332737, "learning_rate": 3.4614916226913433e-07, "loss": 1.5616, "step": 2636 }, { "epoch": 0.39770756353216197, "grad_norm": 0.24633977010308533, "learning_rate": 3.4604800995787826e-07, "loss": 1.5499, "step": 2637 }, { "epoch": 0.39785838172083554, "grad_norm": 0.2951019560113865, "learning_rate": 3.4594684169427487e-07, "loss": 1.6254, "step": 2638 }, { "epoch": 0.39800919990950906, "grad_norm": 0.23811915492407157, "learning_rate": 3.4584565750103927e-07, "loss": 1.5325, "step": 2639 }, { "epoch": 0.39816001809818263, "grad_norm": 0.24558452212160967, "learning_rate": 3.4574445740089036e-07, "loss": 1.5681, "step": 2640 }, { "epoch": 0.3983108362868562, "grad_norm": 0.25258021327735125, "learning_rate": 3.456432414165504e-07, "loss": 1.5854, "step": 2641 }, { "epoch": 0.3984616544755297, "grad_norm": 0.24705997248524708, "learning_rate": 3.4554200957074535e-07, "loss": 1.4883, "step": 2642 }, { "epoch": 0.3986124726642033, "grad_norm": 0.9393781438635932, "learning_rate": 3.4544076188620484e-07, "loss": 1.5811, "step": 2643 }, { "epoch": 0.3987632908528769, "grad_norm": 0.24081186996868634, "learning_rate": 3.453394983856618e-07, "loss": 1.5615, "step": 2644 }, { "epoch": 0.3989141090415504, "grad_norm": 0.3548482655547463, "learning_rate": 3.4523821909185287e-07, "loss": 1.5285, "step": 2645 }, { "epoch": 0.39906492723022396, "grad_norm": 0.25244220556229424, "learning_rate": 3.4513692402751813e-07, "loss": 1.5206, "step": 2646 }, { "epoch": 0.39921574541889754, "grad_norm": 0.3010429865855147, "learning_rate": 3.4503561321540137e-07, "loss": 1.5322, "step": 2647 }, { "epoch": 0.39936656360757106, "grad_norm": 0.24745554269634792, "learning_rate": 3.4493428667824976e-07, "loss": 1.5899, "step": 2648 }, { "epoch": 0.39951738179624463, "grad_norm": 0.24692220967298187, "learning_rate": 3.4483294443881414e-07, "loss": 1.5691, "step": 2649 }, { "epoch": 0.3996681999849182, "grad_norm": 0.24957805000851577, "learning_rate": 3.4473158651984866e-07, "loss": 1.5357, "step": 2650 }, { "epoch": 0.3998190181735917, "grad_norm": 0.26422320009653594, "learning_rate": 3.4463021294411125e-07, "loss": 1.5988, "step": 2651 }, { "epoch": 0.3999698363622653, "grad_norm": 0.2620999742863255, "learning_rate": 3.4452882373436316e-07, "loss": 1.5631, "step": 2652 }, { "epoch": 0.40012065455093887, "grad_norm": 0.261503235413909, "learning_rate": 3.4442741891336926e-07, "loss": 1.6121, "step": 2653 }, { "epoch": 0.4002714727396124, "grad_norm": 0.2491475970261047, "learning_rate": 3.443259985038978e-07, "loss": 1.5968, "step": 2654 }, { "epoch": 0.40042229092828596, "grad_norm": 0.25473496113658356, "learning_rate": 3.4422456252872074e-07, "loss": 1.6159, "step": 2655 }, { "epoch": 0.4005731091169595, "grad_norm": 0.2796363857378898, "learning_rate": 3.441231110106133e-07, "loss": 1.5169, "step": 2656 }, { "epoch": 0.40072392730563305, "grad_norm": 0.2580527347727552, "learning_rate": 3.440216439723544e-07, "loss": 1.5556, "step": 2657 }, { "epoch": 0.40087474549430663, "grad_norm": 0.2961651530051946, "learning_rate": 3.439201614367263e-07, "loss": 1.5758, "step": 2658 }, { "epoch": 0.40102556368298015, "grad_norm": 0.24727990480200315, "learning_rate": 3.4381866342651474e-07, "loss": 1.559, "step": 2659 }, { "epoch": 0.4011763818716537, "grad_norm": 1.8441125513901913, "learning_rate": 3.43717149964509e-07, "loss": 1.5534, "step": 2660 }, { "epoch": 0.4013272000603273, "grad_norm": 0.4130681269520023, "learning_rate": 3.436156210735019e-07, "loss": 1.5958, "step": 2661 }, { "epoch": 0.4014780182490008, "grad_norm": 0.24928135649559705, "learning_rate": 3.4351407677628955e-07, "loss": 1.6431, "step": 2662 }, { "epoch": 0.4016288364376744, "grad_norm": 0.2557444447850399, "learning_rate": 3.434125170956716e-07, "loss": 1.555, "step": 2663 }, { "epoch": 0.40177965462634796, "grad_norm": 0.2569607174542637, "learning_rate": 3.433109420544511e-07, "loss": 1.5394, "step": 2664 }, { "epoch": 0.4019304728150215, "grad_norm": 0.2550208704852457, "learning_rate": 3.4320935167543476e-07, "loss": 1.5759, "step": 2665 }, { "epoch": 0.40208129100369505, "grad_norm": 0.2462457579459392, "learning_rate": 3.4310774598143244e-07, "loss": 1.5612, "step": 2666 }, { "epoch": 0.4022321091923686, "grad_norm": 0.2539463489198951, "learning_rate": 3.430061249952577e-07, "loss": 1.5093, "step": 2667 }, { "epoch": 0.40238292738104214, "grad_norm": 0.25143123583424176, "learning_rate": 3.429044887397273e-07, "loss": 1.5635, "step": 2668 }, { "epoch": 0.4025337455697157, "grad_norm": 0.2546528852562432, "learning_rate": 3.4280283723766156e-07, "loss": 1.6254, "step": 2669 }, { "epoch": 0.40268456375838924, "grad_norm": 0.2577657012079826, "learning_rate": 3.4270117051188427e-07, "loss": 1.4889, "step": 2670 }, { "epoch": 0.4028353819470628, "grad_norm": 0.2614460935676659, "learning_rate": 3.425994885852225e-07, "loss": 1.6185, "step": 2671 }, { "epoch": 0.4029862001357364, "grad_norm": 0.3979414514693881, "learning_rate": 3.424977914805068e-07, "loss": 1.558, "step": 2672 }, { "epoch": 0.4031370183244099, "grad_norm": 0.24555757646921053, "learning_rate": 3.4239607922057126e-07, "loss": 1.5621, "step": 2673 }, { "epoch": 0.4032878365130835, "grad_norm": 0.26714140749175647, "learning_rate": 3.42294351828253e-07, "loss": 1.507, "step": 2674 }, { "epoch": 0.40343865470175705, "grad_norm": 0.25886542993727496, "learning_rate": 3.42192609326393e-07, "loss": 1.6575, "step": 2675 }, { "epoch": 0.40358947289043057, "grad_norm": 0.2431401880097489, "learning_rate": 3.420908517378353e-07, "loss": 1.588, "step": 2676 }, { "epoch": 0.40374029107910414, "grad_norm": 0.23557654628041694, "learning_rate": 3.4198907908542756e-07, "loss": 1.5192, "step": 2677 }, { "epoch": 0.4038911092677777, "grad_norm": 0.28842546678351816, "learning_rate": 3.418872913920206e-07, "loss": 1.6055, "step": 2678 }, { "epoch": 0.40404192745645123, "grad_norm": 0.24739791616041887, "learning_rate": 3.4178548868046877e-07, "loss": 1.5644, "step": 2679 }, { "epoch": 0.4041927456451248, "grad_norm": 0.24882884949485204, "learning_rate": 3.4168367097362964e-07, "loss": 1.5794, "step": 2680 }, { "epoch": 0.4043435638337984, "grad_norm": 0.2861327829733997, "learning_rate": 3.415818382943644e-07, "loss": 1.5698, "step": 2681 }, { "epoch": 0.4044943820224719, "grad_norm": 0.3916787715235066, "learning_rate": 3.414799906655373e-07, "loss": 1.5366, "step": 2682 }, { "epoch": 0.4046452002111455, "grad_norm": 0.5960627175521959, "learning_rate": 3.4137812811001633e-07, "loss": 1.535, "step": 2683 }, { "epoch": 0.404796018399819, "grad_norm": 0.2562472635187039, "learning_rate": 3.4127625065067234e-07, "loss": 1.5125, "step": 2684 }, { "epoch": 0.40494683658849256, "grad_norm": 0.2513351595066356, "learning_rate": 3.4117435831037993e-07, "loss": 1.5257, "step": 2685 }, { "epoch": 0.40509765477716614, "grad_norm": 0.24845408181487894, "learning_rate": 3.4107245111201695e-07, "loss": 1.6515, "step": 2686 }, { "epoch": 0.40524847296583966, "grad_norm": 0.25030534842547747, "learning_rate": 3.4097052907846434e-07, "loss": 1.5621, "step": 2687 }, { "epoch": 0.40539929115451323, "grad_norm": 0.2512963303725947, "learning_rate": 3.408685922326067e-07, "loss": 1.5798, "step": 2688 }, { "epoch": 0.4055501093431868, "grad_norm": 0.2542188616465837, "learning_rate": 3.4076664059733186e-07, "loss": 1.5431, "step": 2689 }, { "epoch": 0.4057009275318603, "grad_norm": 0.2401795250475313, "learning_rate": 3.406646741955307e-07, "loss": 1.5495, "step": 2690 }, { "epoch": 0.4058517457205339, "grad_norm": 0.2544121534001211, "learning_rate": 3.4056269305009803e-07, "loss": 1.5652, "step": 2691 }, { "epoch": 0.40600256390920747, "grad_norm": 0.26262048054862935, "learning_rate": 3.4046069718393125e-07, "loss": 1.5826, "step": 2692 }, { "epoch": 0.406153382097881, "grad_norm": 0.27831235971342577, "learning_rate": 3.403586866199316e-07, "loss": 1.555, "step": 2693 }, { "epoch": 0.40630420028655456, "grad_norm": 0.2453047521498626, "learning_rate": 3.4025666138100327e-07, "loss": 1.547, "step": 2694 }, { "epoch": 0.40645501847522814, "grad_norm": 0.2547803857460511, "learning_rate": 3.4015462149005405e-07, "loss": 1.5881, "step": 2695 }, { "epoch": 0.40660583666390165, "grad_norm": 0.25239583782409786, "learning_rate": 3.400525669699948e-07, "loss": 1.5354, "step": 2696 }, { "epoch": 0.40675665485257523, "grad_norm": 0.2463906252270824, "learning_rate": 3.3995049784373966e-07, "loss": 1.5675, "step": 2697 }, { "epoch": 0.4069074730412488, "grad_norm": 0.2668245641763361, "learning_rate": 3.3984841413420616e-07, "loss": 1.5503, "step": 2698 }, { "epoch": 0.4070582912299223, "grad_norm": 0.40821576169654655, "learning_rate": 3.397463158643152e-07, "loss": 1.52, "step": 2699 }, { "epoch": 0.4072091094185959, "grad_norm": 0.2733214381159171, "learning_rate": 3.3964420305699064e-07, "loss": 1.6039, "step": 2700 }, { "epoch": 0.4073599276072694, "grad_norm": 0.24540087175714917, "learning_rate": 3.395420757351599e-07, "loss": 1.5331, "step": 2701 }, { "epoch": 0.407510745795943, "grad_norm": 0.303911791630233, "learning_rate": 3.394399339217534e-07, "loss": 1.5215, "step": 2702 }, { "epoch": 0.40766156398461656, "grad_norm": 0.27149720289102464, "learning_rate": 3.3933777763970507e-07, "loss": 1.7047, "step": 2703 }, { "epoch": 0.4078123821732901, "grad_norm": 0.2532605667347167, "learning_rate": 3.3923560691195194e-07, "loss": 1.5458, "step": 2704 }, { "epoch": 0.40796320036196365, "grad_norm": 1.1476192380139676, "learning_rate": 3.3913342176143435e-07, "loss": 1.5811, "step": 2705 }, { "epoch": 0.4081140185506372, "grad_norm": 0.2650873428391726, "learning_rate": 3.390312222110958e-07, "loss": 1.5058, "step": 2706 }, { "epoch": 0.40826483673931074, "grad_norm": 0.2507828693312355, "learning_rate": 3.3892900828388297e-07, "loss": 1.625, "step": 2707 }, { "epoch": 0.4084156549279843, "grad_norm": 0.2495042684876567, "learning_rate": 3.3882678000274607e-07, "loss": 1.6235, "step": 2708 }, { "epoch": 0.4085664731166579, "grad_norm": 0.26317188043167067, "learning_rate": 3.3872453739063815e-07, "loss": 1.6039, "step": 2709 }, { "epoch": 0.4087172913053314, "grad_norm": 0.2596342542294235, "learning_rate": 3.386222804705157e-07, "loss": 1.6036, "step": 2710 }, { "epoch": 0.408868109494005, "grad_norm": 0.25026437509997485, "learning_rate": 3.385200092653385e-07, "loss": 1.5756, "step": 2711 }, { "epoch": 0.40901892768267856, "grad_norm": 0.26226689742056464, "learning_rate": 3.384177237980693e-07, "loss": 1.5487, "step": 2712 }, { "epoch": 0.4091697458713521, "grad_norm": 0.25161489452840613, "learning_rate": 3.3831542409167406e-07, "loss": 1.5503, "step": 2713 }, { "epoch": 0.40932056406002565, "grad_norm": 0.2434400936495514, "learning_rate": 3.382131101691223e-07, "loss": 1.5178, "step": 2714 }, { "epoch": 0.40947138224869917, "grad_norm": 0.3428898089175938, "learning_rate": 3.381107820533863e-07, "loss": 1.5736, "step": 2715 }, { "epoch": 0.40962220043737274, "grad_norm": 0.24419776034880394, "learning_rate": 3.3800843976744165e-07, "loss": 1.5423, "step": 2716 }, { "epoch": 0.4097730186260463, "grad_norm": 0.2407398273910957, "learning_rate": 3.3790608333426733e-07, "loss": 1.5681, "step": 2717 }, { "epoch": 0.40992383681471983, "grad_norm": 0.23939499826381833, "learning_rate": 3.378037127768453e-07, "loss": 1.5486, "step": 2718 }, { "epoch": 0.4100746550033934, "grad_norm": 0.24805477172017687, "learning_rate": 3.377013281181607e-07, "loss": 1.5111, "step": 2719 }, { "epoch": 0.410225473192067, "grad_norm": 0.25290876011385005, "learning_rate": 3.375989293812018e-07, "loss": 1.5495, "step": 2720 }, { "epoch": 0.4103762913807405, "grad_norm": 0.24257394562603146, "learning_rate": 3.3749651658896023e-07, "loss": 1.6402, "step": 2721 }, { "epoch": 0.4105271095694141, "grad_norm": 0.2610073953080338, "learning_rate": 3.3739408976443065e-07, "loss": 1.6429, "step": 2722 }, { "epoch": 0.41067792775808765, "grad_norm": 0.24386260729457818, "learning_rate": 3.3729164893061066e-07, "loss": 1.6125, "step": 2723 }, { "epoch": 0.41082874594676116, "grad_norm": 0.3210020860293678, "learning_rate": 3.3718919411050144e-07, "loss": 1.5688, "step": 2724 }, { "epoch": 0.41097956413543474, "grad_norm": 0.2492960591940099, "learning_rate": 3.37086725327107e-07, "loss": 1.6307, "step": 2725 }, { "epoch": 0.4111303823241083, "grad_norm": 0.27238769894642695, "learning_rate": 3.369842426034345e-07, "loss": 1.5561, "step": 2726 }, { "epoch": 0.41128120051278183, "grad_norm": 0.2504464887219244, "learning_rate": 3.3688174596249446e-07, "loss": 1.5891, "step": 2727 }, { "epoch": 0.4114320187014554, "grad_norm": 0.29778856341510407, "learning_rate": 3.367792354273002e-07, "loss": 1.584, "step": 2728 }, { "epoch": 0.4115828368901289, "grad_norm": 0.27437651029692267, "learning_rate": 3.3667671102086835e-07, "loss": 1.5621, "step": 2729 }, { "epoch": 0.4117336550788025, "grad_norm": 0.32706072566989813, "learning_rate": 3.365741727662187e-07, "loss": 1.5984, "step": 2730 }, { "epoch": 0.41188447326747607, "grad_norm": 0.247227902267857, "learning_rate": 3.3647162068637393e-07, "loss": 1.6167, "step": 2731 }, { "epoch": 0.4120352914561496, "grad_norm": 0.2662345941585568, "learning_rate": 3.363690548043601e-07, "loss": 1.6178, "step": 2732 }, { "epoch": 0.41218610964482316, "grad_norm": 0.24907157756058096, "learning_rate": 3.362664751432063e-07, "loss": 1.5623, "step": 2733 }, { "epoch": 0.41233692783349674, "grad_norm": 0.24995333698556652, "learning_rate": 3.3616388172594446e-07, "loss": 1.5508, "step": 2734 }, { "epoch": 0.41248774602217025, "grad_norm": 0.26098406349777675, "learning_rate": 3.360612745756099e-07, "loss": 1.5691, "step": 2735 }, { "epoch": 0.41263856421084383, "grad_norm": 0.2492959279976695, "learning_rate": 3.359586537152409e-07, "loss": 1.5863, "step": 2736 }, { "epoch": 0.4127893823995174, "grad_norm": 0.26751413781066885, "learning_rate": 3.3585601916787886e-07, "loss": 1.5894, "step": 2737 }, { "epoch": 0.4129402005881909, "grad_norm": 0.2637667909873854, "learning_rate": 3.3575337095656815e-07, "loss": 1.5735, "step": 2738 }, { "epoch": 0.4130910187768645, "grad_norm": 0.25564659236591, "learning_rate": 3.356507091043563e-07, "loss": 1.6042, "step": 2739 }, { "epoch": 0.41324183696553807, "grad_norm": 0.2796082549809646, "learning_rate": 3.3554803363429403e-07, "loss": 1.5887, "step": 2740 }, { "epoch": 0.4133926551542116, "grad_norm": 0.272236777731565, "learning_rate": 3.3544534456943473e-07, "loss": 1.6221, "step": 2741 }, { "epoch": 0.41354347334288516, "grad_norm": 0.2543721016184445, "learning_rate": 3.353426419328352e-07, "loss": 1.5614, "step": 2742 }, { "epoch": 0.41369429153155873, "grad_norm": 0.25385962280711966, "learning_rate": 3.3523992574755525e-07, "loss": 1.5136, "step": 2743 }, { "epoch": 0.41384510972023225, "grad_norm": 0.32122229217521414, "learning_rate": 3.3513719603665764e-07, "loss": 1.5633, "step": 2744 }, { "epoch": 0.4139959279089058, "grad_norm": 0.246448545767713, "learning_rate": 3.350344528232081e-07, "loss": 1.537, "step": 2745 }, { "epoch": 0.41414674609757934, "grad_norm": 0.24637374083252297, "learning_rate": 3.3493169613027537e-07, "loss": 1.551, "step": 2746 }, { "epoch": 0.4142975642862529, "grad_norm": 0.3146802244636798, "learning_rate": 3.348289259809316e-07, "loss": 1.5986, "step": 2747 }, { "epoch": 0.4144483824749265, "grad_norm": 0.2462132778706303, "learning_rate": 3.347261423982515e-07, "loss": 1.5687, "step": 2748 }, { "epoch": 0.4145992006636, "grad_norm": 0.3066452786536837, "learning_rate": 3.34623345405313e-07, "loss": 1.6079, "step": 2749 }, { "epoch": 0.4147500188522736, "grad_norm": 0.2911543804614631, "learning_rate": 3.3452053502519705e-07, "loss": 1.5894, "step": 2750 }, { "epoch": 0.41490083704094716, "grad_norm": 0.3482933065137948, "learning_rate": 3.3441771128098757e-07, "loss": 1.6232, "step": 2751 }, { "epoch": 0.4150516552296207, "grad_norm": 0.23898678439185386, "learning_rate": 3.343148741957714e-07, "loss": 1.5574, "step": 2752 }, { "epoch": 0.41520247341829425, "grad_norm": 0.27797561961030265, "learning_rate": 3.3421202379263867e-07, "loss": 1.5671, "step": 2753 }, { "epoch": 0.4153532916069678, "grad_norm": 0.25358337703610595, "learning_rate": 3.34109160094682e-07, "loss": 1.5522, "step": 2754 }, { "epoch": 0.41550410979564134, "grad_norm": 0.2550694845202222, "learning_rate": 3.340062831249976e-07, "loss": 1.5342, "step": 2755 }, { "epoch": 0.4156549279843149, "grad_norm": 0.2597958803137637, "learning_rate": 3.339033929066841e-07, "loss": 1.5963, "step": 2756 }, { "epoch": 0.4158057461729885, "grad_norm": 0.2496316763184769, "learning_rate": 3.338004894628433e-07, "loss": 1.5589, "step": 2757 }, { "epoch": 0.415956564361662, "grad_norm": 0.25095372701614893, "learning_rate": 3.336975728165804e-07, "loss": 1.5429, "step": 2758 }, { "epoch": 0.4161073825503356, "grad_norm": 0.2631901524092377, "learning_rate": 3.3359464299100283e-07, "loss": 1.5541, "step": 2759 }, { "epoch": 0.4162582007390091, "grad_norm": 0.26130848981171917, "learning_rate": 3.3349170000922147e-07, "loss": 1.5923, "step": 2760 }, { "epoch": 0.4164090189276827, "grad_norm": 0.2865319671503223, "learning_rate": 3.333887438943499e-07, "loss": 1.6258, "step": 2761 }, { "epoch": 0.41655983711635625, "grad_norm": 0.25909940994325115, "learning_rate": 3.332857746695049e-07, "loss": 1.5499, "step": 2762 }, { "epoch": 0.41671065530502976, "grad_norm": 0.33967485157438465, "learning_rate": 3.3318279235780613e-07, "loss": 1.5917, "step": 2763 }, { "epoch": 0.41686147349370334, "grad_norm": 0.24378290445820855, "learning_rate": 3.3307979698237586e-07, "loss": 1.5501, "step": 2764 }, { "epoch": 0.4170122916823769, "grad_norm": 0.24693579990180703, "learning_rate": 3.329767885663398e-07, "loss": 1.5273, "step": 2765 }, { "epoch": 0.41716310987105043, "grad_norm": 0.2501417658905253, "learning_rate": 3.328737671328261e-07, "loss": 1.5227, "step": 2766 }, { "epoch": 0.417313928059724, "grad_norm": 0.2419502820018382, "learning_rate": 3.3277073270496635e-07, "loss": 1.5894, "step": 2767 }, { "epoch": 0.4174647462483976, "grad_norm": 0.2612808760168227, "learning_rate": 3.326676853058946e-07, "loss": 1.5314, "step": 2768 }, { "epoch": 0.4176155644370711, "grad_norm": 0.2540232188196327, "learning_rate": 3.3256462495874796e-07, "loss": 1.5732, "step": 2769 }, { "epoch": 0.41776638262574467, "grad_norm": 0.24457269312722849, "learning_rate": 3.324615516866666e-07, "loss": 1.5317, "step": 2770 }, { "epoch": 0.41791720081441824, "grad_norm": 0.24145564489233273, "learning_rate": 3.3235846551279346e-07, "loss": 1.5454, "step": 2771 }, { "epoch": 0.41806801900309176, "grad_norm": 0.31892115881563216, "learning_rate": 3.322553664602743e-07, "loss": 1.5604, "step": 2772 }, { "epoch": 0.41821883719176534, "grad_norm": 0.2586506860694086, "learning_rate": 3.32152254552258e-07, "loss": 1.5664, "step": 2773 }, { "epoch": 0.41836965538043885, "grad_norm": 0.3439546269177732, "learning_rate": 3.32049129811896e-07, "loss": 1.5469, "step": 2774 }, { "epoch": 0.41852047356911243, "grad_norm": 0.24418138452939914, "learning_rate": 3.3194599226234286e-07, "loss": 1.5578, "step": 2775 }, { "epoch": 0.418671291757786, "grad_norm": 0.2590844173586612, "learning_rate": 3.318428419267561e-07, "loss": 1.5369, "step": 2776 }, { "epoch": 0.4188221099464595, "grad_norm": 0.25552119759810754, "learning_rate": 3.3173967882829587e-07, "loss": 1.5683, "step": 2777 }, { "epoch": 0.4189729281351331, "grad_norm": 0.2669611498457822, "learning_rate": 3.3163650299012523e-07, "loss": 1.5806, "step": 2778 }, { "epoch": 0.41912374632380667, "grad_norm": 0.2611379003511046, "learning_rate": 3.315333144354103e-07, "loss": 1.6681, "step": 2779 }, { "epoch": 0.4192745645124802, "grad_norm": 0.352230203131426, "learning_rate": 3.3143011318731984e-07, "loss": 1.5714, "step": 2780 }, { "epoch": 0.41942538270115376, "grad_norm": 0.23926876122500249, "learning_rate": 3.313268992690255e-07, "loss": 1.62, "step": 2781 }, { "epoch": 0.41957620088982733, "grad_norm": 0.26209112358816156, "learning_rate": 3.312236727037019e-07, "loss": 1.6144, "step": 2782 }, { "epoch": 0.41972701907850085, "grad_norm": 0.3244389317643446, "learning_rate": 3.3112043351452635e-07, "loss": 1.6534, "step": 2783 }, { "epoch": 0.4198778372671744, "grad_norm": 0.24825052232823228, "learning_rate": 3.3101718172467914e-07, "loss": 1.522, "step": 2784 }, { "epoch": 0.420028655455848, "grad_norm": 0.24975060345118297, "learning_rate": 3.309139173573431e-07, "loss": 1.5199, "step": 2785 }, { "epoch": 0.4201794736445215, "grad_norm": 0.24013439238796144, "learning_rate": 3.3081064043570435e-07, "loss": 1.6011, "step": 2786 }, { "epoch": 0.4203302918331951, "grad_norm": 0.3894513948675421, "learning_rate": 3.3070735098295135e-07, "loss": 1.5331, "step": 2787 }, { "epoch": 0.42048111002186866, "grad_norm": 0.24346605286743947, "learning_rate": 3.306040490222756e-07, "loss": 1.5844, "step": 2788 }, { "epoch": 0.4206319282105422, "grad_norm": 0.2536043145275316, "learning_rate": 3.3050073457687165e-07, "loss": 1.6263, "step": 2789 }, { "epoch": 0.42078274639921576, "grad_norm": 0.2622097772722774, "learning_rate": 3.303974076699363e-07, "loss": 1.5541, "step": 2790 }, { "epoch": 0.4209335645878893, "grad_norm": 0.2597367981247587, "learning_rate": 3.3029406832466965e-07, "loss": 1.5695, "step": 2791 }, { "epoch": 0.42108438277656285, "grad_norm": 0.260190379204413, "learning_rate": 3.3019071656427426e-07, "loss": 1.6289, "step": 2792 }, { "epoch": 0.4212352009652364, "grad_norm": 0.25219201790085294, "learning_rate": 3.3008735241195573e-07, "loss": 1.5846, "step": 2793 }, { "epoch": 0.42138601915390994, "grad_norm": 0.2517974939357513, "learning_rate": 3.2998397589092224e-07, "loss": 1.5589, "step": 2794 }, { "epoch": 0.4215368373425835, "grad_norm": 0.27580484795282745, "learning_rate": 3.2988058702438486e-07, "loss": 1.5779, "step": 2795 }, { "epoch": 0.4216876555312571, "grad_norm": 0.2735825859157333, "learning_rate": 3.2977718583555744e-07, "loss": 1.5275, "step": 2796 }, { "epoch": 0.4218384737199306, "grad_norm": 0.27188477004475226, "learning_rate": 3.2967377234765635e-07, "loss": 1.6078, "step": 2797 }, { "epoch": 0.4219892919086042, "grad_norm": 0.2601800017524752, "learning_rate": 3.2957034658390125e-07, "loss": 1.6309, "step": 2798 }, { "epoch": 0.42214011009727775, "grad_norm": 0.24986428985487438, "learning_rate": 3.2946690856751404e-07, "loss": 1.5696, "step": 2799 }, { "epoch": 0.4222909282859513, "grad_norm": 0.2586438246567823, "learning_rate": 3.293634583217195e-07, "loss": 1.5653, "step": 2800 }, { "epoch": 0.42244174647462485, "grad_norm": 0.26875412070114774, "learning_rate": 3.2925999586974545e-07, "loss": 1.5808, "step": 2801 }, { "epoch": 0.4225925646632984, "grad_norm": 0.2459104246750409, "learning_rate": 3.2915652123482215e-07, "loss": 1.5677, "step": 2802 }, { "epoch": 0.42274338285197194, "grad_norm": 0.2839961716031081, "learning_rate": 3.2905303444018253e-07, "loss": 1.563, "step": 2803 }, { "epoch": 0.4228942010406455, "grad_norm": 0.24736716745677786, "learning_rate": 3.2894953550906246e-07, "loss": 1.523, "step": 2804 }, { "epoch": 0.42304501922931903, "grad_norm": 0.27418143350308405, "learning_rate": 3.288460244647005e-07, "loss": 1.5728, "step": 2805 }, { "epoch": 0.4231958374179926, "grad_norm": 0.3195441770865284, "learning_rate": 3.2874250133033786e-07, "loss": 1.6053, "step": 2806 }, { "epoch": 0.4233466556066662, "grad_norm": 0.24631535702514804, "learning_rate": 3.286389661292186e-07, "loss": 1.5078, "step": 2807 }, { "epoch": 0.4234974737953397, "grad_norm": 0.25668273674967906, "learning_rate": 3.285354188845892e-07, "loss": 1.6132, "step": 2808 }, { "epoch": 0.42364829198401327, "grad_norm": 0.2507520791825448, "learning_rate": 3.284318596196992e-07, "loss": 1.5598, "step": 2809 }, { "epoch": 0.42379911017268684, "grad_norm": 0.26571287124523163, "learning_rate": 3.2832828835780065e-07, "loss": 1.5631, "step": 2810 }, { "epoch": 0.42394992836136036, "grad_norm": 0.2439539718460567, "learning_rate": 3.2822470512214816e-07, "loss": 1.5626, "step": 2811 }, { "epoch": 0.42410074655003394, "grad_norm": 0.2674550525834154, "learning_rate": 3.281211099359994e-07, "loss": 1.5791, "step": 2812 }, { "epoch": 0.4242515647387075, "grad_norm": 0.24557218187885993, "learning_rate": 3.280175028226143e-07, "loss": 1.554, "step": 2813 }, { "epoch": 0.42440238292738103, "grad_norm": 0.2564709265749323, "learning_rate": 3.2791388380525583e-07, "loss": 1.5749, "step": 2814 }, { "epoch": 0.4245532011160546, "grad_norm": 0.2471382396594902, "learning_rate": 3.278102529071894e-07, "loss": 1.6417, "step": 2815 }, { "epoch": 0.4247040193047282, "grad_norm": 0.28361301842175707, "learning_rate": 3.2770661015168327e-07, "loss": 1.5692, "step": 2816 }, { "epoch": 0.4248548374934017, "grad_norm": 0.24649131145090075, "learning_rate": 3.2760295556200813e-07, "loss": 1.5649, "step": 2817 }, { "epoch": 0.42500565568207527, "grad_norm": 0.25011966391967305, "learning_rate": 3.274992891614375e-07, "loss": 1.4936, "step": 2818 }, { "epoch": 0.4251564738707488, "grad_norm": 0.26769277225623017, "learning_rate": 3.2739561097324746e-07, "loss": 1.5286, "step": 2819 }, { "epoch": 0.42530729205942236, "grad_norm": 0.25553634024998556, "learning_rate": 3.2729192102071683e-07, "loss": 1.5756, "step": 2820 }, { "epoch": 0.42545811024809593, "grad_norm": 0.26612513346681166, "learning_rate": 3.2718821932712704e-07, "loss": 1.5705, "step": 2821 }, { "epoch": 0.42560892843676945, "grad_norm": 0.270197771738574, "learning_rate": 3.270845059157622e-07, "loss": 1.6422, "step": 2822 }, { "epoch": 0.425759746625443, "grad_norm": 0.30734421602780476, "learning_rate": 3.269807808099088e-07, "loss": 1.6486, "step": 2823 }, { "epoch": 0.4259105648141166, "grad_norm": 0.2514541593491989, "learning_rate": 3.268770440328563e-07, "loss": 1.5418, "step": 2824 }, { "epoch": 0.4260613830027901, "grad_norm": 0.2614494218667816, "learning_rate": 3.2677329560789664e-07, "loss": 1.5701, "step": 2825 }, { "epoch": 0.4262122011914637, "grad_norm": 0.2511602678497061, "learning_rate": 3.266695355583242e-07, "loss": 1.5158, "step": 2826 }, { "epoch": 0.42636301938013726, "grad_norm": 0.26400297098564646, "learning_rate": 3.265657639074363e-07, "loss": 1.6608, "step": 2827 }, { "epoch": 0.4265138375688108, "grad_norm": 0.2564658434956295, "learning_rate": 3.264619806785327e-07, "loss": 1.5434, "step": 2828 }, { "epoch": 0.42666465575748436, "grad_norm": 0.38207198468778597, "learning_rate": 3.2635818589491567e-07, "loss": 1.5657, "step": 2829 }, { "epoch": 0.42681547394615793, "grad_norm": 0.27509380119835014, "learning_rate": 3.262543795798902e-07, "loss": 1.5504, "step": 2830 }, { "epoch": 0.42696629213483145, "grad_norm": 0.25133984016735206, "learning_rate": 3.2615056175676376e-07, "loss": 1.5923, "step": 2831 }, { "epoch": 0.427117110323505, "grad_norm": 0.244357207006092, "learning_rate": 3.2604673244884656e-07, "loss": 1.5887, "step": 2832 }, { "epoch": 0.4272679285121786, "grad_norm": 0.29225697760141345, "learning_rate": 3.259428916794513e-07, "loss": 1.561, "step": 2833 }, { "epoch": 0.4274187467008521, "grad_norm": 0.24786007772500737, "learning_rate": 3.2583903947189324e-07, "loss": 1.5648, "step": 2834 }, { "epoch": 0.4275695648895257, "grad_norm": 0.27825183380842167, "learning_rate": 3.2573517584949016e-07, "loss": 1.5252, "step": 2835 }, { "epoch": 0.4277203830781992, "grad_norm": 0.24809497916679388, "learning_rate": 3.2563130083556263e-07, "loss": 1.5027, "step": 2836 }, { "epoch": 0.4278712012668728, "grad_norm": 0.2597344556034982, "learning_rate": 3.2552741445343343e-07, "loss": 1.5662, "step": 2837 }, { "epoch": 0.42802201945554635, "grad_norm": 0.2567179160633774, "learning_rate": 3.254235167264282e-07, "loss": 1.5976, "step": 2838 }, { "epoch": 0.4281728376442199, "grad_norm": 0.24920655921464463, "learning_rate": 3.25319607677875e-07, "loss": 1.571, "step": 2839 }, { "epoch": 0.42832365583289345, "grad_norm": 0.27293874543904306, "learning_rate": 3.2521568733110435e-07, "loss": 1.6008, "step": 2840 }, { "epoch": 0.428474474021567, "grad_norm": 0.2878965239814466, "learning_rate": 3.251117557094495e-07, "loss": 1.5456, "step": 2841 }, { "epoch": 0.42862529221024054, "grad_norm": 0.24775200312433204, "learning_rate": 3.2500781283624606e-07, "loss": 1.5894, "step": 2842 }, { "epoch": 0.4287761103989141, "grad_norm": 0.2615997014805728, "learning_rate": 3.249038587348323e-07, "loss": 1.5442, "step": 2843 }, { "epoch": 0.4289269285875877, "grad_norm": 0.2812310283534168, "learning_rate": 3.247998934285488e-07, "loss": 1.6119, "step": 2844 }, { "epoch": 0.4290777467762612, "grad_norm": 0.2640209190131911, "learning_rate": 3.246959169407389e-07, "loss": 1.5895, "step": 2845 }, { "epoch": 0.4292285649649348, "grad_norm": 0.2894966145146055, "learning_rate": 3.245919292947484e-07, "loss": 1.5367, "step": 2846 }, { "epoch": 0.42937938315360835, "grad_norm": 0.2514511542793688, "learning_rate": 3.2448793051392546e-07, "loss": 1.5186, "step": 2847 }, { "epoch": 0.42953020134228187, "grad_norm": 0.2515606117685996, "learning_rate": 3.243839206216209e-07, "loss": 1.5835, "step": 2848 }, { "epoch": 0.42968101953095544, "grad_norm": 0.2393427989651705, "learning_rate": 3.2427989964118784e-07, "loss": 1.5502, "step": 2849 }, { "epoch": 0.42983183771962896, "grad_norm": 0.2793399959806875, "learning_rate": 3.2417586759598225e-07, "loss": 1.5552, "step": 2850 }, { "epoch": 0.42998265590830254, "grad_norm": 0.25010490471432933, "learning_rate": 3.2407182450936217e-07, "loss": 1.5586, "step": 2851 }, { "epoch": 0.4301334740969761, "grad_norm": 0.251943279627523, "learning_rate": 3.2396777040468833e-07, "loss": 1.5344, "step": 2852 }, { "epoch": 0.4302842922856496, "grad_norm": 0.2605855624176244, "learning_rate": 3.2386370530532397e-07, "loss": 1.5077, "step": 2853 }, { "epoch": 0.4304351104743232, "grad_norm": 0.2534032198280918, "learning_rate": 3.2375962923463476e-07, "loss": 1.5534, "step": 2854 }, { "epoch": 0.4305859286629968, "grad_norm": 0.26276413466059373, "learning_rate": 3.236555422159887e-07, "loss": 1.5436, "step": 2855 }, { "epoch": 0.4307367468516703, "grad_norm": 0.24882989004331033, "learning_rate": 3.2355144427275637e-07, "loss": 1.5596, "step": 2856 }, { "epoch": 0.43088756504034387, "grad_norm": 0.2842043516544378, "learning_rate": 3.2344733542831086e-07, "loss": 1.5613, "step": 2857 }, { "epoch": 0.43103838322901744, "grad_norm": 0.23323082429628905, "learning_rate": 3.233432157060276e-07, "loss": 1.6159, "step": 2858 }, { "epoch": 0.43118920141769096, "grad_norm": 0.36941519147313756, "learning_rate": 3.232390851292845e-07, "loss": 1.5453, "step": 2859 }, { "epoch": 0.43134001960636453, "grad_norm": 0.23799456270118677, "learning_rate": 3.231349437214619e-07, "loss": 1.5456, "step": 2860 }, { "epoch": 0.4314908377950381, "grad_norm": 0.2550429890826443, "learning_rate": 3.2303079150594264e-07, "loss": 1.5765, "step": 2861 }, { "epoch": 0.4316416559837116, "grad_norm": 0.25440445508837595, "learning_rate": 3.229266285061118e-07, "loss": 1.6726, "step": 2862 }, { "epoch": 0.4317924741723852, "grad_norm": 0.2540189384768522, "learning_rate": 3.228224547453571e-07, "loss": 1.5742, "step": 2863 }, { "epoch": 0.4319432923610587, "grad_norm": 0.25509213382412893, "learning_rate": 3.227182702470686e-07, "loss": 1.638, "step": 2864 }, { "epoch": 0.4320941105497323, "grad_norm": 0.25794465385284837, "learning_rate": 3.2261407503463874e-07, "loss": 1.609, "step": 2865 }, { "epoch": 0.43224492873840586, "grad_norm": 0.26403881152382164, "learning_rate": 3.225098691314623e-07, "loss": 1.6337, "step": 2866 }, { "epoch": 0.4323957469270794, "grad_norm": 0.24986562370671758, "learning_rate": 3.224056525609366e-07, "loss": 1.5586, "step": 2867 }, { "epoch": 0.43254656511575296, "grad_norm": 0.24697766775196395, "learning_rate": 3.223014253464614e-07, "loss": 1.5504, "step": 2868 }, { "epoch": 0.43269738330442653, "grad_norm": 0.2550985722985474, "learning_rate": 3.221971875114385e-07, "loss": 1.5356, "step": 2869 }, { "epoch": 0.43284820149310005, "grad_norm": 0.2471461523400474, "learning_rate": 3.2209293907927256e-07, "loss": 1.5304, "step": 2870 }, { "epoch": 0.4329990196817736, "grad_norm": 0.30486015466742766, "learning_rate": 3.219886800733703e-07, "loss": 1.5771, "step": 2871 }, { "epoch": 0.4331498378704472, "grad_norm": 0.24892059657585422, "learning_rate": 3.2188441051714086e-07, "loss": 1.6376, "step": 2872 }, { "epoch": 0.4333006560591207, "grad_norm": 0.25228093749319536, "learning_rate": 3.217801304339958e-07, "loss": 1.5088, "step": 2873 }, { "epoch": 0.4334514742477943, "grad_norm": 0.24124651139886405, "learning_rate": 3.216758398473492e-07, "loss": 1.5354, "step": 2874 }, { "epoch": 0.43360229243646786, "grad_norm": 0.24938492951014538, "learning_rate": 3.2157153878061705e-07, "loss": 1.5817, "step": 2875 }, { "epoch": 0.4337531106251414, "grad_norm": 0.25757749869874946, "learning_rate": 3.2146722725721826e-07, "loss": 1.5755, "step": 2876 }, { "epoch": 0.43390392881381495, "grad_norm": 0.25676529276672144, "learning_rate": 3.213629053005736e-07, "loss": 1.6694, "step": 2877 }, { "epoch": 0.4340547470024885, "grad_norm": 0.25196048770398494, "learning_rate": 3.212585729341064e-07, "loss": 1.6174, "step": 2878 }, { "epoch": 0.43420556519116205, "grad_norm": 0.2498727193813626, "learning_rate": 3.211542301812425e-07, "loss": 1.5537, "step": 2879 }, { "epoch": 0.4343563833798356, "grad_norm": 0.2477314087337444, "learning_rate": 3.2104987706540975e-07, "loss": 1.5551, "step": 2880 }, { "epoch": 0.43450720156850914, "grad_norm": 0.2767502201857406, "learning_rate": 3.209455136100385e-07, "loss": 1.5546, "step": 2881 }, { "epoch": 0.4346580197571827, "grad_norm": 0.2548227677714052, "learning_rate": 3.2084113983856125e-07, "loss": 1.5466, "step": 2882 }, { "epoch": 0.4348088379458563, "grad_norm": 0.24014031121639437, "learning_rate": 3.2073675577441314e-07, "loss": 1.5498, "step": 2883 }, { "epoch": 0.4349596561345298, "grad_norm": 0.24294953710051626, "learning_rate": 3.206323614410314e-07, "loss": 1.548, "step": 2884 }, { "epoch": 0.4351104743232034, "grad_norm": 0.26743355263189356, "learning_rate": 3.205279568618554e-07, "loss": 1.5803, "step": 2885 }, { "epoch": 0.43526129251187695, "grad_norm": 0.2663085951111445, "learning_rate": 3.2042354206032727e-07, "loss": 1.5586, "step": 2886 }, { "epoch": 0.43541211070055047, "grad_norm": 0.2718118661493261, "learning_rate": 3.2031911705989113e-07, "loss": 1.5262, "step": 2887 }, { "epoch": 0.43556292888922404, "grad_norm": 0.2765415943315982, "learning_rate": 3.2021468188399325e-07, "loss": 1.5542, "step": 2888 }, { "epoch": 0.4357137470778976, "grad_norm": 0.25151574286126827, "learning_rate": 3.2011023655608256e-07, "loss": 1.5123, "step": 2889 }, { "epoch": 0.43586456526657114, "grad_norm": 0.2569228721930573, "learning_rate": 3.2000578109961003e-07, "loss": 1.5425, "step": 2890 }, { "epoch": 0.4360153834552447, "grad_norm": 0.2581041939483696, "learning_rate": 3.199013155380289e-07, "loss": 1.5974, "step": 2891 }, { "epoch": 0.4361662016439183, "grad_norm": 0.24858529905108517, "learning_rate": 3.197968398947947e-07, "loss": 1.5472, "step": 2892 }, { "epoch": 0.4363170198325918, "grad_norm": 0.2655668765193959, "learning_rate": 3.1969235419336533e-07, "loss": 1.6225, "step": 2893 }, { "epoch": 0.4364678380212654, "grad_norm": 0.25538155411435326, "learning_rate": 3.1958785845720094e-07, "loss": 1.4932, "step": 2894 }, { "epoch": 0.4366186562099389, "grad_norm": 0.2513728933246105, "learning_rate": 3.194833527097637e-07, "loss": 1.6067, "step": 2895 }, { "epoch": 0.43676947439861247, "grad_norm": 0.2688528228720648, "learning_rate": 3.1937883697451835e-07, "loss": 1.5867, "step": 2896 }, { "epoch": 0.43692029258728604, "grad_norm": 0.27820117449902454, "learning_rate": 3.1927431127493164e-07, "loss": 1.5504, "step": 2897 }, { "epoch": 0.43707111077595956, "grad_norm": 0.2587225093120209, "learning_rate": 3.1916977563447257e-07, "loss": 1.5992, "step": 2898 }, { "epoch": 0.43722192896463313, "grad_norm": 0.25957784875946716, "learning_rate": 3.1906523007661256e-07, "loss": 1.4921, "step": 2899 }, { "epoch": 0.4373727471533067, "grad_norm": 0.25242465566484146, "learning_rate": 3.189606746248251e-07, "loss": 1.5765, "step": 2900 }, { "epoch": 0.4375235653419802, "grad_norm": 0.2936725023072022, "learning_rate": 3.1885610930258585e-07, "loss": 1.5895, "step": 2901 }, { "epoch": 0.4376743835306538, "grad_norm": 0.24296660652410004, "learning_rate": 3.187515341333729e-07, "loss": 1.6201, "step": 2902 }, { "epoch": 0.4378252017193274, "grad_norm": 0.2660340565961371, "learning_rate": 3.1864694914066624e-07, "loss": 1.4993, "step": 2903 }, { "epoch": 0.4379760199080009, "grad_norm": 0.2678230768413757, "learning_rate": 3.185423543479484e-07, "loss": 1.6239, "step": 2904 }, { "epoch": 0.43812683809667446, "grad_norm": 0.23864790824148918, "learning_rate": 3.18437749778704e-07, "loss": 1.5226, "step": 2905 }, { "epoch": 0.43827765628534804, "grad_norm": 0.24579629971463482, "learning_rate": 3.183331354564196e-07, "loss": 1.5225, "step": 2906 }, { "epoch": 0.43842847447402156, "grad_norm": 0.3521358692586838, "learning_rate": 3.182285114045844e-07, "loss": 1.576, "step": 2907 }, { "epoch": 0.43857929266269513, "grad_norm": 0.2590866917875334, "learning_rate": 3.181238776466894e-07, "loss": 1.5539, "step": 2908 }, { "epoch": 0.43873011085136865, "grad_norm": 0.25550321049136315, "learning_rate": 3.180192342062279e-07, "loss": 1.5528, "step": 2909 }, { "epoch": 0.4388809290400422, "grad_norm": 0.24770792136612937, "learning_rate": 3.1791458110669557e-07, "loss": 1.5544, "step": 2910 }, { "epoch": 0.4390317472287158, "grad_norm": 0.27145834138714725, "learning_rate": 3.178099183715899e-07, "loss": 1.6244, "step": 2911 }, { "epoch": 0.4391825654173893, "grad_norm": 0.7306377787337551, "learning_rate": 3.177052460244108e-07, "loss": 1.5126, "step": 2912 }, { "epoch": 0.4393333836060629, "grad_norm": 0.2423739274479909, "learning_rate": 3.176005640886603e-07, "loss": 1.5814, "step": 2913 }, { "epoch": 0.43948420179473646, "grad_norm": 0.27344887574578947, "learning_rate": 3.174958725878424e-07, "loss": 1.5617, "step": 2914 }, { "epoch": 0.43963501998341, "grad_norm": 0.2719054718336235, "learning_rate": 3.1739117154546357e-07, "loss": 1.6183, "step": 2915 }, { "epoch": 0.43978583817208355, "grad_norm": 0.2519172488059112, "learning_rate": 3.1728646098503216e-07, "loss": 1.5871, "step": 2916 }, { "epoch": 0.43993665636075713, "grad_norm": 0.25133162404568815, "learning_rate": 3.171817409300587e-07, "loss": 1.5503, "step": 2917 }, { "epoch": 0.44008747454943065, "grad_norm": 0.2792435671678952, "learning_rate": 3.17077011404056e-07, "loss": 1.5728, "step": 2918 }, { "epoch": 0.4402382927381042, "grad_norm": 0.24427337476009683, "learning_rate": 3.169722724305387e-07, "loss": 1.6415, "step": 2919 }, { "epoch": 0.4403891109267778, "grad_norm": 0.3465354844042566, "learning_rate": 3.16867524033024e-07, "loss": 1.5484, "step": 2920 }, { "epoch": 0.4405399291154513, "grad_norm": 0.2586043215420097, "learning_rate": 3.1676276623503075e-07, "loss": 1.603, "step": 2921 }, { "epoch": 0.4406907473041249, "grad_norm": 0.24859283143903943, "learning_rate": 3.1665799906008023e-07, "loss": 1.5771, "step": 2922 }, { "epoch": 0.4408415654927984, "grad_norm": 0.24570719218752252, "learning_rate": 3.1655322253169576e-07, "loss": 1.4971, "step": 2923 }, { "epoch": 0.440992383681472, "grad_norm": 0.31073154586652807, "learning_rate": 3.1644843667340257e-07, "loss": 1.545, "step": 2924 }, { "epoch": 0.44114320187014555, "grad_norm": 0.25075904681549044, "learning_rate": 3.163436415087283e-07, "loss": 1.5834, "step": 2925 }, { "epoch": 0.44129402005881907, "grad_norm": 0.25314906080943284, "learning_rate": 3.1623883706120247e-07, "loss": 1.5579, "step": 2926 }, { "epoch": 0.44144483824749264, "grad_norm": 0.2360378844836213, "learning_rate": 3.161340233543567e-07, "loss": 1.5135, "step": 2927 }, { "epoch": 0.4415956564361662, "grad_norm": 0.24292165414084746, "learning_rate": 3.1602920041172473e-07, "loss": 1.5137, "step": 2928 }, { "epoch": 0.44174647462483974, "grad_norm": 0.2997251514094543, "learning_rate": 3.159243682568423e-07, "loss": 1.6018, "step": 2929 }, { "epoch": 0.4418972928135133, "grad_norm": 0.3497763748048155, "learning_rate": 3.158195269132474e-07, "loss": 1.5502, "step": 2930 }, { "epoch": 0.4420481110021869, "grad_norm": 0.27338608798627173, "learning_rate": 3.1571467640447997e-07, "loss": 1.6476, "step": 2931 }, { "epoch": 0.4421989291908604, "grad_norm": 0.26887073330294453, "learning_rate": 3.156098167540818e-07, "loss": 1.549, "step": 2932 }, { "epoch": 0.442349747379534, "grad_norm": 0.2559339493115962, "learning_rate": 3.155049479855973e-07, "loss": 1.5785, "step": 2933 }, { "epoch": 0.44250056556820755, "grad_norm": 0.24419500081836334, "learning_rate": 3.1540007012257215e-07, "loss": 1.5312, "step": 2934 }, { "epoch": 0.44265138375688107, "grad_norm": 0.3446463400081573, "learning_rate": 3.1529518318855474e-07, "loss": 1.5488, "step": 2935 }, { "epoch": 0.44280220194555464, "grad_norm": 0.250792786442541, "learning_rate": 3.1519028720709515e-07, "loss": 1.5238, "step": 2936 }, { "epoch": 0.4429530201342282, "grad_norm": 0.25277066988345265, "learning_rate": 3.1508538220174565e-07, "loss": 1.5322, "step": 2937 }, { "epoch": 0.44310383832290173, "grad_norm": 0.2574346793763021, "learning_rate": 3.149804681960605e-07, "loss": 1.5623, "step": 2938 }, { "epoch": 0.4432546565115753, "grad_norm": 0.3390501313428078, "learning_rate": 3.148755452135957e-07, "loss": 1.5602, "step": 2939 }, { "epoch": 0.4434054747002488, "grad_norm": 0.25222635073802524, "learning_rate": 3.147706132779098e-07, "loss": 1.5604, "step": 2940 }, { "epoch": 0.4435562928889224, "grad_norm": 0.2376778230089622, "learning_rate": 3.1466567241256304e-07, "loss": 1.4962, "step": 2941 }, { "epoch": 0.443707111077596, "grad_norm": 0.2703135489064138, "learning_rate": 3.145607226411175e-07, "loss": 1.6, "step": 2942 }, { "epoch": 0.4438579292662695, "grad_norm": 0.24834680304175888, "learning_rate": 3.144557639871377e-07, "loss": 1.5691, "step": 2943 }, { "epoch": 0.44400874745494306, "grad_norm": 0.24414712441004643, "learning_rate": 3.143507964741897e-07, "loss": 1.5583, "step": 2944 }, { "epoch": 0.44415956564361664, "grad_norm": 0.2634739776520894, "learning_rate": 3.1424582012584197e-07, "loss": 1.5459, "step": 2945 }, { "epoch": 0.44431038383229016, "grad_norm": 0.2486655684747634, "learning_rate": 3.141408349656646e-07, "loss": 1.5303, "step": 2946 }, { "epoch": 0.44446120202096373, "grad_norm": 0.2552836521878257, "learning_rate": 3.1403584101722984e-07, "loss": 1.5216, "step": 2947 }, { "epoch": 0.4446120202096373, "grad_norm": 0.24430062763377616, "learning_rate": 3.13930838304112e-07, "loss": 1.6221, "step": 2948 }, { "epoch": 0.4447628383983108, "grad_norm": 0.2547464676737134, "learning_rate": 3.1382582684988713e-07, "loss": 1.5393, "step": 2949 }, { "epoch": 0.4449136565869844, "grad_norm": 0.2582525317132371, "learning_rate": 3.1372080667813346e-07, "loss": 1.5336, "step": 2950 }, { "epoch": 0.44506447477565797, "grad_norm": 0.25259837083207576, "learning_rate": 3.136157778124309e-07, "loss": 1.5129, "step": 2951 }, { "epoch": 0.4452152929643315, "grad_norm": 0.24497532992454443, "learning_rate": 3.135107402763617e-07, "loss": 1.5764, "step": 2952 }, { "epoch": 0.44536611115300506, "grad_norm": 0.2684931409910681, "learning_rate": 3.134056940935097e-07, "loss": 1.6112, "step": 2953 }, { "epoch": 0.4455169293416786, "grad_norm": 0.2772107291494322, "learning_rate": 3.133006392874609e-07, "loss": 1.4843, "step": 2954 }, { "epoch": 0.44566774753035215, "grad_norm": 0.2571460569029784, "learning_rate": 3.1319557588180313e-07, "loss": 1.558, "step": 2955 }, { "epoch": 0.44581856571902573, "grad_norm": 0.2560514367861158, "learning_rate": 3.130905039001262e-07, "loss": 1.5363, "step": 2956 }, { "epoch": 0.44596938390769925, "grad_norm": 0.24476015297538142, "learning_rate": 3.129854233660218e-07, "loss": 1.579, "step": 2957 }, { "epoch": 0.4461202020963728, "grad_norm": 0.31510599110357745, "learning_rate": 3.1288033430308356e-07, "loss": 1.5831, "step": 2958 }, { "epoch": 0.4462710202850464, "grad_norm": 0.3357344820178751, "learning_rate": 3.1277523673490723e-07, "loss": 1.5095, "step": 2959 }, { "epoch": 0.4464218384737199, "grad_norm": 0.2502751163810743, "learning_rate": 3.1267013068509e-07, "loss": 1.6416, "step": 2960 }, { "epoch": 0.4465726566623935, "grad_norm": 0.23393203221738712, "learning_rate": 3.125650161772314e-07, "loss": 1.5145, "step": 2961 }, { "epoch": 0.44672347485106706, "grad_norm": 0.24746422788563224, "learning_rate": 3.1245989323493265e-07, "loss": 1.5863, "step": 2962 }, { "epoch": 0.4468742930397406, "grad_norm": 0.2525927927400594, "learning_rate": 3.123547618817969e-07, "loss": 1.5796, "step": 2963 }, { "epoch": 0.44702511122841415, "grad_norm": 0.2774687244669163, "learning_rate": 3.1224962214142926e-07, "loss": 1.5681, "step": 2964 }, { "epoch": 0.4471759294170877, "grad_norm": 0.2826685061590997, "learning_rate": 3.1214447403743654e-07, "loss": 1.5822, "step": 2965 }, { "epoch": 0.44732674760576124, "grad_norm": 0.48549240991872344, "learning_rate": 3.120393175934277e-07, "loss": 1.5265, "step": 2966 }, { "epoch": 0.4474775657944348, "grad_norm": 0.24606427073477874, "learning_rate": 3.1193415283301327e-07, "loss": 1.6047, "step": 2967 }, { "epoch": 0.44762838398310834, "grad_norm": 0.31989339634341063, "learning_rate": 3.11828979779806e-07, "loss": 1.4801, "step": 2968 }, { "epoch": 0.4477792021717819, "grad_norm": 0.28372722722218535, "learning_rate": 3.1172379845742013e-07, "loss": 1.5434, "step": 2969 }, { "epoch": 0.4479300203604555, "grad_norm": 0.23650434231940212, "learning_rate": 3.1161860888947196e-07, "loss": 1.5475, "step": 2970 }, { "epoch": 0.448080838549129, "grad_norm": 0.2614404249119146, "learning_rate": 3.115134110995797e-07, "loss": 1.5261, "step": 2971 }, { "epoch": 0.4482316567378026, "grad_norm": 0.2582017725860931, "learning_rate": 3.1140820511136324e-07, "loss": 1.5125, "step": 2972 }, { "epoch": 0.44838247492647615, "grad_norm": 0.26566919697793095, "learning_rate": 3.113029909484444e-07, "loss": 1.5394, "step": 2973 }, { "epoch": 0.44853329311514967, "grad_norm": 0.25140893194319885, "learning_rate": 3.111977686344469e-07, "loss": 1.5757, "step": 2974 }, { "epoch": 0.44868411130382324, "grad_norm": 0.24657104009489716, "learning_rate": 3.1109253819299604e-07, "loss": 1.587, "step": 2975 }, { "epoch": 0.4488349294924968, "grad_norm": 0.2583748689263846, "learning_rate": 3.1098729964771927e-07, "loss": 1.4778, "step": 2976 }, { "epoch": 0.44898574768117033, "grad_norm": 0.27974542845709593, "learning_rate": 3.108820530222457e-07, "loss": 1.5788, "step": 2977 }, { "epoch": 0.4491365658698439, "grad_norm": 0.2472039824429126, "learning_rate": 3.1077679834020614e-07, "loss": 1.5599, "step": 2978 }, { "epoch": 0.4492873840585175, "grad_norm": 0.25024405958197155, "learning_rate": 3.1067153562523353e-07, "loss": 1.5422, "step": 2979 }, { "epoch": 0.449438202247191, "grad_norm": 0.2963575226276379, "learning_rate": 3.1056626490096217e-07, "loss": 1.5686, "step": 2980 }, { "epoch": 0.4495890204358646, "grad_norm": 0.3800371321944895, "learning_rate": 3.104609861910286e-07, "loss": 1.4612, "step": 2981 }, { "epoch": 0.44973983862453815, "grad_norm": 0.29875841844483303, "learning_rate": 3.10355699519071e-07, "loss": 1.5737, "step": 2982 }, { "epoch": 0.44989065681321166, "grad_norm": 0.3116187392098609, "learning_rate": 3.10250404908729e-07, "loss": 1.6017, "step": 2983 }, { "epoch": 0.45004147500188524, "grad_norm": 0.2684681459704842, "learning_rate": 3.101451023836446e-07, "loss": 1.5569, "step": 2984 }, { "epoch": 0.45019229319055876, "grad_norm": 0.24529043085332278, "learning_rate": 3.100397919674611e-07, "loss": 1.4803, "step": 2985 }, { "epoch": 0.45034311137923233, "grad_norm": 0.24358539839894225, "learning_rate": 3.0993447368382394e-07, "loss": 1.5215, "step": 2986 }, { "epoch": 0.4504939295679059, "grad_norm": 0.2502172714852228, "learning_rate": 3.0982914755637994e-07, "loss": 1.5563, "step": 2987 }, { "epoch": 0.4506447477565794, "grad_norm": 0.2582609803453465, "learning_rate": 3.0972381360877805e-07, "loss": 1.6238, "step": 2988 }, { "epoch": 0.450795565945253, "grad_norm": 0.23778611944596112, "learning_rate": 3.0961847186466865e-07, "loss": 1.625, "step": 2989 }, { "epoch": 0.45094638413392657, "grad_norm": 0.24203211370235533, "learning_rate": 3.0951312234770424e-07, "loss": 1.5784, "step": 2990 }, { "epoch": 0.4510972023226001, "grad_norm": 0.2624287141724599, "learning_rate": 3.094077650815386e-07, "loss": 1.5745, "step": 2991 }, { "epoch": 0.45124802051127366, "grad_norm": 0.2515527215618168, "learning_rate": 3.093024000898277e-07, "loss": 1.5258, "step": 2992 }, { "epoch": 0.45139883869994724, "grad_norm": 0.2455295872157118, "learning_rate": 3.09197027396229e-07, "loss": 1.5576, "step": 2993 }, { "epoch": 0.45154965688862075, "grad_norm": 0.2531843365496076, "learning_rate": 3.0909164702440166e-07, "loss": 1.5989, "step": 2994 }, { "epoch": 0.45170047507729433, "grad_norm": 0.6742080276924932, "learning_rate": 3.089862589980067e-07, "loss": 1.6633, "step": 2995 }, { "epoch": 0.4518512932659679, "grad_norm": 0.25751381611676893, "learning_rate": 3.088808633407069e-07, "loss": 1.5642, "step": 2996 }, { "epoch": 0.4520021114546414, "grad_norm": 0.2805235006759765, "learning_rate": 3.0877546007616656e-07, "loss": 1.5874, "step": 2997 }, { "epoch": 0.452152929643315, "grad_norm": 0.2811107254257494, "learning_rate": 3.086700492280517e-07, "loss": 1.4869, "step": 2998 }, { "epoch": 0.4523037478319885, "grad_norm": 0.25027636099509565, "learning_rate": 3.085646308200302e-07, "loss": 1.5712, "step": 2999 }, { "epoch": 0.4524545660206621, "grad_norm": 0.2576381839230806, "learning_rate": 3.084592048757716e-07, "loss": 1.5812, "step": 3000 }, { "epoch": 0.45260538420933566, "grad_norm": 0.24988067035681372, "learning_rate": 3.083537714189471e-07, "loss": 1.5602, "step": 3001 }, { "epoch": 0.4527562023980092, "grad_norm": 0.35235848555933336, "learning_rate": 3.082483304732295e-07, "loss": 1.5458, "step": 3002 }, { "epoch": 0.45290702058668275, "grad_norm": 0.3335011281009601, "learning_rate": 3.0814288206229343e-07, "loss": 1.5411, "step": 3003 }, { "epoch": 0.4530578387753563, "grad_norm": 0.2521306698791328, "learning_rate": 3.080374262098152e-07, "loss": 1.551, "step": 3004 }, { "epoch": 0.45320865696402984, "grad_norm": 0.24140037223562982, "learning_rate": 3.079319629394724e-07, "loss": 1.5487, "step": 3005 }, { "epoch": 0.4533594751527034, "grad_norm": 0.254843874309324, "learning_rate": 3.07826492274945e-07, "loss": 1.5515, "step": 3006 }, { "epoch": 0.453510293341377, "grad_norm": 0.255684472071238, "learning_rate": 3.0772101423991407e-07, "loss": 1.5745, "step": 3007 }, { "epoch": 0.4536611115300505, "grad_norm": 0.29836640169693746, "learning_rate": 3.076155288580625e-07, "loss": 1.5514, "step": 3008 }, { "epoch": 0.4538119297187241, "grad_norm": 0.2518153306090227, "learning_rate": 3.0751003615307474e-07, "loss": 1.5746, "step": 3009 }, { "epoch": 0.45396274790739766, "grad_norm": 0.23894055808240028, "learning_rate": 3.074045361486372e-07, "loss": 1.5579, "step": 3010 }, { "epoch": 0.4541135660960712, "grad_norm": 0.24187245513753344, "learning_rate": 3.072990288684375e-07, "loss": 1.5418, "step": 3011 }, { "epoch": 0.45426438428474475, "grad_norm": 0.252062380284074, "learning_rate": 3.071935143361651e-07, "loss": 1.5777, "step": 3012 }, { "epoch": 0.45441520247341827, "grad_norm": 0.2505843481528514, "learning_rate": 3.0708799257551124e-07, "loss": 1.6039, "step": 3013 }, { "epoch": 0.45456602066209184, "grad_norm": 0.5721456375520992, "learning_rate": 3.0698246361016855e-07, "loss": 1.4929, "step": 3014 }, { "epoch": 0.4547168388507654, "grad_norm": 0.27545721685806984, "learning_rate": 3.068769274638314e-07, "loss": 1.5755, "step": 3015 }, { "epoch": 0.45486765703943893, "grad_norm": 0.2782343763490815, "learning_rate": 3.0677138416019556e-07, "loss": 1.4811, "step": 3016 }, { "epoch": 0.4550184752281125, "grad_norm": 0.261269937796045, "learning_rate": 3.066658337229587e-07, "loss": 1.5506, "step": 3017 }, { "epoch": 0.4551692934167861, "grad_norm": 0.2710553683918107, "learning_rate": 3.0656027617582e-07, "loss": 1.634, "step": 3018 }, { "epoch": 0.4553201116054596, "grad_norm": 0.28984734875281387, "learning_rate": 3.064547115424802e-07, "loss": 1.5897, "step": 3019 }, { "epoch": 0.4554709297941332, "grad_norm": 0.2570452835035077, "learning_rate": 3.063491398466415e-07, "loss": 1.608, "step": 3020 }, { "epoch": 0.45562174798280675, "grad_norm": 0.27443754458266795, "learning_rate": 3.06243561112008e-07, "loss": 1.5134, "step": 3021 }, { "epoch": 0.45577256617148026, "grad_norm": 0.24472397015478006, "learning_rate": 3.0613797536228507e-07, "loss": 1.5881, "step": 3022 }, { "epoch": 0.45592338436015384, "grad_norm": 0.2797386856040963, "learning_rate": 3.060323826211799e-07, "loss": 1.5273, "step": 3023 }, { "epoch": 0.4560742025488274, "grad_norm": 0.24202137903029028, "learning_rate": 3.05926782912401e-07, "loss": 1.6152, "step": 3024 }, { "epoch": 0.45622502073750093, "grad_norm": 0.2589108746963981, "learning_rate": 3.0582117625965864e-07, "loss": 1.5963, "step": 3025 }, { "epoch": 0.4563758389261745, "grad_norm": 0.25427670811518555, "learning_rate": 3.0571556268666463e-07, "loss": 1.6224, "step": 3026 }, { "epoch": 0.4565266571148481, "grad_norm": 0.2587703030443672, "learning_rate": 3.0560994221713223e-07, "loss": 1.5612, "step": 3027 }, { "epoch": 0.4566774753035216, "grad_norm": 0.2886694938081297, "learning_rate": 3.055043148747764e-07, "loss": 1.5324, "step": 3028 }, { "epoch": 0.45682829349219517, "grad_norm": 0.31831612429832107, "learning_rate": 3.0539868068331344e-07, "loss": 1.518, "step": 3029 }, { "epoch": 0.4569791116808687, "grad_norm": 0.24782382733151562, "learning_rate": 3.0529303966646137e-07, "loss": 1.5928, "step": 3030 }, { "epoch": 0.45712992986954226, "grad_norm": 0.2599559228051775, "learning_rate": 3.051873918479397e-07, "loss": 1.5563, "step": 3031 }, { "epoch": 0.45728074805821584, "grad_norm": 0.2444135393221858, "learning_rate": 3.0508173725146934e-07, "loss": 1.5639, "step": 3032 }, { "epoch": 0.45743156624688935, "grad_norm": 0.25805125896054365, "learning_rate": 3.049760759007729e-07, "loss": 1.5088, "step": 3033 }, { "epoch": 0.4575823844355629, "grad_norm": 0.2738356446282475, "learning_rate": 3.0487040781957447e-07, "loss": 1.5391, "step": 3034 }, { "epoch": 0.4577332026242365, "grad_norm": 0.2570037548605811, "learning_rate": 3.0476473303159957e-07, "loss": 1.4543, "step": 3035 }, { "epoch": 0.45788402081291, "grad_norm": 0.271751224706055, "learning_rate": 3.0465905156057527e-07, "loss": 1.5705, "step": 3036 }, { "epoch": 0.4580348390015836, "grad_norm": 0.24170106683820172, "learning_rate": 3.045533634302301e-07, "loss": 1.506, "step": 3037 }, { "epoch": 0.45818565719025717, "grad_norm": 0.2590299736767002, "learning_rate": 3.044476686642941e-07, "loss": 1.5761, "step": 3038 }, { "epoch": 0.4583364753789307, "grad_norm": 0.23943188904155357, "learning_rate": 3.043419672864989e-07, "loss": 1.5635, "step": 3039 }, { "epoch": 0.45848729356760426, "grad_norm": 0.3128732040011657, "learning_rate": 3.042362593205775e-07, "loss": 1.5952, "step": 3040 }, { "epoch": 0.45863811175627783, "grad_norm": 0.25503919824492927, "learning_rate": 3.0413054479026445e-07, "loss": 1.575, "step": 3041 }, { "epoch": 0.45878892994495135, "grad_norm": 0.24210971507913032, "learning_rate": 3.040248237192958e-07, "loss": 1.5675, "step": 3042 }, { "epoch": 0.4589397481336249, "grad_norm": 0.26816280778517526, "learning_rate": 3.039190961314088e-07, "loss": 1.559, "step": 3043 }, { "epoch": 0.45909056632229844, "grad_norm": 0.3617187003581852, "learning_rate": 3.0381336205034254e-07, "loss": 1.5871, "step": 3044 }, { "epoch": 0.459241384510972, "grad_norm": 0.25964865678477417, "learning_rate": 3.0370762149983743e-07, "loss": 1.5954, "step": 3045 }, { "epoch": 0.4593922026996456, "grad_norm": 0.306725325338039, "learning_rate": 3.036018745036351e-07, "loss": 1.5613, "step": 3046 }, { "epoch": 0.4595430208883191, "grad_norm": 0.32936632251261777, "learning_rate": 3.034961210854791e-07, "loss": 1.5149, "step": 3047 }, { "epoch": 0.4596938390769927, "grad_norm": 0.25780996824913954, "learning_rate": 3.033903612691139e-07, "loss": 1.6034, "step": 3048 }, { "epoch": 0.45984465726566626, "grad_norm": 0.24749906096744248, "learning_rate": 3.032845950782859e-07, "loss": 1.5395, "step": 3049 }, { "epoch": 0.4599954754543398, "grad_norm": 0.27887263046014993, "learning_rate": 3.031788225367425e-07, "loss": 1.5389, "step": 3050 }, { "epoch": 0.46014629364301335, "grad_norm": 0.2564103298744664, "learning_rate": 3.0307304366823276e-07, "loss": 1.5118, "step": 3051 }, { "epoch": 0.4602971118316869, "grad_norm": 0.24527121197302515, "learning_rate": 3.029672584965072e-07, "loss": 1.6176, "step": 3052 }, { "epoch": 0.46044793002036044, "grad_norm": 0.25248485727309306, "learning_rate": 3.0286146704531756e-07, "loss": 1.5881, "step": 3053 }, { "epoch": 0.460598748209034, "grad_norm": 0.24647820855713623, "learning_rate": 3.027556693384172e-07, "loss": 1.6539, "step": 3054 }, { "epoch": 0.4607495663977076, "grad_norm": 0.24630504819560922, "learning_rate": 3.0264986539956065e-07, "loss": 1.5517, "step": 3055 }, { "epoch": 0.4609003845863811, "grad_norm": 0.2670965324389809, "learning_rate": 3.025440552525042e-07, "loss": 1.5974, "step": 3056 }, { "epoch": 0.4610512027750547, "grad_norm": 0.2439765244039028, "learning_rate": 3.02438238921005e-07, "loss": 1.5821, "step": 3057 }, { "epoch": 0.4612020209637282, "grad_norm": 0.24861411178427464, "learning_rate": 3.0233241642882225e-07, "loss": 1.5246, "step": 3058 }, { "epoch": 0.4613528391524018, "grad_norm": 0.3855492466112723, "learning_rate": 3.0222658779971587e-07, "loss": 1.5778, "step": 3059 }, { "epoch": 0.46150365734107535, "grad_norm": 0.3141524633237537, "learning_rate": 3.0212075305744767e-07, "loss": 1.521, "step": 3060 }, { "epoch": 0.46165447552974886, "grad_norm": 0.26132407224167226, "learning_rate": 3.020149122257805e-07, "loss": 1.578, "step": 3061 }, { "epoch": 0.46180529371842244, "grad_norm": 0.3654418283420951, "learning_rate": 3.019090653284789e-07, "loss": 1.562, "step": 3062 }, { "epoch": 0.461956111907096, "grad_norm": 0.6281917699855586, "learning_rate": 3.0180321238930833e-07, "loss": 1.6092, "step": 3063 }, { "epoch": 0.46210693009576953, "grad_norm": 0.4258721548197494, "learning_rate": 3.016973534320361e-07, "loss": 1.5411, "step": 3064 }, { "epoch": 0.4622577482844431, "grad_norm": 0.2658585703340917, "learning_rate": 3.015914884804304e-07, "loss": 1.5973, "step": 3065 }, { "epoch": 0.4624085664731167, "grad_norm": 0.25974042461853114, "learning_rate": 3.0148561755826117e-07, "loss": 1.6426, "step": 3066 }, { "epoch": 0.4625593846617902, "grad_norm": 0.36689913431669297, "learning_rate": 3.0137974068929945e-07, "loss": 1.593, "step": 3067 }, { "epoch": 0.46271020285046377, "grad_norm": 0.25700433697530406, "learning_rate": 3.0127385789731765e-07, "loss": 1.5169, "step": 3068 }, { "epoch": 0.46286102103913734, "grad_norm": 0.5325222957041995, "learning_rate": 3.011679692060896e-07, "loss": 1.5924, "step": 3069 }, { "epoch": 0.46301183922781086, "grad_norm": 0.28376571813597284, "learning_rate": 3.010620746393903e-07, "loss": 1.5968, "step": 3070 }, { "epoch": 0.46316265741648444, "grad_norm": 0.2497627878737484, "learning_rate": 3.0095617422099636e-07, "loss": 1.6907, "step": 3071 }, { "epoch": 0.463313475605158, "grad_norm": 0.258362993634684, "learning_rate": 3.008502679746853e-07, "loss": 1.5409, "step": 3072 }, { "epoch": 0.4634642937938315, "grad_norm": 0.27915505042843364, "learning_rate": 3.0074435592423624e-07, "loss": 1.569, "step": 3073 }, { "epoch": 0.4636151119825051, "grad_norm": 0.24501981442207682, "learning_rate": 3.0063843809342956e-07, "loss": 1.4864, "step": 3074 }, { "epoch": 0.4637659301711786, "grad_norm": 0.2510754371313347, "learning_rate": 3.005325145060468e-07, "loss": 1.5793, "step": 3075 }, { "epoch": 0.4639167483598522, "grad_norm": 0.2918637363545153, "learning_rate": 3.0042658518587095e-07, "loss": 1.5442, "step": 3076 }, { "epoch": 0.46406756654852577, "grad_norm": 0.41762300505875893, "learning_rate": 3.003206501566862e-07, "loss": 1.5794, "step": 3077 }, { "epoch": 0.4642183847371993, "grad_norm": 0.2499665774551008, "learning_rate": 3.002147094422781e-07, "loss": 1.4934, "step": 3078 }, { "epoch": 0.46436920292587286, "grad_norm": 0.26302306616238386, "learning_rate": 3.001087630664334e-07, "loss": 1.5621, "step": 3079 }, { "epoch": 0.46452002111454643, "grad_norm": 0.2897997184428593, "learning_rate": 3.0000281105294016e-07, "loss": 1.5685, "step": 3080 }, { "epoch": 0.46467083930321995, "grad_norm": 0.2920128436423677, "learning_rate": 2.998968534255877e-07, "loss": 1.4975, "step": 3081 }, { "epoch": 0.4648216574918935, "grad_norm": 0.2430946934216263, "learning_rate": 2.9979089020816656e-07, "loss": 1.5453, "step": 3082 }, { "epoch": 0.4649724756805671, "grad_norm": 0.29485762214691674, "learning_rate": 2.996849214244685e-07, "loss": 1.5913, "step": 3083 }, { "epoch": 0.4651232938692406, "grad_norm": 0.27298216278564874, "learning_rate": 2.995789470982867e-07, "loss": 1.604, "step": 3084 }, { "epoch": 0.4652741120579142, "grad_norm": 0.24041170018947816, "learning_rate": 2.994729672534155e-07, "loss": 1.637, "step": 3085 }, { "epoch": 0.46542493024658776, "grad_norm": 0.2443504997293059, "learning_rate": 2.9936698191365047e-07, "loss": 1.5101, "step": 3086 }, { "epoch": 0.4655757484352613, "grad_norm": 0.24946065590667868, "learning_rate": 2.9926099110278823e-07, "loss": 1.6411, "step": 3087 }, { "epoch": 0.46572656662393486, "grad_norm": 0.2517409955245919, "learning_rate": 2.99154994844627e-07, "loss": 1.5731, "step": 3088 }, { "epoch": 0.4658773848126084, "grad_norm": 0.4712226365325641, "learning_rate": 2.9904899316296593e-07, "loss": 1.5635, "step": 3089 }, { "epoch": 0.46602820300128195, "grad_norm": 0.26267189317506817, "learning_rate": 2.9894298608160557e-07, "loss": 1.5482, "step": 3090 }, { "epoch": 0.4661790211899555, "grad_norm": 0.25096318846535004, "learning_rate": 2.9883697362434743e-07, "loss": 1.5473, "step": 3091 }, { "epoch": 0.46632983937862904, "grad_norm": 0.26490772731617646, "learning_rate": 2.9873095581499456e-07, "loss": 1.5459, "step": 3092 }, { "epoch": 0.4664806575673026, "grad_norm": 0.25622179136950635, "learning_rate": 2.986249326773511e-07, "loss": 1.5778, "step": 3093 }, { "epoch": 0.4666314757559762, "grad_norm": 0.4181208343695358, "learning_rate": 2.98518904235222e-07, "loss": 1.5525, "step": 3094 }, { "epoch": 0.4667822939446497, "grad_norm": 0.24402009173677996, "learning_rate": 2.984128705124141e-07, "loss": 1.4885, "step": 3095 }, { "epoch": 0.4669331121333233, "grad_norm": 0.27012935338310423, "learning_rate": 2.983068315327349e-07, "loss": 1.5589, "step": 3096 }, { "epoch": 0.46708393032199685, "grad_norm": 0.2822387723132698, "learning_rate": 2.982007873199932e-07, "loss": 1.583, "step": 3097 }, { "epoch": 0.4672347485106704, "grad_norm": 1.384971284858126, "learning_rate": 2.980947378979991e-07, "loss": 1.5548, "step": 3098 }, { "epoch": 0.46738556669934395, "grad_norm": 0.24793988515751572, "learning_rate": 2.9798868329056377e-07, "loss": 1.5959, "step": 3099 }, { "epoch": 0.4675363848880175, "grad_norm": 0.2562310729223106, "learning_rate": 2.978826235214995e-07, "loss": 1.5912, "step": 3100 }, { "epoch": 0.46768720307669104, "grad_norm": 0.26118119823269886, "learning_rate": 2.9777655861461983e-07, "loss": 1.5677, "step": 3101 }, { "epoch": 0.4678380212653646, "grad_norm": 0.25834739995520306, "learning_rate": 2.976704885937394e-07, "loss": 1.5678, "step": 3102 }, { "epoch": 0.46798883945403813, "grad_norm": 0.34608384913561635, "learning_rate": 2.9756441348267415e-07, "loss": 1.5201, "step": 3103 }, { "epoch": 0.4681396576427117, "grad_norm": 0.3488184526720022, "learning_rate": 2.9745833330524077e-07, "loss": 1.5947, "step": 3104 }, { "epoch": 0.4682904758313853, "grad_norm": 0.24643312832198783, "learning_rate": 2.973522480852576e-07, "loss": 1.5429, "step": 3105 }, { "epoch": 0.4684412940200588, "grad_norm": 0.24367773790462405, "learning_rate": 2.9724615784654376e-07, "loss": 1.577, "step": 3106 }, { "epoch": 0.46859211220873237, "grad_norm": 0.25019008015061117, "learning_rate": 2.9714006261291964e-07, "loss": 1.5464, "step": 3107 }, { "epoch": 0.46874293039740594, "grad_norm": 1.4631440444528507, "learning_rate": 2.970339624082066e-07, "loss": 1.53, "step": 3108 }, { "epoch": 0.46889374858607946, "grad_norm": 0.2553858145910921, "learning_rate": 2.9692785725622737e-07, "loss": 1.602, "step": 3109 }, { "epoch": 0.46904456677475304, "grad_norm": 0.28008841638244675, "learning_rate": 2.968217471808055e-07, "loss": 1.5214, "step": 3110 }, { "epoch": 0.4691953849634266, "grad_norm": 0.465635413118692, "learning_rate": 2.967156322057659e-07, "loss": 1.6019, "step": 3111 }, { "epoch": 0.4693462031521001, "grad_norm": 0.26496000760437305, "learning_rate": 2.966095123549345e-07, "loss": 1.5575, "step": 3112 }, { "epoch": 0.4694970213407737, "grad_norm": 0.2574373126488699, "learning_rate": 2.965033876521382e-07, "loss": 1.588, "step": 3113 }, { "epoch": 0.4696478395294473, "grad_norm": 0.2541011934834178, "learning_rate": 2.9639725812120505e-07, "loss": 1.6224, "step": 3114 }, { "epoch": 0.4697986577181208, "grad_norm": 0.25609855203229226, "learning_rate": 2.962911237859643e-07, "loss": 1.6339, "step": 3115 }, { "epoch": 0.46994947590679437, "grad_norm": 0.2457549898166761, "learning_rate": 2.9618498467024634e-07, "loss": 1.6087, "step": 3116 }, { "epoch": 0.47010029409546794, "grad_norm": 0.26116033707396874, "learning_rate": 2.960788407978822e-07, "loss": 1.472, "step": 3117 }, { "epoch": 0.47025111228414146, "grad_norm": 0.2494061211539528, "learning_rate": 2.9597269219270444e-07, "loss": 1.5734, "step": 3118 }, { "epoch": 0.47040193047281503, "grad_norm": 0.24559955866207117, "learning_rate": 2.9586653887854655e-07, "loss": 1.5252, "step": 3119 }, { "epoch": 0.47055274866148855, "grad_norm": 0.24695024440276261, "learning_rate": 2.957603808792429e-07, "loss": 1.5879, "step": 3120 }, { "epoch": 0.4707035668501621, "grad_norm": 0.7056298660755264, "learning_rate": 2.956542182186293e-07, "loss": 1.5673, "step": 3121 }, { "epoch": 0.4708543850388357, "grad_norm": 6.141345332581017, "learning_rate": 2.95548050920542e-07, "loss": 1.613, "step": 3122 }, { "epoch": 0.4710052032275092, "grad_norm": 0.26760459105842394, "learning_rate": 2.9544187900881893e-07, "loss": 1.5802, "step": 3123 }, { "epoch": 0.4711560214161828, "grad_norm": 0.27852768061774646, "learning_rate": 2.9533570250729875e-07, "loss": 1.5725, "step": 3124 }, { "epoch": 0.47130683960485636, "grad_norm": 0.2553576747982788, "learning_rate": 2.95229521439821e-07, "loss": 1.4959, "step": 3125 }, { "epoch": 0.4714576577935299, "grad_norm": 1.228355826218024, "learning_rate": 2.951233358302266e-07, "loss": 1.5456, "step": 3126 }, { "epoch": 0.47160847598220346, "grad_norm": 0.26371091643543787, "learning_rate": 2.950171457023572e-07, "loss": 1.5563, "step": 3127 }, { "epoch": 0.47175929417087703, "grad_norm": 0.2970776800812065, "learning_rate": 2.9491095108005567e-07, "loss": 1.576, "step": 3128 }, { "epoch": 0.47191011235955055, "grad_norm": 0.25871172786052893, "learning_rate": 2.9480475198716576e-07, "loss": 1.4758, "step": 3129 }, { "epoch": 0.4720609305482241, "grad_norm": 0.25080898052773576, "learning_rate": 2.9469854844753217e-07, "loss": 1.6025, "step": 3130 }, { "epoch": 0.4722117487368977, "grad_norm": 0.264607073476825, "learning_rate": 2.9459234048500086e-07, "loss": 1.5558, "step": 3131 }, { "epoch": 0.4723625669255712, "grad_norm": 0.25309447185710987, "learning_rate": 2.9448612812341855e-07, "loss": 1.5803, "step": 3132 }, { "epoch": 0.4725133851142448, "grad_norm": 0.2651919462998949, "learning_rate": 2.9437991138663286e-07, "loss": 1.5595, "step": 3133 }, { "epoch": 0.4726642033029183, "grad_norm": 0.25671274255063314, "learning_rate": 2.9427369029849276e-07, "loss": 1.6497, "step": 3134 }, { "epoch": 0.4728150214915919, "grad_norm": 0.2523704972360593, "learning_rate": 2.941674648828478e-07, "loss": 1.5396, "step": 3135 }, { "epoch": 0.47296583968026545, "grad_norm": 0.24582104701925128, "learning_rate": 2.9406123516354873e-07, "loss": 1.56, "step": 3136 }, { "epoch": 0.473116657868939, "grad_norm": 0.2414495282978198, "learning_rate": 2.9395500116444736e-07, "loss": 1.4739, "step": 3137 }, { "epoch": 0.47326747605761255, "grad_norm": 0.2700220556284739, "learning_rate": 2.9384876290939613e-07, "loss": 1.5267, "step": 3138 }, { "epoch": 0.4734182942462861, "grad_norm": 0.2502324744654963, "learning_rate": 2.9374252042224875e-07, "loss": 1.5575, "step": 3139 }, { "epoch": 0.47356911243495964, "grad_norm": 0.24877617133567306, "learning_rate": 2.9363627372685957e-07, "loss": 1.5306, "step": 3140 }, { "epoch": 0.4737199306236332, "grad_norm": 0.2763882887099, "learning_rate": 2.935300228470843e-07, "loss": 1.5368, "step": 3141 }, { "epoch": 0.4738707488123068, "grad_norm": 0.23525758933340374, "learning_rate": 2.934237678067792e-07, "loss": 1.5106, "step": 3142 }, { "epoch": 0.4740215670009803, "grad_norm": 0.2940263563569856, "learning_rate": 2.9331750862980166e-07, "loss": 1.5898, "step": 3143 }, { "epoch": 0.4741723851896539, "grad_norm": 0.2596657573389365, "learning_rate": 2.9321124534001e-07, "loss": 1.5476, "step": 3144 }, { "epoch": 0.47432320337832745, "grad_norm": 0.2529751981624563, "learning_rate": 2.931049779612634e-07, "loss": 1.5242, "step": 3145 }, { "epoch": 0.47447402156700097, "grad_norm": 0.2500990983748969, "learning_rate": 2.929987065174219e-07, "loss": 1.6329, "step": 3146 }, { "epoch": 0.47462483975567454, "grad_norm": 0.24968977008231993, "learning_rate": 2.9289243103234665e-07, "loss": 1.5681, "step": 3147 }, { "epoch": 0.47477565794434806, "grad_norm": 0.24654663425811366, "learning_rate": 2.9278615152989955e-07, "loss": 1.5237, "step": 3148 }, { "epoch": 0.47492647613302164, "grad_norm": 0.24200923324226278, "learning_rate": 2.9267986803394346e-07, "loss": 1.482, "step": 3149 }, { "epoch": 0.4750772943216952, "grad_norm": 0.26612235405655793, "learning_rate": 2.9257358056834206e-07, "loss": 1.604, "step": 3150 }, { "epoch": 0.4752281125103687, "grad_norm": 0.24988595389853024, "learning_rate": 2.9246728915696006e-07, "loss": 1.5565, "step": 3151 }, { "epoch": 0.4753789306990423, "grad_norm": 0.2651130099799271, "learning_rate": 2.923609938236629e-07, "loss": 1.4943, "step": 3152 }, { "epoch": 0.4755297488877159, "grad_norm": 0.28713226071833503, "learning_rate": 2.9225469459231703e-07, "loss": 1.5498, "step": 3153 }, { "epoch": 0.4756805670763894, "grad_norm": 0.2525910632174739, "learning_rate": 2.921483914867897e-07, "loss": 1.5784, "step": 3154 }, { "epoch": 0.47583138526506297, "grad_norm": 0.3356412422958949, "learning_rate": 2.920420845309492e-07, "loss": 1.5583, "step": 3155 }, { "epoch": 0.47598220345373654, "grad_norm": 0.24307731624206946, "learning_rate": 2.9193577374866423e-07, "loss": 1.5684, "step": 3156 }, { "epoch": 0.47613302164241006, "grad_norm": 0.24358134785836383, "learning_rate": 2.9182945916380493e-07, "loss": 1.6196, "step": 3157 }, { "epoch": 0.47628383983108363, "grad_norm": 0.30551696235434356, "learning_rate": 2.9172314080024187e-07, "loss": 1.5296, "step": 3158 }, { "epoch": 0.4764346580197572, "grad_norm": 0.2881545017403535, "learning_rate": 2.9161681868184666e-07, "loss": 1.6224, "step": 3159 }, { "epoch": 0.4765854762084307, "grad_norm": 0.25304889949388404, "learning_rate": 2.915104928324917e-07, "loss": 1.5174, "step": 3160 }, { "epoch": 0.4767362943971043, "grad_norm": 0.24605270505538374, "learning_rate": 2.9140416327605025e-07, "loss": 1.6102, "step": 3161 }, { "epoch": 0.4768871125857779, "grad_norm": 0.25091348999395624, "learning_rate": 2.9129783003639657e-07, "loss": 1.5096, "step": 3162 }, { "epoch": 0.4770379307744514, "grad_norm": 0.24205045088318083, "learning_rate": 2.911914931374052e-07, "loss": 1.5645, "step": 3163 }, { "epoch": 0.47718874896312496, "grad_norm": 0.2478677211449069, "learning_rate": 2.910851526029522e-07, "loss": 1.547, "step": 3164 }, { "epoch": 0.4773395671517985, "grad_norm": 0.2635117483325081, "learning_rate": 2.9097880845691397e-07, "loss": 1.516, "step": 3165 }, { "epoch": 0.47749038534047206, "grad_norm": 0.24323263414281793, "learning_rate": 2.9087246072316787e-07, "loss": 1.5332, "step": 3166 }, { "epoch": 0.47764120352914563, "grad_norm": 0.29558092002313663, "learning_rate": 2.9076610942559204e-07, "loss": 1.6102, "step": 3167 }, { "epoch": 0.47779202171781915, "grad_norm": 0.24736869832808636, "learning_rate": 2.9065975458806553e-07, "loss": 1.5347, "step": 3168 }, { "epoch": 0.4779428399064927, "grad_norm": 0.2646388181945015, "learning_rate": 2.90553396234468e-07, "loss": 1.5861, "step": 3169 }, { "epoch": 0.4780936580951663, "grad_norm": 0.27381585079018494, "learning_rate": 2.904470343886801e-07, "loss": 1.5137, "step": 3170 }, { "epoch": 0.4782444762838398, "grad_norm": 0.2423324557476058, "learning_rate": 2.903406690745831e-07, "loss": 1.5832, "step": 3171 }, { "epoch": 0.4783952944725134, "grad_norm": 0.2517611124533566, "learning_rate": 2.9023430031605923e-07, "loss": 1.5537, "step": 3172 }, { "epoch": 0.47854611266118696, "grad_norm": 0.24805179563772367, "learning_rate": 2.901279281369911e-07, "loss": 1.5987, "step": 3173 }, { "epoch": 0.4786969308498605, "grad_norm": 0.26021823639773467, "learning_rate": 2.900215525612626e-07, "loss": 1.5365, "step": 3174 }, { "epoch": 0.47884774903853405, "grad_norm": 0.24792292288618362, "learning_rate": 2.899151736127581e-07, "loss": 1.5926, "step": 3175 }, { "epoch": 0.47899856722720763, "grad_norm": 0.2415848305018361, "learning_rate": 2.898087913153627e-07, "loss": 1.5433, "step": 3176 }, { "epoch": 0.47914938541588115, "grad_norm": 0.6260731509433154, "learning_rate": 2.8970240569296236e-07, "loss": 1.5663, "step": 3177 }, { "epoch": 0.4793002036045547, "grad_norm": 0.24539307819478479, "learning_rate": 2.8959601676944383e-07, "loss": 1.5906, "step": 3178 }, { "epoch": 0.47945102179322824, "grad_norm": 0.25790011668612844, "learning_rate": 2.8948962456869437e-07, "loss": 1.6438, "step": 3179 }, { "epoch": 0.4796018399819018, "grad_norm": 0.2505504043774616, "learning_rate": 2.8938322911460226e-07, "loss": 1.5964, "step": 3180 }, { "epoch": 0.4797526581705754, "grad_norm": 0.2434342014035931, "learning_rate": 2.8927683043105633e-07, "loss": 1.5113, "step": 3181 }, { "epoch": 0.4799034763592489, "grad_norm": 0.23872019889510285, "learning_rate": 2.8917042854194614e-07, "loss": 1.6068, "step": 3182 }, { "epoch": 0.4800542945479225, "grad_norm": 0.23845824444940553, "learning_rate": 2.890640234711621e-07, "loss": 1.5804, "step": 3183 }, { "epoch": 0.48020511273659605, "grad_norm": 0.3319557403459845, "learning_rate": 2.8895761524259515e-07, "loss": 1.5272, "step": 3184 }, { "epoch": 0.48035593092526957, "grad_norm": 0.24694649494124626, "learning_rate": 2.888512038801372e-07, "loss": 1.5647, "step": 3185 }, { "epoch": 0.48050674911394314, "grad_norm": 0.2461811353531077, "learning_rate": 2.8874478940768044e-07, "loss": 1.569, "step": 3186 }, { "epoch": 0.4806575673026167, "grad_norm": 0.25810955839113636, "learning_rate": 2.886383718491182e-07, "loss": 1.5937, "step": 3187 }, { "epoch": 0.48080838549129024, "grad_norm": 0.2366926661385369, "learning_rate": 2.8853195122834435e-07, "loss": 1.562, "step": 3188 }, { "epoch": 0.4809592036799638, "grad_norm": 0.2504882681866331, "learning_rate": 2.8842552756925333e-07, "loss": 1.598, "step": 3189 }, { "epoch": 0.4811100218686374, "grad_norm": 0.3337958326699907, "learning_rate": 2.883191008957404e-07, "loss": 1.5665, "step": 3190 }, { "epoch": 0.4812608400573109, "grad_norm": 0.23940063487306815, "learning_rate": 2.882126712317013e-07, "loss": 1.5629, "step": 3191 }, { "epoch": 0.4814116582459845, "grad_norm": 0.2541958915384889, "learning_rate": 2.8810623860103284e-07, "loss": 1.6082, "step": 3192 }, { "epoch": 0.481562476434658, "grad_norm": 0.24315381580335652, "learning_rate": 2.879998030276321e-07, "loss": 1.5353, "step": 3193 }, { "epoch": 0.48171329462333157, "grad_norm": 0.244890391464526, "learning_rate": 2.8789336453539697e-07, "loss": 1.5962, "step": 3194 }, { "epoch": 0.48186411281200514, "grad_norm": 0.3054327723434127, "learning_rate": 2.87786923148226e-07, "loss": 1.5922, "step": 3195 }, { "epoch": 0.48201493100067866, "grad_norm": 0.28288167217486754, "learning_rate": 2.876804788900184e-07, "loss": 1.5471, "step": 3196 }, { "epoch": 0.48216574918935223, "grad_norm": 0.24939943317378474, "learning_rate": 2.87574031784674e-07, "loss": 1.5403, "step": 3197 }, { "epoch": 0.4823165673780258, "grad_norm": 0.2631547162360356, "learning_rate": 2.8746758185609327e-07, "loss": 1.5974, "step": 3198 }, { "epoch": 0.4824673855666993, "grad_norm": 10.530002750056518, "learning_rate": 2.873611291281773e-07, "loss": 1.58, "step": 3199 }, { "epoch": 0.4826182037553729, "grad_norm": 0.26162315909216316, "learning_rate": 2.872546736248278e-07, "loss": 1.4948, "step": 3200 }, { "epoch": 0.4827690219440465, "grad_norm": 0.24212148008370052, "learning_rate": 2.8714821536994726e-07, "loss": 1.5124, "step": 3201 }, { "epoch": 0.48291984013272, "grad_norm": 0.2801617978718279, "learning_rate": 2.870417543874385e-07, "loss": 1.5935, "step": 3202 }, { "epoch": 0.48307065832139356, "grad_norm": 0.2387419566852273, "learning_rate": 2.869352907012053e-07, "loss": 1.5464, "step": 3203 }, { "epoch": 0.48322147651006714, "grad_norm": 0.26099184209771586, "learning_rate": 2.868288243351516e-07, "loss": 1.5626, "step": 3204 }, { "epoch": 0.48337229469874066, "grad_norm": 0.25288447335531195, "learning_rate": 2.867223553131824e-07, "loss": 1.5689, "step": 3205 }, { "epoch": 0.48352311288741423, "grad_norm": 0.39672504202397724, "learning_rate": 2.866158836592031e-07, "loss": 1.5479, "step": 3206 }, { "epoch": 0.48367393107608775, "grad_norm": 0.26974266115825624, "learning_rate": 2.865094093971195e-07, "loss": 1.5265, "step": 3207 }, { "epoch": 0.4838247492647613, "grad_norm": 0.24965044998372343, "learning_rate": 2.8640293255083836e-07, "loss": 1.6274, "step": 3208 }, { "epoch": 0.4839755674534349, "grad_norm": 0.24944590452093887, "learning_rate": 2.862964531442668e-07, "loss": 1.5188, "step": 3209 }, { "epoch": 0.4841263856421084, "grad_norm": 0.2565371544538948, "learning_rate": 2.8618997120131257e-07, "loss": 1.6013, "step": 3210 }, { "epoch": 0.484277203830782, "grad_norm": 0.33798667562659546, "learning_rate": 2.860834867458838e-07, "loss": 1.5895, "step": 3211 }, { "epoch": 0.48442802201945556, "grad_norm": 0.2812333198255085, "learning_rate": 2.859769998018896e-07, "loss": 1.5577, "step": 3212 }, { "epoch": 0.4845788402081291, "grad_norm": 0.2455805621250143, "learning_rate": 2.8587051039323907e-07, "loss": 1.5776, "step": 3213 }, { "epoch": 0.48472965839680265, "grad_norm": 0.26926124956469405, "learning_rate": 2.8576401854384254e-07, "loss": 1.5413, "step": 3214 }, { "epoch": 0.48488047658547623, "grad_norm": 0.2591508858213702, "learning_rate": 2.856575242776103e-07, "loss": 1.6702, "step": 3215 }, { "epoch": 0.48503129477414975, "grad_norm": 0.28225120745662957, "learning_rate": 2.855510276184535e-07, "loss": 1.5548, "step": 3216 }, { "epoch": 0.4851821129628233, "grad_norm": 0.25265566608438506, "learning_rate": 2.8544452859028366e-07, "loss": 1.6464, "step": 3217 }, { "epoch": 0.4853329311514969, "grad_norm": 0.2517714020018506, "learning_rate": 2.85338027217013e-07, "loss": 1.5778, "step": 3218 }, { "epoch": 0.4854837493401704, "grad_norm": 0.3158410705985503, "learning_rate": 2.852315235225542e-07, "loss": 1.5697, "step": 3219 }, { "epoch": 0.485634567528844, "grad_norm": 0.24993883929936955, "learning_rate": 2.8512501753082034e-07, "loss": 1.586, "step": 3220 }, { "epoch": 0.48578538571751756, "grad_norm": 0.24254905837014484, "learning_rate": 2.850185092657252e-07, "loss": 1.5374, "step": 3221 }, { "epoch": 0.4859362039061911, "grad_norm": 0.2527740538594686, "learning_rate": 2.84911998751183e-07, "loss": 1.6025, "step": 3222 }, { "epoch": 0.48608702209486465, "grad_norm": 0.25117848213803096, "learning_rate": 2.848054860111083e-07, "loss": 1.6301, "step": 3223 }, { "epoch": 0.48623784028353817, "grad_norm": 0.25643678570548345, "learning_rate": 2.8469897106941655e-07, "loss": 1.575, "step": 3224 }, { "epoch": 0.48638865847221174, "grad_norm": 0.25586334913961334, "learning_rate": 2.8459245395002325e-07, "loss": 1.5797, "step": 3225 }, { "epoch": 0.4865394766608853, "grad_norm": 0.25979692487366346, "learning_rate": 2.8448593467684465e-07, "loss": 1.5381, "step": 3226 }, { "epoch": 0.48669029484955884, "grad_norm": 0.2535970150070719, "learning_rate": 2.843794132737975e-07, "loss": 1.6001, "step": 3227 }, { "epoch": 0.4868411130382324, "grad_norm": 0.2596012177129744, "learning_rate": 2.842728897647989e-07, "loss": 1.5991, "step": 3228 }, { "epoch": 0.486991931226906, "grad_norm": 0.2942803486057231, "learning_rate": 2.8416636417376656e-07, "loss": 1.5302, "step": 3229 }, { "epoch": 0.4871427494155795, "grad_norm": 0.25708300455766936, "learning_rate": 2.840598365246184e-07, "loss": 1.5565, "step": 3230 }, { "epoch": 0.4872935676042531, "grad_norm": 0.8878456230619468, "learning_rate": 2.839533068412731e-07, "loss": 1.5992, "step": 3231 }, { "epoch": 0.48744438579292665, "grad_norm": 0.28130475511678493, "learning_rate": 2.8384677514764975e-07, "loss": 1.5777, "step": 3232 }, { "epoch": 0.48759520398160017, "grad_norm": 0.2500117516063452, "learning_rate": 2.8374024146766775e-07, "loss": 1.6402, "step": 3233 }, { "epoch": 0.48774602217027374, "grad_norm": 0.2514115286658896, "learning_rate": 2.836337058252469e-07, "loss": 1.5044, "step": 3234 }, { "epoch": 0.4878968403589473, "grad_norm": 0.25314414769145527, "learning_rate": 2.8352716824430764e-07, "loss": 1.5921, "step": 3235 }, { "epoch": 0.48804765854762083, "grad_norm": 0.24900053045118872, "learning_rate": 2.834206287487708e-07, "loss": 1.5998, "step": 3236 }, { "epoch": 0.4881984767362944, "grad_norm": 0.2439659282480114, "learning_rate": 2.8331408736255765e-07, "loss": 1.5143, "step": 3237 }, { "epoch": 0.4883492949249679, "grad_norm": 0.25941843445222496, "learning_rate": 2.8320754410958964e-07, "loss": 1.5795, "step": 3238 }, { "epoch": 0.4885001131136415, "grad_norm": 0.41423721903258276, "learning_rate": 2.8310099901378897e-07, "loss": 1.5428, "step": 3239 }, { "epoch": 0.4886509313023151, "grad_norm": 0.33465417546104953, "learning_rate": 2.829944520990782e-07, "loss": 1.5999, "step": 3240 }, { "epoch": 0.4888017494909886, "grad_norm": 0.3828687331908, "learning_rate": 2.8288790338937996e-07, "loss": 1.5753, "step": 3241 }, { "epoch": 0.48895256767966216, "grad_norm": 0.2832946698386233, "learning_rate": 2.827813529086178e-07, "loss": 1.5806, "step": 3242 }, { "epoch": 0.48910338586833574, "grad_norm": 0.30331355605832694, "learning_rate": 2.8267480068071525e-07, "loss": 1.6383, "step": 3243 }, { "epoch": 0.48925420405700926, "grad_norm": 0.25922187834565497, "learning_rate": 2.825682467295964e-07, "loss": 1.5128, "step": 3244 }, { "epoch": 0.48940502224568283, "grad_norm": 0.2527270720867368, "learning_rate": 2.8246169107918577e-07, "loss": 1.5834, "step": 3245 }, { "epoch": 0.4895558404343564, "grad_norm": 0.4260916861411897, "learning_rate": 2.823551337534082e-07, "loss": 1.5579, "step": 3246 }, { "epoch": 0.4897066586230299, "grad_norm": 0.2583051452559372, "learning_rate": 2.8224857477618896e-07, "loss": 1.5318, "step": 3247 }, { "epoch": 0.4898574768117035, "grad_norm": 0.25639898547576445, "learning_rate": 2.821420141714535e-07, "loss": 1.5402, "step": 3248 }, { "epoch": 0.49000829500037707, "grad_norm": 0.2561054616826835, "learning_rate": 2.8203545196312783e-07, "loss": 1.5973, "step": 3249 }, { "epoch": 0.4901591131890506, "grad_norm": 0.24645710856747305, "learning_rate": 2.819288881751384e-07, "loss": 1.6246, "step": 3250 }, { "epoch": 0.49030993137772416, "grad_norm": 0.295121881766085, "learning_rate": 2.8182232283141167e-07, "loss": 1.5685, "step": 3251 }, { "epoch": 0.4904607495663977, "grad_norm": 0.26948819881038233, "learning_rate": 2.8171575595587485e-07, "loss": 1.6287, "step": 3252 }, { "epoch": 0.49061156775507125, "grad_norm": 0.23821034619382012, "learning_rate": 2.816091875724552e-07, "loss": 1.5834, "step": 3253 }, { "epoch": 0.4907623859437448, "grad_norm": 0.23316130818787745, "learning_rate": 2.8150261770508043e-07, "loss": 1.5489, "step": 3254 }, { "epoch": 0.49091320413241835, "grad_norm": 0.2536546870683742, "learning_rate": 2.813960463776787e-07, "loss": 1.643, "step": 3255 }, { "epoch": 0.4910640223210919, "grad_norm": 0.2569158519683004, "learning_rate": 2.8128947361417824e-07, "loss": 1.6269, "step": 3256 }, { "epoch": 0.4912148405097655, "grad_norm": 0.24099574529841028, "learning_rate": 2.8118289943850775e-07, "loss": 1.6473, "step": 3257 }, { "epoch": 0.491365658698439, "grad_norm": 0.303913762146176, "learning_rate": 2.8107632387459633e-07, "loss": 1.5334, "step": 3258 }, { "epoch": 0.4915164768871126, "grad_norm": 0.29823244363900175, "learning_rate": 2.809697469463732e-07, "loss": 1.5865, "step": 3259 }, { "epoch": 0.49166729507578616, "grad_norm": 0.2751509847697237, "learning_rate": 2.8086316867776807e-07, "loss": 1.5668, "step": 3260 }, { "epoch": 0.4918181132644597, "grad_norm": 0.2523827456942733, "learning_rate": 2.8075658909271077e-07, "loss": 1.65, "step": 3261 }, { "epoch": 0.49196893145313325, "grad_norm": 0.3973065228061601, "learning_rate": 2.806500082151316e-07, "loss": 1.603, "step": 3262 }, { "epoch": 0.4921197496418068, "grad_norm": 0.24793582805269612, "learning_rate": 2.8054342606896096e-07, "loss": 1.534, "step": 3263 }, { "epoch": 0.49227056783048034, "grad_norm": 0.25163649978972913, "learning_rate": 2.804368426781298e-07, "loss": 1.6261, "step": 3264 }, { "epoch": 0.4924213860191539, "grad_norm": 0.2520544100110073, "learning_rate": 2.8033025806656907e-07, "loss": 1.6166, "step": 3265 }, { "epoch": 0.4925722042078275, "grad_norm": 0.3305439844269301, "learning_rate": 2.802236722582102e-07, "loss": 1.5444, "step": 3266 }, { "epoch": 0.492723022396501, "grad_norm": 0.2599777522374565, "learning_rate": 2.8011708527698476e-07, "loss": 1.5614, "step": 3267 }, { "epoch": 0.4928738405851746, "grad_norm": 0.25958720907226457, "learning_rate": 2.8001049714682467e-07, "loss": 1.5006, "step": 3268 }, { "epoch": 0.4930246587738481, "grad_norm": 0.254499677063202, "learning_rate": 2.79903907891662e-07, "loss": 1.5317, "step": 3269 }, { "epoch": 0.4931754769625217, "grad_norm": 0.26686274358282275, "learning_rate": 2.797973175354293e-07, "loss": 1.6463, "step": 3270 }, { "epoch": 0.49332629515119525, "grad_norm": 0.2699881617841487, "learning_rate": 2.79690726102059e-07, "loss": 1.633, "step": 3271 }, { "epoch": 0.49347711333986877, "grad_norm": 0.25557733518747255, "learning_rate": 2.7958413361548413e-07, "loss": 1.5208, "step": 3272 }, { "epoch": 0.49362793152854234, "grad_norm": 0.2821977085023352, "learning_rate": 2.7947754009963774e-07, "loss": 1.6234, "step": 3273 }, { "epoch": 0.4937787497172159, "grad_norm": 0.27398607201443076, "learning_rate": 2.7937094557845323e-07, "loss": 1.6299, "step": 3274 }, { "epoch": 0.49392956790588943, "grad_norm": 0.25014826686171554, "learning_rate": 2.792643500758641e-07, "loss": 1.6191, "step": 3275 }, { "epoch": 0.494080386094563, "grad_norm": 0.3087411622057582, "learning_rate": 2.7915775361580425e-07, "loss": 1.519, "step": 3276 }, { "epoch": 0.4942312042832366, "grad_norm": 0.2584795757804951, "learning_rate": 2.7905115622220756e-07, "loss": 1.6041, "step": 3277 }, { "epoch": 0.4943820224719101, "grad_norm": 0.2524777500062278, "learning_rate": 2.7894455791900835e-07, "loss": 1.5541, "step": 3278 }, { "epoch": 0.4945328406605837, "grad_norm": 0.24427578751511272, "learning_rate": 2.78837958730141e-07, "loss": 1.598, "step": 3279 }, { "epoch": 0.49468365884925725, "grad_norm": 0.42420902335941657, "learning_rate": 2.787313586795401e-07, "loss": 1.562, "step": 3280 }, { "epoch": 0.49483447703793076, "grad_norm": 0.2529363586213235, "learning_rate": 2.786247577911405e-07, "loss": 1.5088, "step": 3281 }, { "epoch": 0.49498529522660434, "grad_norm": 0.25403005821628705, "learning_rate": 2.785181560888773e-07, "loss": 1.6318, "step": 3282 }, { "epoch": 0.49513611341527786, "grad_norm": 0.2370884087039444, "learning_rate": 2.7841155359668557e-07, "loss": 1.5434, "step": 3283 }, { "epoch": 0.49528693160395143, "grad_norm": 0.25205579124353483, "learning_rate": 2.7830495033850063e-07, "loss": 1.5691, "step": 3284 }, { "epoch": 0.495437749792625, "grad_norm": 0.25856685206236524, "learning_rate": 2.7819834633825813e-07, "loss": 1.5557, "step": 3285 }, { "epoch": 0.4955885679812985, "grad_norm": 0.24533069937513297, "learning_rate": 2.780917416198938e-07, "loss": 1.5206, "step": 3286 }, { "epoch": 0.4957393861699721, "grad_norm": 0.24305954156589926, "learning_rate": 2.779851362073434e-07, "loss": 1.5174, "step": 3287 }, { "epoch": 0.49589020435864567, "grad_norm": 0.25054942239574796, "learning_rate": 2.778785301245429e-07, "loss": 1.5329, "step": 3288 }, { "epoch": 0.4960410225473192, "grad_norm": 2.0338205742667887, "learning_rate": 2.7777192339542867e-07, "loss": 1.5391, "step": 3289 }, { "epoch": 0.49619184073599276, "grad_norm": 0.25795490828469647, "learning_rate": 2.7766531604393684e-07, "loss": 1.5867, "step": 3290 }, { "epoch": 0.49634265892466634, "grad_norm": 0.2574914292698842, "learning_rate": 2.7755870809400396e-07, "loss": 1.5698, "step": 3291 }, { "epoch": 0.49649347711333985, "grad_norm": 0.7169073199522987, "learning_rate": 2.7745209956956653e-07, "loss": 1.5594, "step": 3292 }, { "epoch": 0.4966442953020134, "grad_norm": 0.26192159719759245, "learning_rate": 2.7734549049456145e-07, "loss": 1.5836, "step": 3293 }, { "epoch": 0.496795113490687, "grad_norm": 0.24903268730487435, "learning_rate": 2.772388808929254e-07, "loss": 1.511, "step": 3294 }, { "epoch": 0.4969459316793605, "grad_norm": 0.2459377727366106, "learning_rate": 2.771322707885953e-07, "loss": 1.5383, "step": 3295 }, { "epoch": 0.4970967498680341, "grad_norm": 0.24091784685121265, "learning_rate": 2.770256602055084e-07, "loss": 1.5624, "step": 3296 }, { "epoch": 0.4972475680567076, "grad_norm": 0.2444409709467392, "learning_rate": 2.7691904916760184e-07, "loss": 1.635, "step": 3297 }, { "epoch": 0.4973983862453812, "grad_norm": 0.25870568500425084, "learning_rate": 2.768124376988127e-07, "loss": 1.5313, "step": 3298 }, { "epoch": 0.49754920443405476, "grad_norm": 0.26148979055400845, "learning_rate": 2.767058258230787e-07, "loss": 1.5605, "step": 3299 }, { "epoch": 0.4977000226227283, "grad_norm": 0.26489712173147045, "learning_rate": 2.7659921356433695e-07, "loss": 1.5787, "step": 3300 }, { "epoch": 0.49785084081140185, "grad_norm": 0.24771529971415504, "learning_rate": 2.7649260094652535e-07, "loss": 1.5154, "step": 3301 }, { "epoch": 0.4980016590000754, "grad_norm": 0.25038035734122427, "learning_rate": 2.763859879935812e-07, "loss": 1.5992, "step": 3302 }, { "epoch": 0.49815247718874894, "grad_norm": 0.2533812623579808, "learning_rate": 2.762793747294425e-07, "loss": 1.5593, "step": 3303 }, { "epoch": 0.4983032953774225, "grad_norm": 0.29352075710368725, "learning_rate": 2.7617276117804683e-07, "loss": 1.6137, "step": 3304 }, { "epoch": 0.4984541135660961, "grad_norm": 0.25649552704037465, "learning_rate": 2.760661473633321e-07, "loss": 1.5203, "step": 3305 }, { "epoch": 0.4986049317547696, "grad_norm": 0.2538748736753708, "learning_rate": 2.7595953330923637e-07, "loss": 1.5523, "step": 3306 }, { "epoch": 0.4987557499434432, "grad_norm": 0.25538783591488673, "learning_rate": 2.758529190396974e-07, "loss": 1.5785, "step": 3307 }, { "epoch": 0.49890656813211676, "grad_norm": 0.2530921350682668, "learning_rate": 2.7574630457865316e-07, "loss": 1.4433, "step": 3308 }, { "epoch": 0.4990573863207903, "grad_norm": 0.25531060239975967, "learning_rate": 2.75639689950042e-07, "loss": 1.5884, "step": 3309 }, { "epoch": 0.49920820450946385, "grad_norm": 0.25898423712370006, "learning_rate": 2.7553307517780166e-07, "loss": 1.5225, "step": 3310 }, { "epoch": 0.4993590226981374, "grad_norm": 0.2469778206164802, "learning_rate": 2.754264602858705e-07, "loss": 1.5995, "step": 3311 }, { "epoch": 0.49950984088681094, "grad_norm": 0.2447786405845865, "learning_rate": 2.753198452981866e-07, "loss": 1.4748, "step": 3312 }, { "epoch": 0.4996606590754845, "grad_norm": 0.2765619723237778, "learning_rate": 2.75213230238688e-07, "loss": 1.6506, "step": 3313 }, { "epoch": 0.49981147726415803, "grad_norm": 0.24721530332690836, "learning_rate": 2.751066151313131e-07, "loss": 1.5378, "step": 3314 }, { "epoch": 0.4999622954528316, "grad_norm": 0.25053059640832, "learning_rate": 2.75e-07, "loss": 1.6209, "step": 3315 }, { "epoch": 0.5001131136415051, "grad_norm": 0.3042993127685436, "learning_rate": 2.7489338486868686e-07, "loss": 1.5954, "step": 3316 }, { "epoch": 0.5002639318301787, "grad_norm": 0.24973666347748574, "learning_rate": 2.7478676976131196e-07, "loss": 1.5605, "step": 3317 }, { "epoch": 0.5004147500188523, "grad_norm": 0.24988153493709508, "learning_rate": 2.7468015470181343e-07, "loss": 1.5431, "step": 3318 }, { "epoch": 0.5005655682075258, "grad_norm": 0.2865156977594693, "learning_rate": 2.7457353971412954e-07, "loss": 1.5388, "step": 3319 }, { "epoch": 0.5007163863961994, "grad_norm": 0.2527759398124433, "learning_rate": 2.7446692482219837e-07, "loss": 1.6225, "step": 3320 }, { "epoch": 0.5008672045848729, "grad_norm": 0.2693109111241276, "learning_rate": 2.7436031004995805e-07, "loss": 1.4975, "step": 3321 }, { "epoch": 0.5010180227735465, "grad_norm": 0.27130862309093945, "learning_rate": 2.742536954213468e-07, "loss": 1.5414, "step": 3322 }, { "epoch": 0.50116884096222, "grad_norm": 0.24706973863352966, "learning_rate": 2.7414708096030264e-07, "loss": 1.5393, "step": 3323 }, { "epoch": 0.5013196591508936, "grad_norm": 0.26812242254342294, "learning_rate": 2.7404046669076366e-07, "loss": 1.5682, "step": 3324 }, { "epoch": 0.5014704773395672, "grad_norm": 0.31047841476741905, "learning_rate": 2.7393385263666783e-07, "loss": 1.5423, "step": 3325 }, { "epoch": 0.5016212955282408, "grad_norm": 0.24735751950021653, "learning_rate": 2.738272388219532e-07, "loss": 1.5263, "step": 3326 }, { "epoch": 0.5017721137169142, "grad_norm": 0.25692232040787155, "learning_rate": 2.7372062527055755e-07, "loss": 1.5895, "step": 3327 }, { "epoch": 0.5019229319055878, "grad_norm": 0.2418566848114825, "learning_rate": 2.7361401200641884e-07, "loss": 1.5592, "step": 3328 }, { "epoch": 0.5020737500942614, "grad_norm": 0.23863554158574807, "learning_rate": 2.735073990534747e-07, "loss": 1.5274, "step": 3329 }, { "epoch": 0.5022245682829349, "grad_norm": 0.2982027483839481, "learning_rate": 2.734007864356631e-07, "loss": 1.6451, "step": 3330 }, { "epoch": 0.5023753864716085, "grad_norm": 0.24960726137909803, "learning_rate": 2.7329417417692134e-07, "loss": 1.5391, "step": 3331 }, { "epoch": 0.5025262046602821, "grad_norm": 0.2545436508039039, "learning_rate": 2.7318756230118726e-07, "loss": 1.649, "step": 3332 }, { "epoch": 0.5026770228489555, "grad_norm": 0.2513105990902941, "learning_rate": 2.7308095083239824e-07, "loss": 1.58, "step": 3333 }, { "epoch": 0.5028278410376291, "grad_norm": 0.24609194731011771, "learning_rate": 2.729743397944916e-07, "loss": 1.5532, "step": 3334 }, { "epoch": 0.5029786592263027, "grad_norm": 0.2943735804970433, "learning_rate": 2.7286772921140466e-07, "loss": 1.5885, "step": 3335 }, { "epoch": 0.5031294774149763, "grad_norm": 0.25308883997269344, "learning_rate": 2.7276111910707465e-07, "loss": 1.5336, "step": 3336 }, { "epoch": 0.5032802956036498, "grad_norm": 0.39632265194069904, "learning_rate": 2.726545095054386e-07, "loss": 1.5418, "step": 3337 }, { "epoch": 0.5034311137923233, "grad_norm": 0.2495909049144747, "learning_rate": 2.7254790043043344e-07, "loss": 1.5832, "step": 3338 }, { "epoch": 0.5035819319809969, "grad_norm": 0.254632877714315, "learning_rate": 2.7244129190599607e-07, "loss": 1.6509, "step": 3339 }, { "epoch": 0.5037327501696705, "grad_norm": 0.2748382970830817, "learning_rate": 2.723346839560632e-07, "loss": 1.6096, "step": 3340 }, { "epoch": 0.503883568358344, "grad_norm": 0.2699005878780138, "learning_rate": 2.7222807660457136e-07, "loss": 1.5777, "step": 3341 }, { "epoch": 0.5040343865470176, "grad_norm": 0.2553144305506177, "learning_rate": 2.721214698754571e-07, "loss": 1.5563, "step": 3342 }, { "epoch": 0.5041852047356912, "grad_norm": 0.2985671999866322, "learning_rate": 2.7201486379265663e-07, "loss": 1.567, "step": 3343 }, { "epoch": 0.5043360229243646, "grad_norm": 0.23768594996157277, "learning_rate": 2.7190825838010623e-07, "loss": 1.4942, "step": 3344 }, { "epoch": 0.5044868411130382, "grad_norm": 0.26881074903449925, "learning_rate": 2.7180165366174184e-07, "loss": 1.5781, "step": 3345 }, { "epoch": 0.5046376593017118, "grad_norm": 0.4033794521730484, "learning_rate": 2.7169504966149934e-07, "loss": 1.4971, "step": 3346 }, { "epoch": 0.5047884774903854, "grad_norm": 0.25578761257228577, "learning_rate": 2.7158844640331446e-07, "loss": 1.5625, "step": 3347 }, { "epoch": 0.5049392956790589, "grad_norm": 0.25244155440042493, "learning_rate": 2.714818439111227e-07, "loss": 1.6181, "step": 3348 }, { "epoch": 0.5050901138677324, "grad_norm": 0.34838110431199665, "learning_rate": 2.713752422088595e-07, "loss": 1.5672, "step": 3349 }, { "epoch": 0.505240932056406, "grad_norm": 0.29123625682680754, "learning_rate": 2.7126864132045995e-07, "loss": 1.5912, "step": 3350 }, { "epoch": 0.5053917502450795, "grad_norm": 0.2535329957922546, "learning_rate": 2.711620412698591e-07, "loss": 1.5847, "step": 3351 }, { "epoch": 0.5055425684337531, "grad_norm": 0.25675943816014185, "learning_rate": 2.710554420809917e-07, "loss": 1.5175, "step": 3352 }, { "epoch": 0.5056933866224267, "grad_norm": 0.2483481980707172, "learning_rate": 2.7094884377779246e-07, "loss": 1.6137, "step": 3353 }, { "epoch": 0.5058442048111003, "grad_norm": 0.2805203243287897, "learning_rate": 2.708422463841958e-07, "loss": 1.495, "step": 3354 }, { "epoch": 0.5059950229997737, "grad_norm": 0.24657785511710636, "learning_rate": 2.707356499241359e-07, "loss": 1.5425, "step": 3355 }, { "epoch": 0.5061458411884473, "grad_norm": 0.2804663396090381, "learning_rate": 2.706290544215468e-07, "loss": 1.6177, "step": 3356 }, { "epoch": 0.5062966593771209, "grad_norm": 0.2519727641215625, "learning_rate": 2.705224599003622e-07, "loss": 1.5697, "step": 3357 }, { "epoch": 0.5064474775657944, "grad_norm": 0.24965340619312307, "learning_rate": 2.704158663845159e-07, "loss": 1.4687, "step": 3358 }, { "epoch": 0.506598295754468, "grad_norm": 0.2572141299558103, "learning_rate": 2.70309273897941e-07, "loss": 1.5834, "step": 3359 }, { "epoch": 0.5067491139431416, "grad_norm": 0.25474124648079854, "learning_rate": 2.7020268246457074e-07, "loss": 1.5704, "step": 3360 }, { "epoch": 0.5068999321318151, "grad_norm": 0.25162282375325096, "learning_rate": 2.700960921083379e-07, "loss": 1.5595, "step": 3361 }, { "epoch": 0.5070507503204886, "grad_norm": 0.32619383144972003, "learning_rate": 2.6998950285317536e-07, "loss": 1.5805, "step": 3362 }, { "epoch": 0.5072015685091622, "grad_norm": 0.25540840064293446, "learning_rate": 2.6988291472301527e-07, "loss": 1.5823, "step": 3363 }, { "epoch": 0.5073523866978358, "grad_norm": 0.24902918521605308, "learning_rate": 2.6977632774178983e-07, "loss": 1.575, "step": 3364 }, { "epoch": 0.5075032048865094, "grad_norm": 0.243927216521824, "learning_rate": 2.6966974193343096e-07, "loss": 1.5108, "step": 3365 }, { "epoch": 0.5076540230751828, "grad_norm": 0.2359957641037577, "learning_rate": 2.695631573218703e-07, "loss": 1.5134, "step": 3366 }, { "epoch": 0.5078048412638564, "grad_norm": 0.2683406389292298, "learning_rate": 2.694565739310391e-07, "loss": 1.5073, "step": 3367 }, { "epoch": 0.50795565945253, "grad_norm": 0.2486889359494454, "learning_rate": 2.693499917848685e-07, "loss": 1.4811, "step": 3368 }, { "epoch": 0.5081064776412035, "grad_norm": 0.24544466385111835, "learning_rate": 2.692434109072893e-07, "loss": 1.5668, "step": 3369 }, { "epoch": 0.5082572958298771, "grad_norm": 0.2353367859389473, "learning_rate": 2.69136831322232e-07, "loss": 1.547, "step": 3370 }, { "epoch": 0.5084081140185507, "grad_norm": 0.24818901527921186, "learning_rate": 2.6903025305362686e-07, "loss": 1.5081, "step": 3371 }, { "epoch": 0.5085589322072241, "grad_norm": 0.3365507853358683, "learning_rate": 2.6892367612540375e-07, "loss": 1.5223, "step": 3372 }, { "epoch": 0.5087097503958977, "grad_norm": 0.2551285914497227, "learning_rate": 2.688171005614923e-07, "loss": 1.5723, "step": 3373 }, { "epoch": 0.5088605685845713, "grad_norm": 0.28271133388757014, "learning_rate": 2.687105263858218e-07, "loss": 1.5875, "step": 3374 }, { "epoch": 0.5090113867732449, "grad_norm": 0.24363473847958142, "learning_rate": 2.6860395362232136e-07, "loss": 1.5696, "step": 3375 }, { "epoch": 0.5091622049619184, "grad_norm": 0.246165245140535, "learning_rate": 2.684973822949196e-07, "loss": 1.6845, "step": 3376 }, { "epoch": 0.509313023150592, "grad_norm": 0.2526657502202713, "learning_rate": 2.683908124275449e-07, "loss": 1.5608, "step": 3377 }, { "epoch": 0.5094638413392655, "grad_norm": 0.2822124747596741, "learning_rate": 2.682842440441252e-07, "loss": 1.6233, "step": 3378 }, { "epoch": 0.509614659527939, "grad_norm": 0.27211605527560906, "learning_rate": 2.681776771685884e-07, "loss": 1.557, "step": 3379 }, { "epoch": 0.5097654777166126, "grad_norm": 0.25618395269705335, "learning_rate": 2.680711118248617e-07, "loss": 1.5103, "step": 3380 }, { "epoch": 0.5099162959052862, "grad_norm": 0.25714012472043773, "learning_rate": 2.679645480368722e-07, "loss": 1.562, "step": 3381 }, { "epoch": 0.5100671140939598, "grad_norm": 0.2551121867483003, "learning_rate": 2.678579858285466e-07, "loss": 1.5021, "step": 3382 }, { "epoch": 0.5102179322826332, "grad_norm": 0.279795512750936, "learning_rate": 2.677514252238111e-07, "loss": 1.5977, "step": 3383 }, { "epoch": 0.5103687504713068, "grad_norm": 0.2667401842863274, "learning_rate": 2.6764486624659183e-07, "loss": 1.4919, "step": 3384 }, { "epoch": 0.5105195686599804, "grad_norm": 0.24870196970524328, "learning_rate": 2.6753830892081426e-07, "loss": 1.5754, "step": 3385 }, { "epoch": 0.510670386848654, "grad_norm": 0.2638389192651744, "learning_rate": 2.6743175327040363e-07, "loss": 1.5756, "step": 3386 }, { "epoch": 0.5108212050373275, "grad_norm": 0.4072165688694835, "learning_rate": 2.6732519931928483e-07, "loss": 1.6453, "step": 3387 }, { "epoch": 0.5109720232260011, "grad_norm": 0.2422848523750943, "learning_rate": 2.6721864709138225e-07, "loss": 1.4672, "step": 3388 }, { "epoch": 0.5111228414146746, "grad_norm": 0.40391218423175, "learning_rate": 2.6711209661062006e-07, "loss": 1.5901, "step": 3389 }, { "epoch": 0.5112736596033481, "grad_norm": 0.30394247097694993, "learning_rate": 2.6700554790092187e-07, "loss": 1.5573, "step": 3390 }, { "epoch": 0.5114244777920217, "grad_norm": 0.42617749918711456, "learning_rate": 2.6689900098621106e-07, "loss": 1.579, "step": 3391 }, { "epoch": 0.5115752959806953, "grad_norm": 0.2804842346156845, "learning_rate": 2.667924558904104e-07, "loss": 1.5556, "step": 3392 }, { "epoch": 0.5117261141693689, "grad_norm": 0.248293866650627, "learning_rate": 2.6668591263744243e-07, "loss": 1.625, "step": 3393 }, { "epoch": 0.5118769323580423, "grad_norm": 0.25266486877579375, "learning_rate": 2.665793712512293e-07, "loss": 1.5812, "step": 3394 }, { "epoch": 0.5120277505467159, "grad_norm": 0.25281005618560276, "learning_rate": 2.664728317556924e-07, "loss": 1.5527, "step": 3395 }, { "epoch": 0.5121785687353895, "grad_norm": 0.26245082973756306, "learning_rate": 2.6636629417475314e-07, "loss": 1.6174, "step": 3396 }, { "epoch": 0.512329386924063, "grad_norm": 0.24176224348249375, "learning_rate": 2.662597585323324e-07, "loss": 1.5474, "step": 3397 }, { "epoch": 0.5124802051127366, "grad_norm": 0.27145047565218094, "learning_rate": 2.6615322485235023e-07, "loss": 1.5874, "step": 3398 }, { "epoch": 0.5126310233014102, "grad_norm": 0.2457872345203949, "learning_rate": 2.660466931587269e-07, "loss": 1.5548, "step": 3399 }, { "epoch": 0.5127818414900837, "grad_norm": 0.2561704235833319, "learning_rate": 2.659401634753817e-07, "loss": 1.5722, "step": 3400 }, { "epoch": 0.5129326596787572, "grad_norm": 0.257556030514651, "learning_rate": 2.658336358262335e-07, "loss": 1.5961, "step": 3401 }, { "epoch": 0.5130834778674308, "grad_norm": 0.2524081368312293, "learning_rate": 2.6572711023520113e-07, "loss": 1.6134, "step": 3402 }, { "epoch": 0.5132342960561044, "grad_norm": 0.2587914057008669, "learning_rate": 2.656205867262025e-07, "loss": 1.6208, "step": 3403 }, { "epoch": 0.513385114244778, "grad_norm": 0.3278007542839679, "learning_rate": 2.6551406532315533e-07, "loss": 1.6417, "step": 3404 }, { "epoch": 0.5135359324334515, "grad_norm": 0.24718450490251231, "learning_rate": 2.6540754604997683e-07, "loss": 1.4789, "step": 3405 }, { "epoch": 0.513686750622125, "grad_norm": 0.2589284027537417, "learning_rate": 2.6530102893058353e-07, "loss": 1.5179, "step": 3406 }, { "epoch": 0.5138375688107986, "grad_norm": 0.2747977819875336, "learning_rate": 2.6519451398889177e-07, "loss": 1.5357, "step": 3407 }, { "epoch": 0.5139883869994721, "grad_norm": 0.24735369688127637, "learning_rate": 2.650880012488171e-07, "loss": 1.5487, "step": 3408 }, { "epoch": 0.5141392051881457, "grad_norm": 0.24664274099541278, "learning_rate": 2.6498149073427485e-07, "loss": 1.5351, "step": 3409 }, { "epoch": 0.5142900233768193, "grad_norm": 0.2590613272316827, "learning_rate": 2.6487498246917974e-07, "loss": 1.5629, "step": 3410 }, { "epoch": 0.5144408415654927, "grad_norm": 0.3705828058823599, "learning_rate": 2.647684764774459e-07, "loss": 1.5627, "step": 3411 }, { "epoch": 0.5145916597541663, "grad_norm": 0.24319145857519242, "learning_rate": 2.6466197278298703e-07, "loss": 1.4684, "step": 3412 }, { "epoch": 0.5147424779428399, "grad_norm": 0.2472923219641013, "learning_rate": 2.645554714097164e-07, "loss": 1.5518, "step": 3413 }, { "epoch": 0.5148932961315135, "grad_norm": 0.290010629764668, "learning_rate": 2.644489723815466e-07, "loss": 1.5554, "step": 3414 }, { "epoch": 0.515044114320187, "grad_norm": 0.48259758803379066, "learning_rate": 2.643424757223898e-07, "loss": 1.5199, "step": 3415 }, { "epoch": 0.5151949325088606, "grad_norm": 0.2700103396991272, "learning_rate": 2.6423598145615755e-07, "loss": 1.5404, "step": 3416 }, { "epoch": 0.5153457506975341, "grad_norm": 0.27260193450195136, "learning_rate": 2.641294896067609e-07, "loss": 1.5869, "step": 3417 }, { "epoch": 0.5154965688862077, "grad_norm": 0.25736491966765584, "learning_rate": 2.6402300019811056e-07, "loss": 1.4908, "step": 3418 }, { "epoch": 0.5156473870748812, "grad_norm": 0.2617846338780766, "learning_rate": 2.639165132541162e-07, "loss": 1.4805, "step": 3419 }, { "epoch": 0.5157982052635548, "grad_norm": 0.25187905256722865, "learning_rate": 2.6381002879868746e-07, "loss": 1.583, "step": 3420 }, { "epoch": 0.5159490234522284, "grad_norm": 0.24428837350616164, "learning_rate": 2.6370354685573315e-07, "loss": 1.4823, "step": 3421 }, { "epoch": 0.516099841640902, "grad_norm": 0.35219293961394166, "learning_rate": 2.635970674491616e-07, "loss": 1.5976, "step": 3422 }, { "epoch": 0.5162506598295754, "grad_norm": 0.24748325128883544, "learning_rate": 2.6349059060288047e-07, "loss": 1.5116, "step": 3423 }, { "epoch": 0.516401478018249, "grad_norm": 0.26782624643815234, "learning_rate": 2.6338411634079695e-07, "loss": 1.6013, "step": 3424 }, { "epoch": 0.5165522962069226, "grad_norm": 0.2516196167397587, "learning_rate": 2.6327764468681755e-07, "loss": 1.5371, "step": 3425 }, { "epoch": 0.5167031143955961, "grad_norm": 0.37290749259523764, "learning_rate": 2.6317117566484836e-07, "loss": 1.4911, "step": 3426 }, { "epoch": 0.5168539325842697, "grad_norm": 0.251509966071061, "learning_rate": 2.6306470929879475e-07, "loss": 1.5656, "step": 3427 }, { "epoch": 0.5170047507729432, "grad_norm": 0.25849330500463785, "learning_rate": 2.629582456125615e-07, "loss": 1.5666, "step": 3428 }, { "epoch": 0.5171555689616167, "grad_norm": 0.24393511130890536, "learning_rate": 2.628517846300527e-07, "loss": 1.5653, "step": 3429 }, { "epoch": 0.5173063871502903, "grad_norm": 0.24229616697656609, "learning_rate": 2.6274532637517215e-07, "loss": 1.4901, "step": 3430 }, { "epoch": 0.5174572053389639, "grad_norm": 0.258315777973691, "learning_rate": 2.626388708718227e-07, "loss": 1.5753, "step": 3431 }, { "epoch": 0.5176080235276375, "grad_norm": 0.3039069236998845, "learning_rate": 2.6253241814390676e-07, "loss": 1.6208, "step": 3432 }, { "epoch": 0.517758841716311, "grad_norm": 0.2835486242320549, "learning_rate": 2.62425968215326e-07, "loss": 1.6249, "step": 3433 }, { "epoch": 0.5179096599049845, "grad_norm": 0.24902980508283476, "learning_rate": 2.6231952110998156e-07, "loss": 1.6022, "step": 3434 }, { "epoch": 0.5180604780936581, "grad_norm": 0.2492393031975446, "learning_rate": 2.6221307685177395e-07, "loss": 1.5974, "step": 3435 }, { "epoch": 0.5182112962823316, "grad_norm": 0.23794462105076858, "learning_rate": 2.62106635464603e-07, "loss": 1.5895, "step": 3436 }, { "epoch": 0.5183621144710052, "grad_norm": 0.4444198515386016, "learning_rate": 2.6200019697236784e-07, "loss": 1.5255, "step": 3437 }, { "epoch": 0.5185129326596788, "grad_norm": 0.253090569717866, "learning_rate": 2.6189376139896714e-07, "loss": 1.5699, "step": 3438 }, { "epoch": 0.5186637508483523, "grad_norm": 0.2393039961899078, "learning_rate": 2.617873287682986e-07, "loss": 1.5549, "step": 3439 }, { "epoch": 0.5188145690370258, "grad_norm": 0.25196990371497024, "learning_rate": 2.616808991042596e-07, "loss": 1.5806, "step": 3440 }, { "epoch": 0.5189653872256994, "grad_norm": 0.2750144429593545, "learning_rate": 2.6157447243074664e-07, "loss": 1.6465, "step": 3441 }, { "epoch": 0.519116205414373, "grad_norm": 0.272366716894923, "learning_rate": 2.6146804877165563e-07, "loss": 1.491, "step": 3442 }, { "epoch": 0.5192670236030466, "grad_norm": 0.31214694654172437, "learning_rate": 2.6136162815088175e-07, "loss": 1.5951, "step": 3443 }, { "epoch": 0.5194178417917201, "grad_norm": 0.2621271805612479, "learning_rate": 2.6125521059231954e-07, "loss": 1.5426, "step": 3444 }, { "epoch": 0.5195686599803936, "grad_norm": 0.271740818612056, "learning_rate": 2.6114879611986284e-07, "loss": 1.5348, "step": 3445 }, { "epoch": 0.5197194781690672, "grad_norm": 0.23973127661834764, "learning_rate": 2.610423847574048e-07, "loss": 1.5337, "step": 3446 }, { "epoch": 0.5198702963577407, "grad_norm": 0.26665725866133855, "learning_rate": 2.6093597652883784e-07, "loss": 1.6278, "step": 3447 }, { "epoch": 0.5200211145464143, "grad_norm": 0.26003632228083834, "learning_rate": 2.6082957145805384e-07, "loss": 1.6183, "step": 3448 }, { "epoch": 0.5201719327350879, "grad_norm": 0.259081350793201, "learning_rate": 2.607231695689437e-07, "loss": 1.5933, "step": 3449 }, { "epoch": 0.5203227509237615, "grad_norm": 0.2704935949422631, "learning_rate": 2.6061677088539777e-07, "loss": 1.5816, "step": 3450 }, { "epoch": 0.5204735691124349, "grad_norm": 0.5958003619919746, "learning_rate": 2.605103754313056e-07, "loss": 1.5625, "step": 3451 }, { "epoch": 0.5206243873011085, "grad_norm": 0.25068661919329216, "learning_rate": 2.604039832305562e-07, "loss": 1.565, "step": 3452 }, { "epoch": 0.5207752054897821, "grad_norm": 0.23637743445577852, "learning_rate": 2.602975943070376e-07, "loss": 1.5447, "step": 3453 }, { "epoch": 0.5209260236784556, "grad_norm": 0.2771298053934381, "learning_rate": 2.601912086846373e-07, "loss": 1.5708, "step": 3454 }, { "epoch": 0.5210768418671292, "grad_norm": 0.2645802923684436, "learning_rate": 2.600848263872419e-07, "loss": 1.6231, "step": 3455 }, { "epoch": 0.5212276600558027, "grad_norm": 0.2570476354424804, "learning_rate": 2.5997844743873735e-07, "loss": 1.5465, "step": 3456 }, { "epoch": 0.5213784782444763, "grad_norm": 0.24647483848346644, "learning_rate": 2.5987207186300884e-07, "loss": 1.4589, "step": 3457 }, { "epoch": 0.5215292964331498, "grad_norm": 0.2492627062540946, "learning_rate": 2.597656996839408e-07, "loss": 1.5618, "step": 3458 }, { "epoch": 0.5216801146218234, "grad_norm": 0.24356023524839673, "learning_rate": 2.596593309254168e-07, "loss": 1.5075, "step": 3459 }, { "epoch": 0.521830932810497, "grad_norm": 0.2570979238960611, "learning_rate": 2.5955296561131985e-07, "loss": 1.5627, "step": 3460 }, { "epoch": 0.5219817509991705, "grad_norm": 0.2737835754830638, "learning_rate": 2.5944660376553194e-07, "loss": 1.5675, "step": 3461 }, { "epoch": 0.522132569187844, "grad_norm": 0.25765438745021246, "learning_rate": 2.5934024541193444e-07, "loss": 1.525, "step": 3462 }, { "epoch": 0.5222833873765176, "grad_norm": 0.29237538114546824, "learning_rate": 2.5923389057440793e-07, "loss": 1.5651, "step": 3463 }, { "epoch": 0.5224342055651912, "grad_norm": 0.24218855295460975, "learning_rate": 2.591275392768321e-07, "loss": 1.5282, "step": 3464 }, { "epoch": 0.5225850237538647, "grad_norm": 0.31804114366062597, "learning_rate": 2.5902119154308606e-07, "loss": 1.6204, "step": 3465 }, { "epoch": 0.5227358419425383, "grad_norm": 0.26055067066311277, "learning_rate": 2.589148473970478e-07, "loss": 1.5551, "step": 3466 }, { "epoch": 0.5228866601312119, "grad_norm": 0.25097316862996666, "learning_rate": 2.5880850686259475e-07, "loss": 1.5746, "step": 3467 }, { "epoch": 0.5230374783198853, "grad_norm": 0.2506066181626998, "learning_rate": 2.587021699636035e-07, "loss": 1.5946, "step": 3468 }, { "epoch": 0.5231882965085589, "grad_norm": 0.2522370225688385, "learning_rate": 2.5859583672394967e-07, "loss": 1.5721, "step": 3469 }, { "epoch": 0.5233391146972325, "grad_norm": 0.3068255943113725, "learning_rate": 2.584895071675083e-07, "loss": 1.5739, "step": 3470 }, { "epoch": 0.5234899328859061, "grad_norm": 0.23850945286400205, "learning_rate": 2.583831813181533e-07, "loss": 1.5051, "step": 3471 }, { "epoch": 0.5236407510745796, "grad_norm": 0.2584496902340964, "learning_rate": 2.5827685919975816e-07, "loss": 1.5743, "step": 3472 }, { "epoch": 0.5237915692632531, "grad_norm": 0.25513878483580066, "learning_rate": 2.581705408361951e-07, "loss": 1.5676, "step": 3473 }, { "epoch": 0.5239423874519267, "grad_norm": 0.26447952862161583, "learning_rate": 2.580642262513358e-07, "loss": 1.5518, "step": 3474 }, { "epoch": 0.5240932056406002, "grad_norm": 0.2662577459383339, "learning_rate": 2.5795791546905087e-07, "loss": 1.5673, "step": 3475 }, { "epoch": 0.5242440238292738, "grad_norm": 0.32092647569567506, "learning_rate": 2.578516085132103e-07, "loss": 1.5428, "step": 3476 }, { "epoch": 0.5243948420179474, "grad_norm": 0.2498482740773882, "learning_rate": 2.5774530540768294e-07, "loss": 1.5802, "step": 3477 }, { "epoch": 0.524545660206621, "grad_norm": 0.2546959664545933, "learning_rate": 2.576390061763371e-07, "loss": 1.5317, "step": 3478 }, { "epoch": 0.5246964783952944, "grad_norm": 0.25001127382963, "learning_rate": 2.5753271084304e-07, "loss": 1.5412, "step": 3479 }, { "epoch": 0.524847296583968, "grad_norm": 0.25093090770390275, "learning_rate": 2.574264194316579e-07, "loss": 1.5312, "step": 3480 }, { "epoch": 0.5249981147726416, "grad_norm": 0.25287177921984955, "learning_rate": 2.573201319660566e-07, "loss": 1.5148, "step": 3481 }, { "epoch": 0.5251489329613152, "grad_norm": 0.24869925307935759, "learning_rate": 2.572138484701005e-07, "loss": 1.562, "step": 3482 }, { "epoch": 0.5252997511499887, "grad_norm": 0.2611394563145696, "learning_rate": 2.5710756896765337e-07, "loss": 1.5959, "step": 3483 }, { "epoch": 0.5254505693386622, "grad_norm": 0.24753523535044233, "learning_rate": 2.5700129348257814e-07, "loss": 1.5496, "step": 3484 }, { "epoch": 0.5256013875273358, "grad_norm": 0.5874019514703901, "learning_rate": 2.568950220387367e-07, "loss": 1.5877, "step": 3485 }, { "epoch": 0.5257522057160093, "grad_norm": 0.24725630624083114, "learning_rate": 2.5678875465999e-07, "loss": 1.5123, "step": 3486 }, { "epoch": 0.5259030239046829, "grad_norm": 0.25546325957175015, "learning_rate": 2.5668249137019837e-07, "loss": 1.6225, "step": 3487 }, { "epoch": 0.5260538420933565, "grad_norm": 0.2771144622831777, "learning_rate": 2.5657623219322084e-07, "loss": 1.5953, "step": 3488 }, { "epoch": 0.5262046602820301, "grad_norm": 0.25916392737960275, "learning_rate": 2.564699771529158e-07, "loss": 1.5079, "step": 3489 }, { "epoch": 0.5263554784707035, "grad_norm": 0.2527559440958871, "learning_rate": 2.5636372627314046e-07, "loss": 1.5888, "step": 3490 }, { "epoch": 0.5265062966593771, "grad_norm": 0.24067603964044604, "learning_rate": 2.5625747957775133e-07, "loss": 1.575, "step": 3491 }, { "epoch": 0.5266571148480507, "grad_norm": 0.24936834335229452, "learning_rate": 2.5615123709060395e-07, "loss": 1.5443, "step": 3492 }, { "epoch": 0.5268079330367242, "grad_norm": 0.24905887261192292, "learning_rate": 2.560449988355526e-07, "loss": 1.5863, "step": 3493 }, { "epoch": 0.5269587512253978, "grad_norm": 0.27961097872082313, "learning_rate": 2.5593876483645125e-07, "loss": 1.5328, "step": 3494 }, { "epoch": 0.5271095694140714, "grad_norm": 0.26027400008482726, "learning_rate": 2.5583253511715225e-07, "loss": 1.5847, "step": 3495 }, { "epoch": 0.5272603876027449, "grad_norm": 0.3010149921067354, "learning_rate": 2.5572630970150727e-07, "loss": 1.5505, "step": 3496 }, { "epoch": 0.5274112057914184, "grad_norm": 0.24462758422642664, "learning_rate": 2.556200886133672e-07, "loss": 1.5536, "step": 3497 }, { "epoch": 0.527562023980092, "grad_norm": 0.24902558953686316, "learning_rate": 2.5551387187658154e-07, "loss": 1.5946, "step": 3498 }, { "epoch": 0.5277128421687656, "grad_norm": 0.2613620026441812, "learning_rate": 2.554076595149991e-07, "loss": 1.5872, "step": 3499 }, { "epoch": 0.5278636603574391, "grad_norm": 0.31161334990915235, "learning_rate": 2.553014515524678e-07, "loss": 1.5581, "step": 3500 }, { "epoch": 0.5280144785461126, "grad_norm": 0.2460958270425893, "learning_rate": 2.5519524801283427e-07, "loss": 1.5814, "step": 3501 }, { "epoch": 0.5281652967347862, "grad_norm": 0.27346749530025577, "learning_rate": 2.5508904891994436e-07, "loss": 1.5826, "step": 3502 }, { "epoch": 0.5283161149234598, "grad_norm": 0.23930263953290692, "learning_rate": 2.549828542976428e-07, "loss": 1.589, "step": 3503 }, { "epoch": 0.5284669331121333, "grad_norm": 0.26860395403048765, "learning_rate": 2.5487666416977345e-07, "loss": 1.5285, "step": 3504 }, { "epoch": 0.5286177513008069, "grad_norm": 0.23913528179331986, "learning_rate": 2.5477047856017903e-07, "loss": 1.5009, "step": 3505 }, { "epoch": 0.5287685694894805, "grad_norm": 0.24512190716848994, "learning_rate": 2.546642974927013e-07, "loss": 1.519, "step": 3506 }, { "epoch": 0.5289193876781539, "grad_norm": 0.2974454301082803, "learning_rate": 2.5455812099118105e-07, "loss": 1.5193, "step": 3507 }, { "epoch": 0.5290702058668275, "grad_norm": 0.24362165691660376, "learning_rate": 2.5445194907945797e-07, "loss": 1.6427, "step": 3508 }, { "epoch": 0.5292210240555011, "grad_norm": 0.24062987703813193, "learning_rate": 2.543457817813708e-07, "loss": 1.5303, "step": 3509 }, { "epoch": 0.5293718422441747, "grad_norm": 0.25073290950057436, "learning_rate": 2.5423961912075707e-07, "loss": 1.5383, "step": 3510 }, { "epoch": 0.5295226604328482, "grad_norm": 0.24069293518381765, "learning_rate": 2.5413346112145343e-07, "loss": 1.5439, "step": 3511 }, { "epoch": 0.5296734786215218, "grad_norm": 0.245924529616333, "learning_rate": 2.540273078072956e-07, "loss": 1.6223, "step": 3512 }, { "epoch": 0.5298242968101953, "grad_norm": 0.28547544148896653, "learning_rate": 2.539211592021178e-07, "loss": 1.5504, "step": 3513 }, { "epoch": 0.5299751149988688, "grad_norm": 0.26993529938387667, "learning_rate": 2.538150153297537e-07, "loss": 1.594, "step": 3514 }, { "epoch": 0.5301259331875424, "grad_norm": 0.251950812081441, "learning_rate": 2.537088762140357e-07, "loss": 1.6449, "step": 3515 }, { "epoch": 0.530276751376216, "grad_norm": 0.2618791960227477, "learning_rate": 2.5360274187879497e-07, "loss": 1.5748, "step": 3516 }, { "epoch": 0.5304275695648896, "grad_norm": 0.2782190715307245, "learning_rate": 2.5349661234786185e-07, "loss": 1.5571, "step": 3517 }, { "epoch": 0.530578387753563, "grad_norm": 0.2675706137635166, "learning_rate": 2.5339048764506555e-07, "loss": 1.5349, "step": 3518 }, { "epoch": 0.5307292059422366, "grad_norm": 0.26581608327045403, "learning_rate": 2.5328436779423406e-07, "loss": 1.627, "step": 3519 }, { "epoch": 0.5308800241309102, "grad_norm": 0.27574459162247106, "learning_rate": 2.5317825281919453e-07, "loss": 1.587, "step": 3520 }, { "epoch": 0.5310308423195838, "grad_norm": 0.2516827300261559, "learning_rate": 2.530721427437727e-07, "loss": 1.5853, "step": 3521 }, { "epoch": 0.5311816605082573, "grad_norm": 0.28610110489774726, "learning_rate": 2.529660375917934e-07, "loss": 1.5742, "step": 3522 }, { "epoch": 0.5313324786969309, "grad_norm": 0.25757914000919657, "learning_rate": 2.528599373870804e-07, "loss": 1.5944, "step": 3523 }, { "epoch": 0.5314832968856044, "grad_norm": 0.26423042771554117, "learning_rate": 2.5275384215345627e-07, "loss": 1.5864, "step": 3524 }, { "epoch": 0.5316341150742779, "grad_norm": 0.2722270107839057, "learning_rate": 2.526477519147424e-07, "loss": 1.5289, "step": 3525 }, { "epoch": 0.5317849332629515, "grad_norm": 0.29345796456266554, "learning_rate": 2.5254166669475926e-07, "loss": 1.623, "step": 3526 }, { "epoch": 0.5319357514516251, "grad_norm": 0.25898910833468286, "learning_rate": 2.524355865173259e-07, "loss": 1.5909, "step": 3527 }, { "epoch": 0.5320865696402987, "grad_norm": 0.32284026703302277, "learning_rate": 2.523295114062606e-07, "loss": 1.6422, "step": 3528 }, { "epoch": 0.5322373878289721, "grad_norm": 0.26149293616511654, "learning_rate": 2.522234413853802e-07, "loss": 1.5461, "step": 3529 }, { "epoch": 0.5323882060176457, "grad_norm": 0.2521176471990762, "learning_rate": 2.5211737647850053e-07, "loss": 1.5801, "step": 3530 }, { "epoch": 0.5325390242063193, "grad_norm": 0.33083588263990305, "learning_rate": 2.520113167094363e-07, "loss": 1.5678, "step": 3531 }, { "epoch": 0.5326898423949928, "grad_norm": 0.24473178120743763, "learning_rate": 2.5190526210200093e-07, "loss": 1.5813, "step": 3532 }, { "epoch": 0.5328406605836664, "grad_norm": 0.25503783863822016, "learning_rate": 2.517992126800068e-07, "loss": 1.5506, "step": 3533 }, { "epoch": 0.53299147877234, "grad_norm": 0.25021014615026094, "learning_rate": 2.5169316846726515e-07, "loss": 1.5091, "step": 3534 }, { "epoch": 0.5331422969610135, "grad_norm": 0.2541044663955351, "learning_rate": 2.515871294875859e-07, "loss": 1.5303, "step": 3535 }, { "epoch": 0.533293115149687, "grad_norm": 0.37817575805690823, "learning_rate": 2.51481095764778e-07, "loss": 1.5387, "step": 3536 }, { "epoch": 0.5334439333383606, "grad_norm": 0.24535242253342482, "learning_rate": 2.5137506732264895e-07, "loss": 1.5676, "step": 3537 }, { "epoch": 0.5335947515270342, "grad_norm": 0.2733669094173515, "learning_rate": 2.512690441850054e-07, "loss": 1.5428, "step": 3538 }, { "epoch": 0.5337455697157077, "grad_norm": 0.25954789607178197, "learning_rate": 2.5116302637565255e-07, "loss": 1.5862, "step": 3539 }, { "epoch": 0.5338963879043813, "grad_norm": 0.309637144793164, "learning_rate": 2.5105701391839446e-07, "loss": 1.5611, "step": 3540 }, { "epoch": 0.5340472060930548, "grad_norm": 0.2420016020049591, "learning_rate": 2.509510068370341e-07, "loss": 1.5846, "step": 3541 }, { "epoch": 0.5341980242817284, "grad_norm": 0.26169552956251124, "learning_rate": 2.5084500515537304e-07, "loss": 1.573, "step": 3542 }, { "epoch": 0.5343488424704019, "grad_norm": 0.256453723857227, "learning_rate": 2.5073900889721174e-07, "loss": 1.5034, "step": 3543 }, { "epoch": 0.5344996606590755, "grad_norm": 0.27813789544137113, "learning_rate": 2.506330180863496e-07, "loss": 1.5684, "step": 3544 }, { "epoch": 0.5346504788477491, "grad_norm": 0.25234294332864915, "learning_rate": 2.5052703274658446e-07, "loss": 1.5334, "step": 3545 }, { "epoch": 0.5348012970364225, "grad_norm": 0.2625582654158137, "learning_rate": 2.5042105290171326e-07, "loss": 1.582, "step": 3546 }, { "epoch": 0.5349521152250961, "grad_norm": 0.2367841824021432, "learning_rate": 2.5031507857553154e-07, "loss": 1.5782, "step": 3547 }, { "epoch": 0.5351029334137697, "grad_norm": 0.2562501960561911, "learning_rate": 2.5020910979183346e-07, "loss": 1.596, "step": 3548 }, { "epoch": 0.5352537516024433, "grad_norm": 0.4601915570204031, "learning_rate": 2.5010314657441235e-07, "loss": 1.5404, "step": 3549 }, { "epoch": 0.5354045697911168, "grad_norm": 0.24387253387053395, "learning_rate": 2.499971889470598e-07, "loss": 1.5421, "step": 3550 }, { "epoch": 0.5355553879797904, "grad_norm": 0.265517357013569, "learning_rate": 2.498912369335666e-07, "loss": 1.611, "step": 3551 }, { "epoch": 0.5357062061684639, "grad_norm": 0.36906846802969784, "learning_rate": 2.497852905577219e-07, "loss": 1.5658, "step": 3552 }, { "epoch": 0.5358570243571374, "grad_norm": 0.259593341801903, "learning_rate": 2.496793498433138e-07, "loss": 1.5748, "step": 3553 }, { "epoch": 0.536007842545811, "grad_norm": 0.3200271372913328, "learning_rate": 2.495734148141291e-07, "loss": 1.4802, "step": 3554 }, { "epoch": 0.5361586607344846, "grad_norm": 0.2905874652597384, "learning_rate": 2.4946748549395325e-07, "loss": 1.5384, "step": 3555 }, { "epoch": 0.5363094789231582, "grad_norm": 0.25318479822579476, "learning_rate": 2.4936156190657047e-07, "loss": 1.5881, "step": 3556 }, { "epoch": 0.5364602971118317, "grad_norm": 0.2576415973112576, "learning_rate": 2.492556440757638e-07, "loss": 1.5923, "step": 3557 }, { "epoch": 0.5366111153005052, "grad_norm": 0.2583235905639093, "learning_rate": 2.491497320253147e-07, "loss": 1.6253, "step": 3558 }, { "epoch": 0.5367619334891788, "grad_norm": 0.24540277572881986, "learning_rate": 2.4904382577900367e-07, "loss": 1.5298, "step": 3559 }, { "epoch": 0.5369127516778524, "grad_norm": 0.24986792681543368, "learning_rate": 2.4893792536060966e-07, "loss": 1.5946, "step": 3560 }, { "epoch": 0.5370635698665259, "grad_norm": 0.2513171723500393, "learning_rate": 2.488320307939104e-07, "loss": 1.5697, "step": 3561 }, { "epoch": 0.5372143880551995, "grad_norm": 0.3112567416606474, "learning_rate": 2.487261421026823e-07, "loss": 1.5535, "step": 3562 }, { "epoch": 0.537365206243873, "grad_norm": 0.2575564935209731, "learning_rate": 2.486202593107006e-07, "loss": 1.6151, "step": 3563 }, { "epoch": 0.5375160244325465, "grad_norm": 0.2567580878163092, "learning_rate": 2.485143824417389e-07, "loss": 1.5657, "step": 3564 }, { "epoch": 0.5376668426212201, "grad_norm": 0.24389373835248265, "learning_rate": 2.4840851151956957e-07, "loss": 1.6059, "step": 3565 }, { "epoch": 0.5378176608098937, "grad_norm": 0.25309193545332304, "learning_rate": 2.483026465679639e-07, "loss": 1.5418, "step": 3566 }, { "epoch": 0.5379684789985673, "grad_norm": 0.2557894707789005, "learning_rate": 2.481967876106917e-07, "loss": 1.5498, "step": 3567 }, { "epoch": 0.5381192971872408, "grad_norm": 0.24582848285938932, "learning_rate": 2.480909346715211e-07, "loss": 1.5359, "step": 3568 }, { "epoch": 0.5382701153759143, "grad_norm": 0.25466688551768263, "learning_rate": 2.479850877742195e-07, "loss": 1.5738, "step": 3569 }, { "epoch": 0.5384209335645879, "grad_norm": 0.25479550425895725, "learning_rate": 2.4787924694255236e-07, "loss": 1.5369, "step": 3570 }, { "epoch": 0.5385717517532614, "grad_norm": 0.242997356531885, "learning_rate": 2.4777341220028416e-07, "loss": 1.572, "step": 3571 }, { "epoch": 0.538722569941935, "grad_norm": 0.2436254587281785, "learning_rate": 2.4766758357117784e-07, "loss": 1.536, "step": 3572 }, { "epoch": 0.5388733881306086, "grad_norm": 0.3239443419270094, "learning_rate": 2.4756176107899494e-07, "loss": 1.5273, "step": 3573 }, { "epoch": 0.539024206319282, "grad_norm": 0.25572158284117197, "learning_rate": 2.4745594474749585e-07, "loss": 1.4892, "step": 3574 }, { "epoch": 0.5391750245079556, "grad_norm": 0.2625148394944358, "learning_rate": 2.4735013460043933e-07, "loss": 1.6197, "step": 3575 }, { "epoch": 0.5393258426966292, "grad_norm": 0.24336310043788423, "learning_rate": 2.4724433066158283e-07, "loss": 1.653, "step": 3576 }, { "epoch": 0.5394766608853028, "grad_norm": 0.25107677729321576, "learning_rate": 2.471385329546825e-07, "loss": 1.6108, "step": 3577 }, { "epoch": 0.5396274790739763, "grad_norm": 0.25565257098025534, "learning_rate": 2.4703274150349285e-07, "loss": 1.5368, "step": 3578 }, { "epoch": 0.5397782972626499, "grad_norm": 0.2516168212131607, "learning_rate": 2.4692695633176727e-07, "loss": 1.6004, "step": 3579 }, { "epoch": 0.5399291154513234, "grad_norm": 0.357565028404368, "learning_rate": 2.4682117746325753e-07, "loss": 1.5574, "step": 3580 }, { "epoch": 0.540079933639997, "grad_norm": 0.24086631336091513, "learning_rate": 2.467154049217142e-07, "loss": 1.5089, "step": 3581 }, { "epoch": 0.5402307518286705, "grad_norm": 0.3350214806976764, "learning_rate": 2.466096387308861e-07, "loss": 1.6208, "step": 3582 }, { "epoch": 0.5403815700173441, "grad_norm": 0.249638578712114, "learning_rate": 2.4650387891452096e-07, "loss": 1.59, "step": 3583 }, { "epoch": 0.5405323882060177, "grad_norm": 0.2542986654698333, "learning_rate": 2.463981254963649e-07, "loss": 1.543, "step": 3584 }, { "epoch": 0.5406832063946913, "grad_norm": 0.26067240069467273, "learning_rate": 2.462923785001627e-07, "loss": 1.5852, "step": 3585 }, { "epoch": 0.5408340245833647, "grad_norm": 0.2463546732060017, "learning_rate": 2.461866379496575e-07, "loss": 1.5081, "step": 3586 }, { "epoch": 0.5409848427720383, "grad_norm": 0.23489869798759375, "learning_rate": 2.460809038685912e-07, "loss": 1.5447, "step": 3587 }, { "epoch": 0.5411356609607119, "grad_norm": 0.24221533397665837, "learning_rate": 2.459751762807043e-07, "loss": 1.5746, "step": 3588 }, { "epoch": 0.5412864791493854, "grad_norm": 0.24791504924382135, "learning_rate": 2.458694552097356e-07, "loss": 1.5382, "step": 3589 }, { "epoch": 0.541437297338059, "grad_norm": 0.2574534029734148, "learning_rate": 2.4576374067942256e-07, "loss": 1.6216, "step": 3590 }, { "epoch": 0.5415881155267325, "grad_norm": 0.2679891830487016, "learning_rate": 2.4565803271350117e-07, "loss": 1.5621, "step": 3591 }, { "epoch": 0.541738933715406, "grad_norm": 0.34319500827264665, "learning_rate": 2.4555233133570595e-07, "loss": 1.4834, "step": 3592 }, { "epoch": 0.5418897519040796, "grad_norm": 0.24811600901697226, "learning_rate": 2.4544663656977004e-07, "loss": 1.5348, "step": 3593 }, { "epoch": 0.5420405700927532, "grad_norm": 0.23934457658158845, "learning_rate": 2.453409484394248e-07, "loss": 1.5886, "step": 3594 }, { "epoch": 0.5421913882814268, "grad_norm": 0.2466036375870887, "learning_rate": 2.452352669684005e-07, "loss": 1.5959, "step": 3595 }, { "epoch": 0.5423422064701003, "grad_norm": 0.2514097237587263, "learning_rate": 2.4512959218042556e-07, "loss": 1.4942, "step": 3596 }, { "epoch": 0.5424930246587738, "grad_norm": 0.4800774121409877, "learning_rate": 2.450239240992271e-07, "loss": 1.5149, "step": 3597 }, { "epoch": 0.5426438428474474, "grad_norm": 0.254282445301358, "learning_rate": 2.4491826274853074e-07, "loss": 1.6246, "step": 3598 }, { "epoch": 0.542794661036121, "grad_norm": 0.2580213491036497, "learning_rate": 2.448126081520604e-07, "loss": 1.5342, "step": 3599 }, { "epoch": 0.5429454792247945, "grad_norm": 0.25683975686626387, "learning_rate": 2.4470696033353866e-07, "loss": 1.5651, "step": 3600 }, { "epoch": 0.5430962974134681, "grad_norm": 0.24544494870168712, "learning_rate": 2.4460131931668664e-07, "loss": 1.5922, "step": 3601 }, { "epoch": 0.5432471156021417, "grad_norm": 0.2567332109666076, "learning_rate": 2.4449568512522367e-07, "loss": 1.5531, "step": 3602 }, { "epoch": 0.5433979337908151, "grad_norm": 0.26764572837580614, "learning_rate": 2.4439005778286785e-07, "loss": 1.5531, "step": 3603 }, { "epoch": 0.5435487519794887, "grad_norm": 0.3361771588722897, "learning_rate": 2.4428443731333545e-07, "loss": 1.6069, "step": 3604 }, { "epoch": 0.5436995701681623, "grad_norm": 0.2841049813886416, "learning_rate": 2.441788237403414e-07, "loss": 1.5296, "step": 3605 }, { "epoch": 0.5438503883568359, "grad_norm": 0.2545745313697678, "learning_rate": 2.440732170875991e-07, "loss": 1.5929, "step": 3606 }, { "epoch": 0.5440012065455094, "grad_norm": 0.25337715389509585, "learning_rate": 2.439676173788202e-07, "loss": 1.5714, "step": 3607 }, { "epoch": 0.5441520247341829, "grad_norm": 0.2446314960927492, "learning_rate": 2.43862024637715e-07, "loss": 1.5205, "step": 3608 }, { "epoch": 0.5443028429228565, "grad_norm": 0.2583744288497405, "learning_rate": 2.43756438887992e-07, "loss": 1.5997, "step": 3609 }, { "epoch": 0.54445366111153, "grad_norm": 0.2520486322377621, "learning_rate": 2.4365086015335853e-07, "loss": 1.5467, "step": 3610 }, { "epoch": 0.5446044793002036, "grad_norm": 0.2531732482880286, "learning_rate": 2.4354528845751986e-07, "loss": 1.5206, "step": 3611 }, { "epoch": 0.5447552974888772, "grad_norm": 0.2483682771543181, "learning_rate": 2.4343972382418e-07, "loss": 1.6046, "step": 3612 }, { "epoch": 0.5449061156775508, "grad_norm": 0.24439864855994065, "learning_rate": 2.4333416627704136e-07, "loss": 1.4808, "step": 3613 }, { "epoch": 0.5450569338662242, "grad_norm": 0.3600198536008531, "learning_rate": 2.4322861583980447e-07, "loss": 1.529, "step": 3614 }, { "epoch": 0.5452077520548978, "grad_norm": 0.2490329476442969, "learning_rate": 2.431230725361687e-07, "loss": 1.542, "step": 3615 }, { "epoch": 0.5453585702435714, "grad_norm": 0.2473844578166116, "learning_rate": 2.430175363898315e-07, "loss": 1.5114, "step": 3616 }, { "epoch": 0.545509388432245, "grad_norm": 0.23918089282027064, "learning_rate": 2.4291200742448873e-07, "loss": 1.5595, "step": 3617 }, { "epoch": 0.5456602066209185, "grad_norm": 0.23739710855690835, "learning_rate": 2.4280648566383486e-07, "loss": 1.5684, "step": 3618 }, { "epoch": 0.545811024809592, "grad_norm": 0.2486525885290028, "learning_rate": 2.427009711315626e-07, "loss": 1.5525, "step": 3619 }, { "epoch": 0.5459618429982656, "grad_norm": 0.24953932351497338, "learning_rate": 2.425954638513629e-07, "loss": 1.5736, "step": 3620 }, { "epoch": 0.5461126611869391, "grad_norm": 0.31864413884135867, "learning_rate": 2.424899638469253e-07, "loss": 1.5432, "step": 3621 }, { "epoch": 0.5462634793756127, "grad_norm": 0.2517109482883882, "learning_rate": 2.4238447114193757e-07, "loss": 1.5364, "step": 3622 }, { "epoch": 0.5464142975642863, "grad_norm": 0.24347151696480387, "learning_rate": 2.4227898576008596e-07, "loss": 1.5847, "step": 3623 }, { "epoch": 0.5465651157529599, "grad_norm": 0.23677989847203543, "learning_rate": 2.4217350772505503e-07, "loss": 1.4737, "step": 3624 }, { "epoch": 0.5467159339416333, "grad_norm": 0.2468086290035077, "learning_rate": 2.4206803706052757e-07, "loss": 1.5888, "step": 3625 }, { "epoch": 0.5468667521303069, "grad_norm": 0.2606802458058856, "learning_rate": 2.4196257379018485e-07, "loss": 1.5506, "step": 3626 }, { "epoch": 0.5470175703189805, "grad_norm": 0.23904823371329273, "learning_rate": 2.4185711793770655e-07, "loss": 1.5732, "step": 3627 }, { "epoch": 0.547168388507654, "grad_norm": 0.36528586683365055, "learning_rate": 2.4175166952677045e-07, "loss": 1.5987, "step": 3628 }, { "epoch": 0.5473192066963276, "grad_norm": 0.25159845930818714, "learning_rate": 2.416462285810529e-07, "loss": 1.5513, "step": 3629 }, { "epoch": 0.5474700248850012, "grad_norm": 0.24853920826770132, "learning_rate": 2.415407951242283e-07, "loss": 1.5624, "step": 3630 }, { "epoch": 0.5476208430736746, "grad_norm": 0.23867638541766906, "learning_rate": 2.414353691799698e-07, "loss": 1.6519, "step": 3631 }, { "epoch": 0.5477716612623482, "grad_norm": 0.28298887384204163, "learning_rate": 2.413299507719483e-07, "loss": 1.6201, "step": 3632 }, { "epoch": 0.5479224794510218, "grad_norm": 0.2506146374102195, "learning_rate": 2.412245399238335e-07, "loss": 1.5457, "step": 3633 }, { "epoch": 0.5480732976396954, "grad_norm": 0.24805360609832733, "learning_rate": 2.411191366592931e-07, "loss": 1.6349, "step": 3634 }, { "epoch": 0.5482241158283689, "grad_norm": 0.25391443799219615, "learning_rate": 2.4101374100199315e-07, "loss": 1.5485, "step": 3635 }, { "epoch": 0.5483749340170424, "grad_norm": 0.2979419731401153, "learning_rate": 2.409083529755983e-07, "loss": 1.5457, "step": 3636 }, { "epoch": 0.548525752205716, "grad_norm": 0.25061725613115277, "learning_rate": 2.40802972603771e-07, "loss": 1.5062, "step": 3637 }, { "epoch": 0.5486765703943896, "grad_norm": 0.3675582934544481, "learning_rate": 2.406975999101723e-07, "loss": 1.514, "step": 3638 }, { "epoch": 0.5488273885830631, "grad_norm": 0.2337542475694023, "learning_rate": 2.4059223491846135e-07, "loss": 1.5166, "step": 3639 }, { "epoch": 0.5489782067717367, "grad_norm": 0.25198011405401805, "learning_rate": 2.4048687765229574e-07, "loss": 1.5625, "step": 3640 }, { "epoch": 0.5491290249604103, "grad_norm": 0.24638371098340794, "learning_rate": 2.403815281353313e-07, "loss": 1.578, "step": 3641 }, { "epoch": 0.5492798431490837, "grad_norm": 0.2563153278271117, "learning_rate": 2.402761863912219e-07, "loss": 1.6488, "step": 3642 }, { "epoch": 0.5494306613377573, "grad_norm": 0.2580106370971055, "learning_rate": 2.4017085244362e-07, "loss": 1.6231, "step": 3643 }, { "epoch": 0.5495814795264309, "grad_norm": 0.24455256267634917, "learning_rate": 2.4006552631617604e-07, "loss": 1.5983, "step": 3644 }, { "epoch": 0.5497322977151045, "grad_norm": 0.39325417509774485, "learning_rate": 2.399602080325388e-07, "loss": 1.5503, "step": 3645 }, { "epoch": 0.549883115903778, "grad_norm": 0.2441326961183391, "learning_rate": 2.3985489761635543e-07, "loss": 1.5835, "step": 3646 }, { "epoch": 0.5500339340924516, "grad_norm": 0.26732110100009343, "learning_rate": 2.3974959509127095e-07, "loss": 1.5572, "step": 3647 }, { "epoch": 0.5501847522811251, "grad_norm": 0.2643667705732118, "learning_rate": 2.3964430048092903e-07, "loss": 1.5744, "step": 3648 }, { "epoch": 0.5503355704697986, "grad_norm": 0.25634976898777334, "learning_rate": 2.3953901380897137e-07, "loss": 1.5818, "step": 3649 }, { "epoch": 0.5504863886584722, "grad_norm": 0.24104439553659085, "learning_rate": 2.3943373509903776e-07, "loss": 1.5567, "step": 3650 }, { "epoch": 0.5506372068471458, "grad_norm": 0.27330546395370564, "learning_rate": 2.393284643747665e-07, "loss": 1.5864, "step": 3651 }, { "epoch": 0.5507880250358194, "grad_norm": 0.24567026735933592, "learning_rate": 2.3922320165979384e-07, "loss": 1.5429, "step": 3652 }, { "epoch": 0.5509388432244928, "grad_norm": 0.25877285545727613, "learning_rate": 2.3911794697775427e-07, "loss": 1.5658, "step": 3653 }, { "epoch": 0.5510896614131664, "grad_norm": 0.2829892531211758, "learning_rate": 2.390127003522807e-07, "loss": 1.5716, "step": 3654 }, { "epoch": 0.55124047960184, "grad_norm": 0.26420734964587744, "learning_rate": 2.3890746180700393e-07, "loss": 1.5965, "step": 3655 }, { "epoch": 0.5513912977905135, "grad_norm": 0.2813925312118151, "learning_rate": 2.388022313655531e-07, "loss": 1.5952, "step": 3656 }, { "epoch": 0.5515421159791871, "grad_norm": 0.27421147776570304, "learning_rate": 2.3869700905155555e-07, "loss": 1.5298, "step": 3657 }, { "epoch": 0.5516929341678607, "grad_norm": 0.24612321380173793, "learning_rate": 2.385917948886367e-07, "loss": 1.5782, "step": 3658 }, { "epoch": 0.5518437523565342, "grad_norm": 0.3010225001430544, "learning_rate": 2.384865889004203e-07, "loss": 1.5167, "step": 3659 }, { "epoch": 0.5519945705452077, "grad_norm": 0.25818104168215283, "learning_rate": 2.3838139111052796e-07, "loss": 1.5353, "step": 3660 }, { "epoch": 0.5521453887338813, "grad_norm": 0.2594370849630129, "learning_rate": 2.3827620154257982e-07, "loss": 1.5772, "step": 3661 }, { "epoch": 0.5522962069225549, "grad_norm": 0.2581055323738375, "learning_rate": 2.3817102022019399e-07, "loss": 1.5492, "step": 3662 }, { "epoch": 0.5524470251112285, "grad_norm": 0.380152434425543, "learning_rate": 2.3806584716698663e-07, "loss": 1.5527, "step": 3663 }, { "epoch": 0.5525978432999019, "grad_norm": 0.24856846332942859, "learning_rate": 2.379606824065723e-07, "loss": 1.5547, "step": 3664 }, { "epoch": 0.5527486614885755, "grad_norm": 0.2497454185942363, "learning_rate": 2.3785552596256343e-07, "loss": 1.5996, "step": 3665 }, { "epoch": 0.5528994796772491, "grad_norm": 0.31191475381343176, "learning_rate": 2.377503778585707e-07, "loss": 1.5698, "step": 3666 }, { "epoch": 0.5530502978659226, "grad_norm": 0.25142526486928735, "learning_rate": 2.3764523811820308e-07, "loss": 1.5149, "step": 3667 }, { "epoch": 0.5532011160545962, "grad_norm": 0.27740314564154706, "learning_rate": 2.375401067650673e-07, "loss": 1.5717, "step": 3668 }, { "epoch": 0.5533519342432698, "grad_norm": 0.2549316680761266, "learning_rate": 2.3743498382276855e-07, "loss": 1.5578, "step": 3669 }, { "epoch": 0.5535027524319432, "grad_norm": 0.23933092939208836, "learning_rate": 2.3732986931490996e-07, "loss": 1.5491, "step": 3670 }, { "epoch": 0.5536535706206168, "grad_norm": 0.3834995115356936, "learning_rate": 2.3722476326509272e-07, "loss": 1.5602, "step": 3671 }, { "epoch": 0.5538043888092904, "grad_norm": 0.37334563762423256, "learning_rate": 2.3711966569691634e-07, "loss": 1.6239, "step": 3672 }, { "epoch": 0.553955206997964, "grad_norm": 0.2395634198444132, "learning_rate": 2.3701457663397812e-07, "loss": 1.5575, "step": 3673 }, { "epoch": 0.5541060251866375, "grad_norm": 0.26102920902338, "learning_rate": 2.3690949609987375e-07, "loss": 1.5986, "step": 3674 }, { "epoch": 0.5542568433753111, "grad_norm": 0.8757806837521017, "learning_rate": 2.3680442411819684e-07, "loss": 1.5603, "step": 3675 }, { "epoch": 0.5544076615639846, "grad_norm": 0.3097578413639605, "learning_rate": 2.3669936071253905e-07, "loss": 1.5871, "step": 3676 }, { "epoch": 0.5545584797526582, "grad_norm": 0.2415676421450433, "learning_rate": 2.3659430590649032e-07, "loss": 1.6083, "step": 3677 }, { "epoch": 0.5547092979413317, "grad_norm": 0.25008839522250964, "learning_rate": 2.364892597236383e-07, "loss": 1.5743, "step": 3678 }, { "epoch": 0.5548601161300053, "grad_norm": 0.28816170962884513, "learning_rate": 2.3638422218756903e-07, "loss": 1.5044, "step": 3679 }, { "epoch": 0.5550109343186789, "grad_norm": 0.259771854122331, "learning_rate": 2.3627919332186657e-07, "loss": 1.517, "step": 3680 }, { "epoch": 0.5551617525073523, "grad_norm": 0.2921177162408171, "learning_rate": 2.3617417315011285e-07, "loss": 1.5945, "step": 3681 }, { "epoch": 0.5553125706960259, "grad_norm": 0.25867029639801753, "learning_rate": 2.3606916169588804e-07, "loss": 1.5291, "step": 3682 }, { "epoch": 0.5554633888846995, "grad_norm": 0.2432258692914303, "learning_rate": 2.359641589827701e-07, "loss": 1.5735, "step": 3683 }, { "epoch": 0.5556142070733731, "grad_norm": 0.25144434374534613, "learning_rate": 2.358591650343354e-07, "loss": 1.5869, "step": 3684 }, { "epoch": 0.5557650252620466, "grad_norm": 0.2548083114866012, "learning_rate": 2.357541798741581e-07, "loss": 1.4551, "step": 3685 }, { "epoch": 0.5559158434507202, "grad_norm": 0.2580466394469013, "learning_rate": 2.3564920352581025e-07, "loss": 1.5541, "step": 3686 }, { "epoch": 0.5560666616393937, "grad_norm": 0.24127002626442975, "learning_rate": 2.355442360128624e-07, "loss": 1.5381, "step": 3687 }, { "epoch": 0.5562174798280672, "grad_norm": 0.24253121564118715, "learning_rate": 2.3543927735888248e-07, "loss": 1.5691, "step": 3688 }, { "epoch": 0.5563682980167408, "grad_norm": 0.2569879120797617, "learning_rate": 2.35334327587437e-07, "loss": 1.5691, "step": 3689 }, { "epoch": 0.5565191162054144, "grad_norm": 0.23527231150376013, "learning_rate": 2.352293867220902e-07, "loss": 1.5975, "step": 3690 }, { "epoch": 0.556669934394088, "grad_norm": 0.25058550713662187, "learning_rate": 2.3512445478640425e-07, "loss": 1.5556, "step": 3691 }, { "epoch": 0.5568207525827615, "grad_norm": 0.2446541392046704, "learning_rate": 2.3501953180393958e-07, "loss": 1.6593, "step": 3692 }, { "epoch": 0.556971570771435, "grad_norm": 0.25115483375925146, "learning_rate": 2.3491461779825433e-07, "loss": 1.5412, "step": 3693 }, { "epoch": 0.5571223889601086, "grad_norm": 0.2520219702329995, "learning_rate": 2.348097127929048e-07, "loss": 1.5735, "step": 3694 }, { "epoch": 0.5572732071487821, "grad_norm": 0.2594579956511486, "learning_rate": 2.3470481681144526e-07, "loss": 1.5407, "step": 3695 }, { "epoch": 0.5574240253374557, "grad_norm": 0.2632419593584636, "learning_rate": 2.3459992987742783e-07, "loss": 1.5122, "step": 3696 }, { "epoch": 0.5575748435261293, "grad_norm": 0.24066331709923033, "learning_rate": 2.3449505201440272e-07, "loss": 1.5542, "step": 3697 }, { "epoch": 0.5577256617148028, "grad_norm": 0.2670785975264236, "learning_rate": 2.3439018324591815e-07, "loss": 1.5664, "step": 3698 }, { "epoch": 0.5578764799034763, "grad_norm": 0.2566681108304461, "learning_rate": 2.3428532359552006e-07, "loss": 1.4981, "step": 3699 }, { "epoch": 0.5580272980921499, "grad_norm": 0.3193962578119453, "learning_rate": 2.341804730867526e-07, "loss": 1.5459, "step": 3700 }, { "epoch": 0.5581781162808235, "grad_norm": 0.24338695588518144, "learning_rate": 2.3407563174315764e-07, "loss": 1.6626, "step": 3701 }, { "epoch": 0.558328934469497, "grad_norm": 0.26531266120238695, "learning_rate": 2.3397079958827527e-07, "loss": 1.5394, "step": 3702 }, { "epoch": 0.5584797526581706, "grad_norm": 0.25190397601521103, "learning_rate": 2.3386597664564335e-07, "loss": 1.5861, "step": 3703 }, { "epoch": 0.5586305708468441, "grad_norm": 0.2446769652456106, "learning_rate": 2.337611629387975e-07, "loss": 1.5025, "step": 3704 }, { "epoch": 0.5587813890355177, "grad_norm": 0.25365522039409677, "learning_rate": 2.3365635849127163e-07, "loss": 1.5669, "step": 3705 }, { "epoch": 0.5589322072241912, "grad_norm": 0.25214964867523426, "learning_rate": 2.3355156332659743e-07, "loss": 1.5295, "step": 3706 }, { "epoch": 0.5590830254128648, "grad_norm": 0.2653556219624336, "learning_rate": 2.3344677746830426e-07, "loss": 1.5761, "step": 3707 }, { "epoch": 0.5592338436015384, "grad_norm": 0.2480112212862208, "learning_rate": 2.333420009399198e-07, "loss": 1.5422, "step": 3708 }, { "epoch": 0.5593846617902118, "grad_norm": 0.24583448225831397, "learning_rate": 2.3323723376496928e-07, "loss": 1.5547, "step": 3709 }, { "epoch": 0.5595354799788854, "grad_norm": 0.26179366693271394, "learning_rate": 2.3313247596697604e-07, "loss": 1.5524, "step": 3710 }, { "epoch": 0.559686298167559, "grad_norm": 0.2810880095236748, "learning_rate": 2.330277275694613e-07, "loss": 1.5449, "step": 3711 }, { "epoch": 0.5598371163562326, "grad_norm": 0.28865368321909335, "learning_rate": 2.3292298859594407e-07, "loss": 1.5111, "step": 3712 }, { "epoch": 0.5599879345449061, "grad_norm": 0.28220169530861183, "learning_rate": 2.3281825906994134e-07, "loss": 1.5148, "step": 3713 }, { "epoch": 0.5601387527335797, "grad_norm": 0.8659363874227264, "learning_rate": 2.3271353901496786e-07, "loss": 1.5034, "step": 3714 }, { "epoch": 0.5602895709222532, "grad_norm": 0.24744321943675116, "learning_rate": 2.3260882845453643e-07, "loss": 1.5889, "step": 3715 }, { "epoch": 0.5604403891109268, "grad_norm": 0.256305715829819, "learning_rate": 2.325041274121576e-07, "loss": 1.5833, "step": 3716 }, { "epoch": 0.5605912072996003, "grad_norm": 0.26064801863351983, "learning_rate": 2.3239943591133975e-07, "loss": 1.5298, "step": 3717 }, { "epoch": 0.5607420254882739, "grad_norm": 0.30004312330576116, "learning_rate": 2.3229475397558918e-07, "loss": 1.6146, "step": 3718 }, { "epoch": 0.5608928436769475, "grad_norm": 0.24133194064638744, "learning_rate": 2.3219008162841014e-07, "loss": 1.5237, "step": 3719 }, { "epoch": 0.561043661865621, "grad_norm": 0.2921675486036917, "learning_rate": 2.3208541889330446e-07, "loss": 1.5492, "step": 3720 }, { "epoch": 0.5611944800542945, "grad_norm": 0.2555644620420599, "learning_rate": 2.3198076579377208e-07, "loss": 1.4912, "step": 3721 }, { "epoch": 0.5613452982429681, "grad_norm": 0.24578747399944298, "learning_rate": 2.3187612235331067e-07, "loss": 1.6188, "step": 3722 }, { "epoch": 0.5614961164316417, "grad_norm": 0.26423511951666934, "learning_rate": 2.3177148859541563e-07, "loss": 1.5885, "step": 3723 }, { "epoch": 0.5616469346203152, "grad_norm": 0.2857468589383398, "learning_rate": 2.3166686454358042e-07, "loss": 1.5766, "step": 3724 }, { "epoch": 0.5617977528089888, "grad_norm": 0.25503409850321557, "learning_rate": 2.3156225022129604e-07, "loss": 1.6271, "step": 3725 }, { "epoch": 0.5619485709976623, "grad_norm": 0.24713449835532042, "learning_rate": 2.3145764565205162e-07, "loss": 1.5937, "step": 3726 }, { "epoch": 0.5620993891863358, "grad_norm": 0.24672820850266292, "learning_rate": 2.3135305085933376e-07, "loss": 1.5304, "step": 3727 }, { "epoch": 0.5622502073750094, "grad_norm": 0.2517702226825073, "learning_rate": 2.3124846586662717e-07, "loss": 1.5882, "step": 3728 }, { "epoch": 0.562401025563683, "grad_norm": 0.25707062152749144, "learning_rate": 2.3114389069741418e-07, "loss": 1.5541, "step": 3729 }, { "epoch": 0.5625518437523566, "grad_norm": 0.26060636056281195, "learning_rate": 2.3103932537517495e-07, "loss": 1.5432, "step": 3730 }, { "epoch": 0.5627026619410301, "grad_norm": 0.2389548776377199, "learning_rate": 2.3093476992338744e-07, "loss": 1.5368, "step": 3731 }, { "epoch": 0.5628534801297036, "grad_norm": 0.2944855833008349, "learning_rate": 2.308302243655274e-07, "loss": 1.4542, "step": 3732 }, { "epoch": 0.5630042983183772, "grad_norm": 0.31719225087494396, "learning_rate": 2.3072568872506836e-07, "loss": 1.5473, "step": 3733 }, { "epoch": 0.5631551165070507, "grad_norm": 0.24864431603286263, "learning_rate": 2.3062116302548168e-07, "loss": 1.5101, "step": 3734 }, { "epoch": 0.5633059346957243, "grad_norm": 0.27002909684434634, "learning_rate": 2.3051664729023628e-07, "loss": 1.5838, "step": 3735 }, { "epoch": 0.5634567528843979, "grad_norm": 0.25453113539837574, "learning_rate": 2.3041214154279904e-07, "loss": 1.5544, "step": 3736 }, { "epoch": 0.5636075710730715, "grad_norm": 0.25301132100861035, "learning_rate": 2.3030764580663465e-07, "loss": 1.5701, "step": 3737 }, { "epoch": 0.5637583892617449, "grad_norm": 0.35461958294716706, "learning_rate": 2.302031601052053e-07, "loss": 1.5366, "step": 3738 }, { "epoch": 0.5639092074504185, "grad_norm": 0.256245998210831, "learning_rate": 2.3009868446197118e-07, "loss": 1.552, "step": 3739 }, { "epoch": 0.5640600256390921, "grad_norm": 0.24497442119592555, "learning_rate": 2.2999421890039005e-07, "loss": 1.5586, "step": 3740 }, { "epoch": 0.5642108438277657, "grad_norm": 0.25352179266618674, "learning_rate": 2.298897634439174e-07, "loss": 1.5696, "step": 3741 }, { "epoch": 0.5643616620164392, "grad_norm": 0.24358541538783612, "learning_rate": 2.2978531811600678e-07, "loss": 1.5355, "step": 3742 }, { "epoch": 0.5645124802051127, "grad_norm": 1.5337264757037743, "learning_rate": 2.296808829401089e-07, "loss": 1.5272, "step": 3743 }, { "epoch": 0.5646632983937863, "grad_norm": 0.24206462688464736, "learning_rate": 2.2957645793967274e-07, "loss": 1.5689, "step": 3744 }, { "epoch": 0.5648141165824598, "grad_norm": 0.27781314679882974, "learning_rate": 2.2947204313814456e-07, "loss": 1.6009, "step": 3745 }, { "epoch": 0.5649649347711334, "grad_norm": 0.8659978798118473, "learning_rate": 2.293676385589687e-07, "loss": 1.5952, "step": 3746 }, { "epoch": 0.565115752959807, "grad_norm": 0.7918369023779016, "learning_rate": 2.2926324422558691e-07, "loss": 1.4788, "step": 3747 }, { "epoch": 0.5652665711484806, "grad_norm": 0.2993582645953468, "learning_rate": 2.2915886016143875e-07, "loss": 1.5096, "step": 3748 }, { "epoch": 0.565417389337154, "grad_norm": 0.5632692862640049, "learning_rate": 2.2905448638996155e-07, "loss": 1.5725, "step": 3749 }, { "epoch": 0.5655682075258276, "grad_norm": 0.2518484130311377, "learning_rate": 2.2895012293459025e-07, "loss": 1.5019, "step": 3750 }, { "epoch": 0.5657190257145012, "grad_norm": 0.26977467638308, "learning_rate": 2.2884576981875747e-07, "loss": 1.5284, "step": 3751 }, { "epoch": 0.5658698439031747, "grad_norm": 0.25014250802983745, "learning_rate": 2.2874142706589356e-07, "loss": 1.5237, "step": 3752 }, { "epoch": 0.5660206620918483, "grad_norm": 0.24500688644136545, "learning_rate": 2.2863709469942643e-07, "loss": 1.57, "step": 3753 }, { "epoch": 0.5661714802805218, "grad_norm": 0.3467656004210753, "learning_rate": 2.2853277274278177e-07, "loss": 1.5379, "step": 3754 }, { "epoch": 0.5663222984691954, "grad_norm": 0.33106795675581974, "learning_rate": 2.2842846121938293e-07, "loss": 1.6001, "step": 3755 }, { "epoch": 0.5664731166578689, "grad_norm": 0.5379001383327541, "learning_rate": 2.2832416015265087e-07, "loss": 1.5852, "step": 3756 }, { "epoch": 0.5666239348465425, "grad_norm": 0.5228243427759623, "learning_rate": 2.2821986956600419e-07, "loss": 1.5025, "step": 3757 }, { "epoch": 0.5667747530352161, "grad_norm": 0.2395550928183924, "learning_rate": 2.281155894828592e-07, "loss": 1.5079, "step": 3758 }, { "epoch": 0.5669255712238896, "grad_norm": 0.2571340962141452, "learning_rate": 2.280113199266297e-07, "loss": 1.5392, "step": 3759 }, { "epoch": 0.5670763894125631, "grad_norm": 0.27627938424007076, "learning_rate": 2.2790706092072742e-07, "loss": 1.577, "step": 3760 }, { "epoch": 0.5672272076012367, "grad_norm": 0.2735106100048377, "learning_rate": 2.2780281248856148e-07, "loss": 1.5253, "step": 3761 }, { "epoch": 0.5673780257899103, "grad_norm": 0.24565529996766425, "learning_rate": 2.2769857465353864e-07, "loss": 1.5414, "step": 3762 }, { "epoch": 0.5675288439785838, "grad_norm": 0.25911738100127885, "learning_rate": 2.2759434743906337e-07, "loss": 1.5168, "step": 3763 }, { "epoch": 0.5676796621672574, "grad_norm": 0.26083242181688693, "learning_rate": 2.274901308685377e-07, "loss": 1.5698, "step": 3764 }, { "epoch": 0.567830480355931, "grad_norm": 0.253637562025555, "learning_rate": 2.273859249653613e-07, "loss": 1.5868, "step": 3765 }, { "epoch": 0.5679812985446044, "grad_norm": 0.24777184255839774, "learning_rate": 2.2728172975293135e-07, "loss": 1.5206, "step": 3766 }, { "epoch": 0.568132116733278, "grad_norm": 0.30430013138718626, "learning_rate": 2.2717754525464284e-07, "loss": 1.5847, "step": 3767 }, { "epoch": 0.5682829349219516, "grad_norm": 0.2771173974071153, "learning_rate": 2.2707337149388822e-07, "loss": 1.5594, "step": 3768 }, { "epoch": 0.5684337531106252, "grad_norm": 0.24679284719922462, "learning_rate": 2.2696920849405742e-07, "loss": 1.5829, "step": 3769 }, { "epoch": 0.5685845712992987, "grad_norm": 0.2541246356975005, "learning_rate": 2.2686505627853813e-07, "loss": 1.5748, "step": 3770 }, { "epoch": 0.5687353894879722, "grad_norm": 0.29219160832881896, "learning_rate": 2.2676091487071552e-07, "loss": 1.5848, "step": 3771 }, { "epoch": 0.5688862076766458, "grad_norm": 0.38956183827483026, "learning_rate": 2.266567842939724e-07, "loss": 1.5496, "step": 3772 }, { "epoch": 0.5690370258653193, "grad_norm": 0.2506659621824879, "learning_rate": 2.265526645716892e-07, "loss": 1.5326, "step": 3773 }, { "epoch": 0.5691878440539929, "grad_norm": 0.2679515485132347, "learning_rate": 2.2644855572724363e-07, "loss": 1.5572, "step": 3774 }, { "epoch": 0.5693386622426665, "grad_norm": 0.2579569141417791, "learning_rate": 2.2634445778401134e-07, "loss": 1.5749, "step": 3775 }, { "epoch": 0.5694894804313401, "grad_norm": 0.250645661902425, "learning_rate": 2.262403707653653e-07, "loss": 1.5957, "step": 3776 }, { "epoch": 0.5696402986200135, "grad_norm": 0.6242568522969492, "learning_rate": 2.2613629469467598e-07, "loss": 1.6399, "step": 3777 }, { "epoch": 0.5697911168086871, "grad_norm": 0.24429435285297052, "learning_rate": 2.2603222959531165e-07, "loss": 1.5638, "step": 3778 }, { "epoch": 0.5699419349973607, "grad_norm": 0.26010481690712356, "learning_rate": 2.2592817549063786e-07, "loss": 1.5593, "step": 3779 }, { "epoch": 0.5700927531860343, "grad_norm": 0.24442274456846919, "learning_rate": 2.2582413240401778e-07, "loss": 1.5124, "step": 3780 }, { "epoch": 0.5702435713747078, "grad_norm": 0.2423865915093295, "learning_rate": 2.2572010035881216e-07, "loss": 1.5678, "step": 3781 }, { "epoch": 0.5703943895633814, "grad_norm": 0.25110373866973795, "learning_rate": 2.2561607937837917e-07, "loss": 1.5525, "step": 3782 }, { "epoch": 0.5705452077520549, "grad_norm": 0.25941477053110074, "learning_rate": 2.2551206948607465e-07, "loss": 1.6247, "step": 3783 }, { "epoch": 0.5706960259407284, "grad_norm": 0.2577626131281058, "learning_rate": 2.2540807070525164e-07, "loss": 1.5449, "step": 3784 }, { "epoch": 0.570846844129402, "grad_norm": 0.24301635040369168, "learning_rate": 2.2530408305926113e-07, "loss": 1.49, "step": 3785 }, { "epoch": 0.5709976623180756, "grad_norm": 0.2630750900131234, "learning_rate": 2.2520010657145127e-07, "loss": 1.5623, "step": 3786 }, { "epoch": 0.5711484805067492, "grad_norm": 0.24920024775254504, "learning_rate": 2.250961412651678e-07, "loss": 1.6354, "step": 3787 }, { "epoch": 0.5712992986954226, "grad_norm": 0.2787379757097677, "learning_rate": 2.2499218716375402e-07, "loss": 1.578, "step": 3788 }, { "epoch": 0.5714501168840962, "grad_norm": 0.2534437872582565, "learning_rate": 2.2488824429055055e-07, "loss": 1.5623, "step": 3789 }, { "epoch": 0.5716009350727698, "grad_norm": 0.26891581634427875, "learning_rate": 2.2478431266889565e-07, "loss": 1.5497, "step": 3790 }, { "epoch": 0.5717517532614433, "grad_norm": 0.24354950530761477, "learning_rate": 2.2468039232212506e-07, "loss": 1.5665, "step": 3791 }, { "epoch": 0.5719025714501169, "grad_norm": 0.26367058200099974, "learning_rate": 2.245764832735718e-07, "loss": 1.6466, "step": 3792 }, { "epoch": 0.5720533896387905, "grad_norm": 0.2873797075741446, "learning_rate": 2.2447258554656657e-07, "loss": 1.5831, "step": 3793 }, { "epoch": 0.572204207827464, "grad_norm": 0.25887675924205883, "learning_rate": 2.243686991644374e-07, "loss": 1.5302, "step": 3794 }, { "epoch": 0.5723550260161375, "grad_norm": 0.24775356955881006, "learning_rate": 2.2426482415050984e-07, "loss": 1.5206, "step": 3795 }, { "epoch": 0.5725058442048111, "grad_norm": 0.2609640185767476, "learning_rate": 2.2416096052810684e-07, "loss": 1.6058, "step": 3796 }, { "epoch": 0.5726566623934847, "grad_norm": 0.2541846246845802, "learning_rate": 2.2405710832054874e-07, "loss": 1.6014, "step": 3797 }, { "epoch": 0.5728074805821582, "grad_norm": 0.24060146349943168, "learning_rate": 2.2395326755115347e-07, "loss": 1.5616, "step": 3798 }, { "epoch": 0.5729582987708317, "grad_norm": 0.2699389201639666, "learning_rate": 2.2384943824323632e-07, "loss": 1.4967, "step": 3799 }, { "epoch": 0.5731091169595053, "grad_norm": 0.2590233871468359, "learning_rate": 2.2374562042010986e-07, "loss": 1.5868, "step": 3800 }, { "epoch": 0.5732599351481789, "grad_norm": 0.2500973098596572, "learning_rate": 2.2364181410508441e-07, "loss": 1.5975, "step": 3801 }, { "epoch": 0.5734107533368524, "grad_norm": 0.24656651461348164, "learning_rate": 2.235380193214673e-07, "loss": 1.5572, "step": 3802 }, { "epoch": 0.573561571525526, "grad_norm": 0.29060161703836196, "learning_rate": 2.2343423609256368e-07, "loss": 1.5095, "step": 3803 }, { "epoch": 0.5737123897141996, "grad_norm": 0.25440387679712745, "learning_rate": 2.233304644416758e-07, "loss": 1.5419, "step": 3804 }, { "epoch": 0.573863207902873, "grad_norm": 0.25823801268964536, "learning_rate": 2.2322670439210339e-07, "loss": 1.5465, "step": 3805 }, { "epoch": 0.5740140260915466, "grad_norm": 0.28298262211840103, "learning_rate": 2.231229559671437e-07, "loss": 1.479, "step": 3806 }, { "epoch": 0.5741648442802202, "grad_norm": 0.2474312053351583, "learning_rate": 2.230192191900912e-07, "loss": 1.577, "step": 3807 }, { "epoch": 0.5743156624688938, "grad_norm": 0.2577236463799376, "learning_rate": 2.2291549408423788e-07, "loss": 1.5948, "step": 3808 }, { "epoch": 0.5744664806575673, "grad_norm": 0.2405755948683268, "learning_rate": 2.22811780672873e-07, "loss": 1.5225, "step": 3809 }, { "epoch": 0.5746172988462409, "grad_norm": 0.27403598796818784, "learning_rate": 2.2270807897928317e-07, "loss": 1.6322, "step": 3810 }, { "epoch": 0.5747681170349144, "grad_norm": 0.24616655581097505, "learning_rate": 2.2260438902675254e-07, "loss": 1.5112, "step": 3811 }, { "epoch": 0.574918935223588, "grad_norm": 0.26025260418138463, "learning_rate": 2.2250071083856259e-07, "loss": 1.5589, "step": 3812 }, { "epoch": 0.5750697534122615, "grad_norm": 0.2533320997307876, "learning_rate": 2.223970444379919e-07, "loss": 1.578, "step": 3813 }, { "epoch": 0.5752205716009351, "grad_norm": 0.6182973675590029, "learning_rate": 2.222933898483168e-07, "loss": 1.5124, "step": 3814 }, { "epoch": 0.5753713897896087, "grad_norm": 0.2607767696742802, "learning_rate": 2.2218974709281058e-07, "loss": 1.5282, "step": 3815 }, { "epoch": 0.5755222079782821, "grad_norm": 1.746288228687721, "learning_rate": 2.2208611619474414e-07, "loss": 1.5935, "step": 3816 }, { "epoch": 0.5756730261669557, "grad_norm": 0.2781940675906223, "learning_rate": 2.219824971773857e-07, "loss": 1.5586, "step": 3817 }, { "epoch": 0.5758238443556293, "grad_norm": 0.2666272791271762, "learning_rate": 2.2187889006400067e-07, "loss": 1.5286, "step": 3818 }, { "epoch": 0.5759746625443029, "grad_norm": 0.24416966759777037, "learning_rate": 2.2177529487785184e-07, "loss": 1.5812, "step": 3819 }, { "epoch": 0.5761254807329764, "grad_norm": 0.4255803719795962, "learning_rate": 2.2167171164219945e-07, "loss": 1.5936, "step": 3820 }, { "epoch": 0.57627629892165, "grad_norm": 0.28413577943475554, "learning_rate": 2.215681403803008e-07, "loss": 1.5585, "step": 3821 }, { "epoch": 0.5764271171103235, "grad_norm": 0.45129086175053656, "learning_rate": 2.2146458111541083e-07, "loss": 1.5197, "step": 3822 }, { "epoch": 0.576577935298997, "grad_norm": 0.2698861269924893, "learning_rate": 2.2136103387078148e-07, "loss": 1.5499, "step": 3823 }, { "epoch": 0.5767287534876706, "grad_norm": 0.24600013904480686, "learning_rate": 2.2125749866966214e-07, "loss": 1.5971, "step": 3824 }, { "epoch": 0.5768795716763442, "grad_norm": 0.24129779490920517, "learning_rate": 2.2115397553529958e-07, "loss": 1.5806, "step": 3825 }, { "epoch": 0.5770303898650178, "grad_norm": 0.29701513447079625, "learning_rate": 2.2105046449093762e-07, "loss": 1.6232, "step": 3826 }, { "epoch": 0.5771812080536913, "grad_norm": 0.7615362675290088, "learning_rate": 2.2094696555981763e-07, "loss": 1.5411, "step": 3827 }, { "epoch": 0.5773320262423648, "grad_norm": 0.2413044971417661, "learning_rate": 2.2084347876517796e-07, "loss": 1.4892, "step": 3828 }, { "epoch": 0.5774828444310384, "grad_norm": 0.24703204099069684, "learning_rate": 2.2074000413025457e-07, "loss": 1.5733, "step": 3829 }, { "epoch": 0.5776336626197119, "grad_norm": 0.25368532396740934, "learning_rate": 2.206365416782805e-07, "loss": 1.6596, "step": 3830 }, { "epoch": 0.5777844808083855, "grad_norm": 0.2737552950500867, "learning_rate": 2.20533091432486e-07, "loss": 1.6007, "step": 3831 }, { "epoch": 0.5779352989970591, "grad_norm": 0.24841815713314006, "learning_rate": 2.204296534160988e-07, "loss": 1.5218, "step": 3832 }, { "epoch": 0.5780861171857326, "grad_norm": 0.2555444899340187, "learning_rate": 2.2032622765234355e-07, "loss": 1.5312, "step": 3833 }, { "epoch": 0.5782369353744061, "grad_norm": 0.25121951884051724, "learning_rate": 2.202228141644426e-07, "loss": 1.5525, "step": 3834 }, { "epoch": 0.5783877535630797, "grad_norm": 0.2826540264763573, "learning_rate": 2.2011941297561514e-07, "loss": 1.561, "step": 3835 }, { "epoch": 0.5785385717517533, "grad_norm": 0.24319345438007933, "learning_rate": 2.200160241090777e-07, "loss": 1.5542, "step": 3836 }, { "epoch": 0.5786893899404268, "grad_norm": 0.2542961847052196, "learning_rate": 2.1991264758804424e-07, "loss": 1.5275, "step": 3837 }, { "epoch": 0.5788402081291004, "grad_norm": 0.2626142301729653, "learning_rate": 2.1980928343572569e-07, "loss": 1.5552, "step": 3838 }, { "epoch": 0.5789910263177739, "grad_norm": 0.2716722650906108, "learning_rate": 2.1970593167533035e-07, "loss": 1.5288, "step": 3839 }, { "epoch": 0.5791418445064475, "grad_norm": 0.24899949433714882, "learning_rate": 2.196025923300636e-07, "loss": 1.5404, "step": 3840 }, { "epoch": 0.579292662695121, "grad_norm": 0.25354168178863573, "learning_rate": 2.1949926542312835e-07, "loss": 1.5423, "step": 3841 }, { "epoch": 0.5794434808837946, "grad_norm": 0.23854730518745276, "learning_rate": 2.1939595097772435e-07, "loss": 1.5197, "step": 3842 }, { "epoch": 0.5795942990724682, "grad_norm": 0.2430693716168873, "learning_rate": 2.1929264901704863e-07, "loss": 1.4972, "step": 3843 }, { "epoch": 0.5797451172611416, "grad_norm": 0.24301961919786527, "learning_rate": 2.1918935956429568e-07, "loss": 1.5387, "step": 3844 }, { "epoch": 0.5798959354498152, "grad_norm": 0.2524188769060236, "learning_rate": 2.1908608264265688e-07, "loss": 1.4952, "step": 3845 }, { "epoch": 0.5800467536384888, "grad_norm": 0.263113455281279, "learning_rate": 2.1898281827532088e-07, "loss": 1.5065, "step": 3846 }, { "epoch": 0.5801975718271624, "grad_norm": 0.2705355413432033, "learning_rate": 2.1887956648547362e-07, "loss": 1.6136, "step": 3847 }, { "epoch": 0.5803483900158359, "grad_norm": 0.2970875459648603, "learning_rate": 2.1877632729629802e-07, "loss": 1.5707, "step": 3848 }, { "epoch": 0.5804992082045095, "grad_norm": 0.2593001241846887, "learning_rate": 2.1867310073097438e-07, "loss": 1.5253, "step": 3849 }, { "epoch": 0.580650026393183, "grad_norm": 0.2569704845840797, "learning_rate": 2.1856988681268014e-07, "loss": 1.5755, "step": 3850 }, { "epoch": 0.5808008445818565, "grad_norm": 0.2555578317437223, "learning_rate": 2.1846668556458964e-07, "loss": 1.5297, "step": 3851 }, { "epoch": 0.5809516627705301, "grad_norm": 0.24976164652961383, "learning_rate": 2.1836349700987472e-07, "loss": 1.5918, "step": 3852 }, { "epoch": 0.5811024809592037, "grad_norm": 0.25412501981978447, "learning_rate": 2.182603211717041e-07, "loss": 1.5667, "step": 3853 }, { "epoch": 0.5812532991478773, "grad_norm": 0.24643386448414026, "learning_rate": 2.1815715807324387e-07, "loss": 1.5016, "step": 3854 }, { "epoch": 0.5814041173365508, "grad_norm": 0.291967919185025, "learning_rate": 2.180540077376571e-07, "loss": 1.5208, "step": 3855 }, { "epoch": 0.5815549355252243, "grad_norm": 0.2607898661757403, "learning_rate": 2.17950870188104e-07, "loss": 1.6324, "step": 3856 }, { "epoch": 0.5817057537138979, "grad_norm": 0.3237350230718295, "learning_rate": 2.1784774544774205e-07, "loss": 1.5811, "step": 3857 }, { "epoch": 0.5818565719025715, "grad_norm": 0.25465789432054664, "learning_rate": 2.177446335397256e-07, "loss": 1.6522, "step": 3858 }, { "epoch": 0.582007390091245, "grad_norm": 0.25650077735120685, "learning_rate": 2.1764153448720646e-07, "loss": 1.5092, "step": 3859 }, { "epoch": 0.5821582082799186, "grad_norm": 0.2556223473650482, "learning_rate": 2.1753844831333335e-07, "loss": 1.4928, "step": 3860 }, { "epoch": 0.5823090264685921, "grad_norm": 0.2647938026212078, "learning_rate": 2.1743537504125199e-07, "loss": 1.6117, "step": 3861 }, { "epoch": 0.5824598446572656, "grad_norm": 0.28386285783857657, "learning_rate": 2.173323146941054e-07, "loss": 1.5646, "step": 3862 }, { "epoch": 0.5826106628459392, "grad_norm": 0.25605121004622367, "learning_rate": 2.1722926729503365e-07, "loss": 1.655, "step": 3863 }, { "epoch": 0.5827614810346128, "grad_norm": 0.347275229460256, "learning_rate": 2.171262328671738e-07, "loss": 1.6725, "step": 3864 }, { "epoch": 0.5829122992232864, "grad_norm": 0.25644950265792565, "learning_rate": 2.170232114336602e-07, "loss": 1.563, "step": 3865 }, { "epoch": 0.5830631174119599, "grad_norm": 0.2514527888535762, "learning_rate": 2.169202030176241e-07, "loss": 1.6298, "step": 3866 }, { "epoch": 0.5832139356006334, "grad_norm": 0.25577460819356496, "learning_rate": 2.1681720764219384e-07, "loss": 1.5562, "step": 3867 }, { "epoch": 0.583364753789307, "grad_norm": 0.26046606047108944, "learning_rate": 2.1671422533049504e-07, "loss": 1.4611, "step": 3868 }, { "epoch": 0.5835155719779805, "grad_norm": 0.23726670710428088, "learning_rate": 2.1661125610565006e-07, "loss": 1.5789, "step": 3869 }, { "epoch": 0.5836663901666541, "grad_norm": 0.2669902319815929, "learning_rate": 2.1650829999077856e-07, "loss": 1.6146, "step": 3870 }, { "epoch": 0.5838172083553277, "grad_norm": 0.24610296922448516, "learning_rate": 2.1640535700899714e-07, "loss": 1.5298, "step": 3871 }, { "epoch": 0.5839680265440013, "grad_norm": 0.2590328572776037, "learning_rate": 2.1630242718341957e-07, "loss": 1.5387, "step": 3872 }, { "epoch": 0.5841188447326747, "grad_norm": 0.2628650412755619, "learning_rate": 2.1619951053715657e-07, "loss": 1.6035, "step": 3873 }, { "epoch": 0.5842696629213483, "grad_norm": 0.3192328346924656, "learning_rate": 2.160966070933159e-07, "loss": 1.5955, "step": 3874 }, { "epoch": 0.5844204811100219, "grad_norm": 0.2526909035008379, "learning_rate": 2.1599371687500245e-07, "loss": 1.5603, "step": 3875 }, { "epoch": 0.5845712992986954, "grad_norm": 0.26297610180789677, "learning_rate": 2.1589083990531793e-07, "loss": 1.5714, "step": 3876 }, { "epoch": 0.584722117487369, "grad_norm": 0.3217290084400424, "learning_rate": 2.1578797620736133e-07, "loss": 1.5672, "step": 3877 }, { "epoch": 0.5848729356760425, "grad_norm": 0.3162169538147139, "learning_rate": 2.1568512580422855e-07, "loss": 1.4855, "step": 3878 }, { "epoch": 0.5850237538647161, "grad_norm": 0.2516293552966027, "learning_rate": 2.155822887190124e-07, "loss": 1.5633, "step": 3879 }, { "epoch": 0.5851745720533896, "grad_norm": 0.29894170183508184, "learning_rate": 2.154794649748029e-07, "loss": 1.5917, "step": 3880 }, { "epoch": 0.5853253902420632, "grad_norm": 0.2682481575960108, "learning_rate": 2.1537665459468696e-07, "loss": 1.5779, "step": 3881 }, { "epoch": 0.5854762084307368, "grad_norm": 0.25103199650586117, "learning_rate": 2.152738576017485e-07, "loss": 1.5169, "step": 3882 }, { "epoch": 0.5856270266194104, "grad_norm": 0.25201957555043975, "learning_rate": 2.1517107401906842e-07, "loss": 1.5541, "step": 3883 }, { "epoch": 0.5857778448080838, "grad_norm": 0.8384191903433962, "learning_rate": 2.1506830386972458e-07, "loss": 1.587, "step": 3884 }, { "epoch": 0.5859286629967574, "grad_norm": 0.2488293562239644, "learning_rate": 2.1496554717679194e-07, "loss": 1.5721, "step": 3885 }, { "epoch": 0.586079481185431, "grad_norm": 0.2368679296727655, "learning_rate": 2.1486280396334239e-07, "loss": 1.5324, "step": 3886 }, { "epoch": 0.5862302993741045, "grad_norm": 0.24897976851256423, "learning_rate": 2.147600742524447e-07, "loss": 1.54, "step": 3887 }, { "epoch": 0.5863811175627781, "grad_norm": 0.3002451271596729, "learning_rate": 2.1465735806716478e-07, "loss": 1.5172, "step": 3888 }, { "epoch": 0.5865319357514516, "grad_norm": 0.24850770735669614, "learning_rate": 2.1455465543056527e-07, "loss": 1.5526, "step": 3889 }, { "epoch": 0.5866827539401251, "grad_norm": 0.4619670490548418, "learning_rate": 2.14451966365706e-07, "loss": 1.5955, "step": 3890 }, { "epoch": 0.5868335721287987, "grad_norm": 0.43523556266765423, "learning_rate": 2.1434929089564375e-07, "loss": 1.6064, "step": 3891 }, { "epoch": 0.5869843903174723, "grad_norm": 0.23674067638948412, "learning_rate": 2.1424662904343188e-07, "loss": 1.5527, "step": 3892 }, { "epoch": 0.5871352085061459, "grad_norm": 0.25585996624482177, "learning_rate": 2.1414398083212114e-07, "loss": 1.6381, "step": 3893 }, { "epoch": 0.5872860266948194, "grad_norm": 0.3467997914699444, "learning_rate": 2.1404134628475912e-07, "loss": 1.4996, "step": 3894 }, { "epoch": 0.5874368448834929, "grad_norm": 0.27717865650431744, "learning_rate": 2.139387254243901e-07, "loss": 1.542, "step": 3895 }, { "epoch": 0.5875876630721665, "grad_norm": 0.24240472352347092, "learning_rate": 2.1383611827405557e-07, "loss": 1.5739, "step": 3896 }, { "epoch": 0.58773848126084, "grad_norm": 0.2550797345859492, "learning_rate": 2.1373352485679375e-07, "loss": 1.5743, "step": 3897 }, { "epoch": 0.5878892994495136, "grad_norm": 0.25890179909780026, "learning_rate": 2.1363094519563984e-07, "loss": 1.6189, "step": 3898 }, { "epoch": 0.5880401176381872, "grad_norm": 0.3744524860252214, "learning_rate": 2.1352837931362605e-07, "loss": 1.615, "step": 3899 }, { "epoch": 0.5881909358268608, "grad_norm": 0.27406405524412275, "learning_rate": 2.134258272337814e-07, "loss": 1.5882, "step": 3900 }, { "epoch": 0.5883417540155342, "grad_norm": 0.25618129568741593, "learning_rate": 2.133232889791317e-07, "loss": 1.5719, "step": 3901 }, { "epoch": 0.5884925722042078, "grad_norm": 0.5344404015159859, "learning_rate": 2.1322076457269983e-07, "loss": 1.5345, "step": 3902 }, { "epoch": 0.5886433903928814, "grad_norm": 0.27394922769442587, "learning_rate": 2.1311825403750554e-07, "loss": 1.5363, "step": 3903 }, { "epoch": 0.588794208581555, "grad_norm": 0.2516339285494544, "learning_rate": 2.1301575739656547e-07, "loss": 1.5915, "step": 3904 }, { "epoch": 0.5889450267702285, "grad_norm": 0.25531568449077874, "learning_rate": 2.12913274672893e-07, "loss": 1.6105, "step": 3905 }, { "epoch": 0.589095844958902, "grad_norm": 0.25052078610351264, "learning_rate": 2.1281080588949851e-07, "loss": 1.6057, "step": 3906 }, { "epoch": 0.5892466631475756, "grad_norm": 0.2875496049149756, "learning_rate": 2.127083510693893e-07, "loss": 1.4833, "step": 3907 }, { "epoch": 0.5893974813362491, "grad_norm": 0.26066187862838874, "learning_rate": 2.126059102355694e-07, "loss": 1.5166, "step": 3908 }, { "epoch": 0.5895482995249227, "grad_norm": 0.24066311208593907, "learning_rate": 2.1250348341103974e-07, "loss": 1.5614, "step": 3909 }, { "epoch": 0.5896991177135963, "grad_norm": 0.2527211765328934, "learning_rate": 2.1240107061879813e-07, "loss": 1.5479, "step": 3910 }, { "epoch": 0.5898499359022699, "grad_norm": 0.24507118283919913, "learning_rate": 2.1229867188183932e-07, "loss": 1.5131, "step": 3911 }, { "epoch": 0.5900007540909433, "grad_norm": 0.24660986393765302, "learning_rate": 2.1219628722315468e-07, "loss": 1.5353, "step": 3912 }, { "epoch": 0.5901515722796169, "grad_norm": 0.27213896453072617, "learning_rate": 2.1209391666573262e-07, "loss": 1.5589, "step": 3913 }, { "epoch": 0.5903023904682905, "grad_norm": 0.26620695172163944, "learning_rate": 2.1199156023255838e-07, "loss": 1.5554, "step": 3914 }, { "epoch": 0.590453208656964, "grad_norm": 0.25728273695387793, "learning_rate": 2.1188921794661373e-07, "loss": 1.535, "step": 3915 }, { "epoch": 0.5906040268456376, "grad_norm": 0.2489799042047875, "learning_rate": 2.1178688983087774e-07, "loss": 1.5579, "step": 3916 }, { "epoch": 0.5907548450343112, "grad_norm": 0.25706920572749437, "learning_rate": 2.1168457590832594e-07, "loss": 1.6016, "step": 3917 }, { "epoch": 0.5909056632229847, "grad_norm": 0.25777099994890706, "learning_rate": 2.1158227620193077e-07, "loss": 1.5538, "step": 3918 }, { "epoch": 0.5910564814116582, "grad_norm": 0.8221472064317823, "learning_rate": 2.1147999073466156e-07, "loss": 1.5954, "step": 3919 }, { "epoch": 0.5912072996003318, "grad_norm": 0.2636602003466519, "learning_rate": 2.1137771952948425e-07, "loss": 1.5922, "step": 3920 }, { "epoch": 0.5913581177890054, "grad_norm": 0.2476386765784861, "learning_rate": 2.1127546260936185e-07, "loss": 1.5343, "step": 3921 }, { "epoch": 0.591508935977679, "grad_norm": 0.32515634295585055, "learning_rate": 2.1117321999725396e-07, "loss": 1.5778, "step": 3922 }, { "epoch": 0.5916597541663524, "grad_norm": 0.26955179218666003, "learning_rate": 2.1107099171611704e-07, "loss": 1.604, "step": 3923 }, { "epoch": 0.591810572355026, "grad_norm": 0.26142776224103237, "learning_rate": 2.109687777889042e-07, "loss": 1.5378, "step": 3924 }, { "epoch": 0.5919613905436996, "grad_norm": 0.25235168002590846, "learning_rate": 2.1086657823856567e-07, "loss": 1.5882, "step": 3925 }, { "epoch": 0.5921122087323731, "grad_norm": 0.3268180416647588, "learning_rate": 2.1076439308804806e-07, "loss": 1.5245, "step": 3926 }, { "epoch": 0.5922630269210467, "grad_norm": 0.25122857572439317, "learning_rate": 2.1066222236029496e-07, "loss": 1.53, "step": 3927 }, { "epoch": 0.5924138451097203, "grad_norm": 0.2518402680010454, "learning_rate": 2.105600660782466e-07, "loss": 1.5644, "step": 3928 }, { "epoch": 0.5925646632983937, "grad_norm": 0.6317106903102638, "learning_rate": 2.1045792426484016e-07, "loss": 1.5885, "step": 3929 }, { "epoch": 0.5927154814870673, "grad_norm": 0.23885732382701932, "learning_rate": 2.1035579694300936e-07, "loss": 1.5454, "step": 3930 }, { "epoch": 0.5928662996757409, "grad_norm": 0.24169605774079195, "learning_rate": 2.102536841356848e-07, "loss": 1.5475, "step": 3931 }, { "epoch": 0.5930171178644145, "grad_norm": 0.2496554472407174, "learning_rate": 2.1015158586579385e-07, "loss": 1.5379, "step": 3932 }, { "epoch": 0.593167936053088, "grad_norm": 0.23619446956023743, "learning_rate": 2.1004950215626032e-07, "loss": 1.5518, "step": 3933 }, { "epoch": 0.5933187542417615, "grad_norm": 0.24512528344650525, "learning_rate": 2.0994743303000522e-07, "loss": 1.5636, "step": 3934 }, { "epoch": 0.5934695724304351, "grad_norm": 0.29386974334811894, "learning_rate": 2.0984537850994595e-07, "loss": 1.5043, "step": 3935 }, { "epoch": 0.5936203906191087, "grad_norm": 0.24607778184964832, "learning_rate": 2.0974333861899668e-07, "loss": 1.6379, "step": 3936 }, { "epoch": 0.5937712088077822, "grad_norm": 0.2537985300545943, "learning_rate": 2.0964131338006836e-07, "loss": 1.5556, "step": 3937 }, { "epoch": 0.5939220269964558, "grad_norm": 0.2938624465908146, "learning_rate": 2.0953930281606875e-07, "loss": 1.6214, "step": 3938 }, { "epoch": 0.5940728451851294, "grad_norm": 0.2575265487300153, "learning_rate": 2.0943730694990197e-07, "loss": 1.5301, "step": 3939 }, { "epoch": 0.5942236633738028, "grad_norm": 0.25235912105681335, "learning_rate": 2.0933532580446923e-07, "loss": 1.5302, "step": 3940 }, { "epoch": 0.5943744815624764, "grad_norm": 0.26834261526057135, "learning_rate": 2.0923335940266817e-07, "loss": 1.4762, "step": 3941 }, { "epoch": 0.59452529975115, "grad_norm": 0.274112552646199, "learning_rate": 2.091314077673933e-07, "loss": 1.5406, "step": 3942 }, { "epoch": 0.5946761179398236, "grad_norm": 0.25979791178173584, "learning_rate": 2.0902947092153566e-07, "loss": 1.549, "step": 3943 }, { "epoch": 0.5948269361284971, "grad_norm": 0.24123835415858874, "learning_rate": 2.0892754888798308e-07, "loss": 1.5285, "step": 3944 }, { "epoch": 0.5949777543171707, "grad_norm": 0.2833528756405838, "learning_rate": 2.0882564168962007e-07, "loss": 1.5937, "step": 3945 }, { "epoch": 0.5951285725058442, "grad_norm": 0.2558238217311259, "learning_rate": 2.0872374934932764e-07, "loss": 1.5854, "step": 3946 }, { "epoch": 0.5952793906945177, "grad_norm": 0.28536413947594585, "learning_rate": 2.086218718899837e-07, "loss": 1.5424, "step": 3947 }, { "epoch": 0.5954302088831913, "grad_norm": 0.28409176854031953, "learning_rate": 2.0852000933446268e-07, "loss": 1.6091, "step": 3948 }, { "epoch": 0.5955810270718649, "grad_norm": 0.2638455821143265, "learning_rate": 2.0841816170563558e-07, "loss": 1.4921, "step": 3949 }, { "epoch": 0.5957318452605385, "grad_norm": 0.2544125404436739, "learning_rate": 2.0831632902637036e-07, "loss": 1.5437, "step": 3950 }, { "epoch": 0.5958826634492119, "grad_norm": 0.3329482205826952, "learning_rate": 2.0821451131953128e-07, "loss": 1.5533, "step": 3951 }, { "epoch": 0.5960334816378855, "grad_norm": 0.24549932350258638, "learning_rate": 2.0811270860797942e-07, "loss": 1.5775, "step": 3952 }, { "epoch": 0.5961842998265591, "grad_norm": 0.24280489344835257, "learning_rate": 2.0801092091457252e-07, "loss": 1.532, "step": 3953 }, { "epoch": 0.5963351180152326, "grad_norm": 0.2662286734750068, "learning_rate": 2.0790914826216466e-07, "loss": 1.506, "step": 3954 }, { "epoch": 0.5964859362039062, "grad_norm": 0.23685295151761798, "learning_rate": 2.0780739067360696e-07, "loss": 1.602, "step": 3955 }, { "epoch": 0.5966367543925798, "grad_norm": 0.24488927836775568, "learning_rate": 2.07705648171747e-07, "loss": 1.6221, "step": 3956 }, { "epoch": 0.5967875725812533, "grad_norm": 0.30208706647893807, "learning_rate": 2.0760392077942885e-07, "loss": 1.5745, "step": 3957 }, { "epoch": 0.5969383907699268, "grad_norm": 0.27614397463665463, "learning_rate": 2.0750220851949324e-07, "loss": 1.5399, "step": 3958 }, { "epoch": 0.5970892089586004, "grad_norm": 0.24605552384885424, "learning_rate": 2.074005114147775e-07, "loss": 1.4549, "step": 3959 }, { "epoch": 0.597240027147274, "grad_norm": 0.2891713160108519, "learning_rate": 2.0729882948811573e-07, "loss": 1.5384, "step": 3960 }, { "epoch": 0.5973908453359476, "grad_norm": 0.2635344708459758, "learning_rate": 2.0719716276233844e-07, "loss": 1.5661, "step": 3961 }, { "epoch": 0.597541663524621, "grad_norm": 0.24959908716759605, "learning_rate": 2.070955112602727e-07, "loss": 1.5404, "step": 3962 }, { "epoch": 0.5976924817132946, "grad_norm": 0.2530298085425925, "learning_rate": 2.069938750047423e-07, "loss": 1.5452, "step": 3963 }, { "epoch": 0.5978432999019682, "grad_norm": 0.26999816586828845, "learning_rate": 2.0689225401856754e-07, "loss": 1.4886, "step": 3964 }, { "epoch": 0.5979941180906417, "grad_norm": 0.24534664683760726, "learning_rate": 2.0679064832456522e-07, "loss": 1.5164, "step": 3965 }, { "epoch": 0.5981449362793153, "grad_norm": 0.25437673218079726, "learning_rate": 2.0668905794554886e-07, "loss": 1.5683, "step": 3966 }, { "epoch": 0.5982957544679889, "grad_norm": 0.24109383361353498, "learning_rate": 2.0658748290432842e-07, "loss": 1.5497, "step": 3967 }, { "epoch": 0.5984465726566623, "grad_norm": 0.2510432340228446, "learning_rate": 2.0648592322371045e-07, "loss": 1.5323, "step": 3968 }, { "epoch": 0.5985973908453359, "grad_norm": 0.2606103881986699, "learning_rate": 2.0638437892649814e-07, "loss": 1.6385, "step": 3969 }, { "epoch": 0.5987482090340095, "grad_norm": 0.2555271402806662, "learning_rate": 2.0628285003549095e-07, "loss": 1.6314, "step": 3970 }, { "epoch": 0.5988990272226831, "grad_norm": 0.24917813790419496, "learning_rate": 2.061813365734853e-07, "loss": 1.5815, "step": 3971 }, { "epoch": 0.5990498454113566, "grad_norm": 0.2490699589494564, "learning_rate": 2.0607983856327376e-07, "loss": 1.5371, "step": 3972 }, { "epoch": 0.5992006636000302, "grad_norm": 0.25179033656276567, "learning_rate": 2.0597835602764558e-07, "loss": 1.632, "step": 3973 }, { "epoch": 0.5993514817887037, "grad_norm": 0.3379907290009692, "learning_rate": 2.058768889893867e-07, "loss": 1.6252, "step": 3974 }, { "epoch": 0.5995022999773773, "grad_norm": 0.2509877314699308, "learning_rate": 2.0577543747127928e-07, "loss": 1.6167, "step": 3975 }, { "epoch": 0.5996531181660508, "grad_norm": 0.325693924804953, "learning_rate": 2.056740014961022e-07, "loss": 1.5144, "step": 3976 }, { "epoch": 0.5998039363547244, "grad_norm": 0.2966623552278896, "learning_rate": 2.0557258108663077e-07, "loss": 1.603, "step": 3977 }, { "epoch": 0.599954754543398, "grad_norm": 0.2412681146127633, "learning_rate": 2.0547117626563687e-07, "loss": 1.5428, "step": 3978 }, { "epoch": 0.6001055727320714, "grad_norm": 0.2453501162309635, "learning_rate": 2.0536978705588876e-07, "loss": 1.6435, "step": 3979 }, { "epoch": 0.600256390920745, "grad_norm": 0.2757165902222816, "learning_rate": 2.0526841348015134e-07, "loss": 1.5337, "step": 3980 }, { "epoch": 0.6004072091094186, "grad_norm": 0.6674557603985731, "learning_rate": 2.0516705556118586e-07, "loss": 1.4952, "step": 3981 }, { "epoch": 0.6005580272980922, "grad_norm": 0.2597450747834464, "learning_rate": 2.0506571332175027e-07, "loss": 1.5062, "step": 3982 }, { "epoch": 0.6007088454867657, "grad_norm": 0.26841242776289004, "learning_rate": 2.0496438678459866e-07, "loss": 1.598, "step": 3983 }, { "epoch": 0.6008596636754393, "grad_norm": 0.2836316492514382, "learning_rate": 2.0486307597248192e-07, "loss": 1.5803, "step": 3984 }, { "epoch": 0.6010104818641128, "grad_norm": 0.24917175257954272, "learning_rate": 2.0476178090814716e-07, "loss": 1.4978, "step": 3985 }, { "epoch": 0.6011613000527863, "grad_norm": 0.3142681695486749, "learning_rate": 2.0466050161433822e-07, "loss": 1.594, "step": 3986 }, { "epoch": 0.6013121182414599, "grad_norm": 0.24425064910154232, "learning_rate": 2.0455923811379516e-07, "loss": 1.5431, "step": 3987 }, { "epoch": 0.6014629364301335, "grad_norm": 0.26105924672835484, "learning_rate": 2.044579904292546e-07, "loss": 1.5695, "step": 3988 }, { "epoch": 0.6016137546188071, "grad_norm": 0.3693476273425113, "learning_rate": 2.0435675858344964e-07, "loss": 1.5336, "step": 3989 }, { "epoch": 0.6017645728074806, "grad_norm": 0.2862201779770262, "learning_rate": 2.0425554259910972e-07, "loss": 1.5806, "step": 3990 }, { "epoch": 0.6019153909961541, "grad_norm": 0.26953985052228474, "learning_rate": 2.0415434249896073e-07, "loss": 1.5872, "step": 3991 }, { "epoch": 0.6020662091848277, "grad_norm": 0.2670217912865157, "learning_rate": 2.0405315830572524e-07, "loss": 1.5602, "step": 3992 }, { "epoch": 0.6022170273735012, "grad_norm": 0.2799420163304301, "learning_rate": 2.0395199004212177e-07, "loss": 1.5834, "step": 3993 }, { "epoch": 0.6023678455621748, "grad_norm": 0.2622691870431353, "learning_rate": 2.038508377308657e-07, "loss": 1.5618, "step": 3994 }, { "epoch": 0.6025186637508484, "grad_norm": 0.24447491424701062, "learning_rate": 2.0374970139466874e-07, "loss": 1.5483, "step": 3995 }, { "epoch": 0.6026694819395219, "grad_norm": 0.25278398998840484, "learning_rate": 2.0364858105623877e-07, "loss": 1.5617, "step": 3996 }, { "epoch": 0.6028203001281954, "grad_norm": 0.3259578925650183, "learning_rate": 2.0354747673828037e-07, "loss": 1.647, "step": 3997 }, { "epoch": 0.602971118316869, "grad_norm": 0.2537031308112455, "learning_rate": 2.0344638846349438e-07, "loss": 1.5762, "step": 3998 }, { "epoch": 0.6031219365055426, "grad_norm": 0.3072984206056757, "learning_rate": 2.03345316254578e-07, "loss": 1.5335, "step": 3999 }, { "epoch": 0.6032727546942162, "grad_norm": 0.23685479682711305, "learning_rate": 2.0324426013422503e-07, "loss": 1.5777, "step": 4000 }, { "epoch": 0.6034235728828897, "grad_norm": 0.24977825998686742, "learning_rate": 2.0314322012512536e-07, "loss": 1.5163, "step": 4001 }, { "epoch": 0.6035743910715632, "grad_norm": 0.24791281456150227, "learning_rate": 2.0304219624996548e-07, "loss": 1.5596, "step": 4002 }, { "epoch": 0.6037252092602368, "grad_norm": 0.2446735385952689, "learning_rate": 2.0294118853142816e-07, "loss": 1.6651, "step": 4003 }, { "epoch": 0.6038760274489103, "grad_norm": 0.25592372644780303, "learning_rate": 2.0284019699219261e-07, "loss": 1.5529, "step": 4004 }, { "epoch": 0.6040268456375839, "grad_norm": 0.24918388591171547, "learning_rate": 2.027392216549344e-07, "loss": 1.5621, "step": 4005 }, { "epoch": 0.6041776638262575, "grad_norm": 0.24863410118812368, "learning_rate": 2.0263826254232535e-07, "loss": 1.6028, "step": 4006 }, { "epoch": 0.604328482014931, "grad_norm": 0.2579812114037648, "learning_rate": 2.025373196770338e-07, "loss": 1.6146, "step": 4007 }, { "epoch": 0.6044793002036045, "grad_norm": 0.25664846405886427, "learning_rate": 2.0243639308172433e-07, "loss": 1.6093, "step": 4008 }, { "epoch": 0.6046301183922781, "grad_norm": 0.2468779556543227, "learning_rate": 2.0233548277905788e-07, "loss": 1.5615, "step": 4009 }, { "epoch": 0.6047809365809517, "grad_norm": 0.24596144695452257, "learning_rate": 2.0223458879169176e-07, "loss": 1.5465, "step": 4010 }, { "epoch": 0.6049317547696252, "grad_norm": 1.3899818192881177, "learning_rate": 2.021337111422796e-07, "loss": 1.5285, "step": 4011 }, { "epoch": 0.6050825729582988, "grad_norm": 0.297674427115539, "learning_rate": 2.020328498534714e-07, "loss": 1.5641, "step": 4012 }, { "epoch": 0.6052333911469723, "grad_norm": 0.25306946958463306, "learning_rate": 2.019320049479135e-07, "loss": 1.5092, "step": 4013 }, { "epoch": 0.6053842093356459, "grad_norm": 0.2622726337402047, "learning_rate": 2.0183117644824844e-07, "loss": 1.5436, "step": 4014 }, { "epoch": 0.6055350275243194, "grad_norm": 0.25230067375888293, "learning_rate": 2.017303643771152e-07, "loss": 1.5619, "step": 4015 }, { "epoch": 0.605685845712993, "grad_norm": 0.24811629245500297, "learning_rate": 2.0162956875714892e-07, "loss": 1.5575, "step": 4016 }, { "epoch": 0.6058366639016666, "grad_norm": 0.25084692060505426, "learning_rate": 2.015287896109813e-07, "loss": 1.5078, "step": 4017 }, { "epoch": 0.6059874820903401, "grad_norm": 0.27316312077471316, "learning_rate": 2.014280269612401e-07, "loss": 1.6608, "step": 4018 }, { "epoch": 0.6061383002790136, "grad_norm": 0.25027999660426603, "learning_rate": 2.013272808305495e-07, "loss": 1.5155, "step": 4019 }, { "epoch": 0.6062891184676872, "grad_norm": 0.27436443112357595, "learning_rate": 2.0122655124153003e-07, "loss": 1.5238, "step": 4020 }, { "epoch": 0.6064399366563608, "grad_norm": 0.29271848922103766, "learning_rate": 2.0112583821679823e-07, "loss": 1.607, "step": 4021 }, { "epoch": 0.6065907548450343, "grad_norm": 0.27458871893177195, "learning_rate": 2.0102514177896723e-07, "loss": 1.5824, "step": 4022 }, { "epoch": 0.6067415730337079, "grad_norm": 0.2871429383848267, "learning_rate": 2.0092446195064632e-07, "loss": 1.595, "step": 4023 }, { "epoch": 0.6068923912223814, "grad_norm": 0.24987310951695893, "learning_rate": 2.0082379875444095e-07, "loss": 1.5381, "step": 4024 }, { "epoch": 0.6070432094110549, "grad_norm": 0.288569270895411, "learning_rate": 2.0072315221295304e-07, "loss": 1.6683, "step": 4025 }, { "epoch": 0.6071940275997285, "grad_norm": 0.2602470848370709, "learning_rate": 2.006225223487807e-07, "loss": 1.5364, "step": 4026 }, { "epoch": 0.6073448457884021, "grad_norm": 0.292666506670437, "learning_rate": 2.0052190918451812e-07, "loss": 1.538, "step": 4027 }, { "epoch": 0.6074956639770757, "grad_norm": 0.25021384441705, "learning_rate": 2.0042131274275603e-07, "loss": 1.5249, "step": 4028 }, { "epoch": 0.6076464821657492, "grad_norm": 0.25042684593440523, "learning_rate": 2.0032073304608116e-07, "loss": 1.5947, "step": 4029 }, { "epoch": 0.6077973003544227, "grad_norm": 0.260428866319, "learning_rate": 2.0022017011707659e-07, "loss": 1.5524, "step": 4030 }, { "epoch": 0.6079481185430963, "grad_norm": 0.25240729324746547, "learning_rate": 2.0011962397832176e-07, "loss": 1.5652, "step": 4031 }, { "epoch": 0.6080989367317698, "grad_norm": 0.26500416833054574, "learning_rate": 2.0001909465239204e-07, "loss": 1.5401, "step": 4032 }, { "epoch": 0.6082497549204434, "grad_norm": 0.24371524278783477, "learning_rate": 1.9991858216185926e-07, "loss": 1.4963, "step": 4033 }, { "epoch": 0.608400573109117, "grad_norm": 0.24616987432015056, "learning_rate": 1.998180865292914e-07, "loss": 1.5697, "step": 4034 }, { "epoch": 0.6085513912977906, "grad_norm": 0.24365652566052104, "learning_rate": 1.9971760777725267e-07, "loss": 1.5275, "step": 4035 }, { "epoch": 0.608702209486464, "grad_norm": 0.3418343694455725, "learning_rate": 1.9961714592830343e-07, "loss": 1.5607, "step": 4036 }, { "epoch": 0.6088530276751376, "grad_norm": 0.24263105443052232, "learning_rate": 1.995167010050003e-07, "loss": 1.6039, "step": 4037 }, { "epoch": 0.6090038458638112, "grad_norm": 0.24789525388839767, "learning_rate": 1.994162730298961e-07, "loss": 1.5246, "step": 4038 }, { "epoch": 0.6091546640524848, "grad_norm": 0.2519990536823345, "learning_rate": 1.9931586202553996e-07, "loss": 1.5572, "step": 4039 }, { "epoch": 0.6093054822411583, "grad_norm": 0.24938178893953983, "learning_rate": 1.9921546801447686e-07, "loss": 1.5402, "step": 4040 }, { "epoch": 0.6094563004298318, "grad_norm": 0.2519735953711437, "learning_rate": 1.9911509101924822e-07, "loss": 1.5569, "step": 4041 }, { "epoch": 0.6096071186185054, "grad_norm": 0.25966022656929916, "learning_rate": 1.9901473106239175e-07, "loss": 1.5073, "step": 4042 }, { "epoch": 0.6097579368071789, "grad_norm": 0.25092742548521996, "learning_rate": 1.9891438816644114e-07, "loss": 1.5618, "step": 4043 }, { "epoch": 0.6099087549958525, "grad_norm": 0.24640923677698962, "learning_rate": 1.9881406235392616e-07, "loss": 1.5218, "step": 4044 }, { "epoch": 0.6100595731845261, "grad_norm": 0.24504171106026, "learning_rate": 1.9871375364737303e-07, "loss": 1.6179, "step": 4045 }, { "epoch": 0.6102103913731997, "grad_norm": 0.2515031678751933, "learning_rate": 1.986134620693039e-07, "loss": 1.5529, "step": 4046 }, { "epoch": 0.6103612095618731, "grad_norm": 0.24473526970393245, "learning_rate": 1.9851318764223707e-07, "loss": 1.4985, "step": 4047 }, { "epoch": 0.6105120277505467, "grad_norm": 0.278452783492205, "learning_rate": 1.9841293038868728e-07, "loss": 1.6168, "step": 4048 }, { "epoch": 0.6106628459392203, "grad_norm": 0.27096654217276134, "learning_rate": 1.98312690331165e-07, "loss": 1.5433, "step": 4049 }, { "epoch": 0.6108136641278938, "grad_norm": 0.2585151663611356, "learning_rate": 1.9821246749217718e-07, "loss": 1.5119, "step": 4050 }, { "epoch": 0.6109644823165674, "grad_norm": 0.24595928472114498, "learning_rate": 1.9811226189422665e-07, "loss": 1.5151, "step": 4051 }, { "epoch": 0.6111153005052409, "grad_norm": 0.25664183667744295, "learning_rate": 1.9801207355981253e-07, "loss": 1.4884, "step": 4052 }, { "epoch": 0.6112661186939145, "grad_norm": 0.2819628569318754, "learning_rate": 1.979119025114301e-07, "loss": 1.5883, "step": 4053 }, { "epoch": 0.611416936882588, "grad_norm": 0.26122868180042536, "learning_rate": 1.9781174877157058e-07, "loss": 1.5647, "step": 4054 }, { "epoch": 0.6115677550712616, "grad_norm": 0.25351052367174465, "learning_rate": 1.9771161236272138e-07, "loss": 1.5438, "step": 4055 }, { "epoch": 0.6117185732599352, "grad_norm": 0.25176624911153417, "learning_rate": 1.9761149330736615e-07, "loss": 1.5529, "step": 4056 }, { "epoch": 0.6118693914486087, "grad_norm": 0.2658656205789021, "learning_rate": 1.9751139162798442e-07, "loss": 1.5326, "step": 4057 }, { "epoch": 0.6120202096372822, "grad_norm": 0.2441990909999017, "learning_rate": 1.9741130734705208e-07, "loss": 1.5739, "step": 4058 }, { "epoch": 0.6121710278259558, "grad_norm": 0.2585026208719849, "learning_rate": 1.9731124048704083e-07, "loss": 1.5382, "step": 4059 }, { "epoch": 0.6123218460146294, "grad_norm": 0.27435498431153166, "learning_rate": 1.9721119107041862e-07, "loss": 1.5223, "step": 4060 }, { "epoch": 0.6124726642033029, "grad_norm": 0.2544450887008572, "learning_rate": 1.9711115911964948e-07, "loss": 1.5327, "step": 4061 }, { "epoch": 0.6126234823919765, "grad_norm": 0.2613375063886286, "learning_rate": 1.970111446571935e-07, "loss": 1.5335, "step": 4062 }, { "epoch": 0.6127743005806501, "grad_norm": 0.2616005863266627, "learning_rate": 1.9691114770550683e-07, "loss": 1.5132, "step": 4063 }, { "epoch": 0.6129251187693235, "grad_norm": 0.24819046142554235, "learning_rate": 1.968111682870417e-07, "loss": 1.4915, "step": 4064 }, { "epoch": 0.6130759369579971, "grad_norm": 0.2406507972592332, "learning_rate": 1.9671120642424638e-07, "loss": 1.5525, "step": 4065 }, { "epoch": 0.6132267551466707, "grad_norm": 0.3982133511343078, "learning_rate": 1.9661126213956526e-07, "loss": 1.5404, "step": 4066 }, { "epoch": 0.6133775733353443, "grad_norm": 0.2461650672475944, "learning_rate": 1.9651133545543874e-07, "loss": 1.6025, "step": 4067 }, { "epoch": 0.6135283915240178, "grad_norm": 0.23898555321982162, "learning_rate": 1.964114263943032e-07, "loss": 1.5512, "step": 4068 }, { "epoch": 0.6136792097126913, "grad_norm": 0.27853965969912664, "learning_rate": 1.9631153497859121e-07, "loss": 1.5427, "step": 4069 }, { "epoch": 0.6138300279013649, "grad_norm": 0.42909251202956133, "learning_rate": 1.962116612307313e-07, "loss": 1.6012, "step": 4070 }, { "epoch": 0.6139808460900384, "grad_norm": 0.3696437855995362, "learning_rate": 1.9611180517314802e-07, "loss": 1.5802, "step": 4071 }, { "epoch": 0.614131664278712, "grad_norm": 0.2549663908066695, "learning_rate": 1.9601196682826193e-07, "loss": 1.6248, "step": 4072 }, { "epoch": 0.6142824824673856, "grad_norm": 0.3301196031328827, "learning_rate": 1.9591214621848962e-07, "loss": 1.5649, "step": 4073 }, { "epoch": 0.6144333006560592, "grad_norm": 0.2621416162410892, "learning_rate": 1.9581234336624392e-07, "loss": 1.5402, "step": 4074 }, { "epoch": 0.6145841188447326, "grad_norm": 0.25361749740991746, "learning_rate": 1.9571255829393318e-07, "loss": 1.5929, "step": 4075 }, { "epoch": 0.6147349370334062, "grad_norm": 0.2747034766804121, "learning_rate": 1.9561279102396233e-07, "loss": 1.5866, "step": 4076 }, { "epoch": 0.6148857552220798, "grad_norm": 0.25233153353526366, "learning_rate": 1.9551304157873182e-07, "loss": 1.6599, "step": 4077 }, { "epoch": 0.6150365734107534, "grad_norm": 0.25539884183733497, "learning_rate": 1.9541330998063843e-07, "loss": 1.5836, "step": 4078 }, { "epoch": 0.6151873915994269, "grad_norm": 0.2507081858447005, "learning_rate": 1.9531359625207477e-07, "loss": 1.5236, "step": 4079 }, { "epoch": 0.6153382097881005, "grad_norm": 0.2558971095723662, "learning_rate": 1.9521390041542945e-07, "loss": 1.5469, "step": 4080 }, { "epoch": 0.615489027976774, "grad_norm": 0.24583239296880544, "learning_rate": 1.9511422249308717e-07, "loss": 1.5651, "step": 4081 }, { "epoch": 0.6156398461654475, "grad_norm": 0.2707883188380288, "learning_rate": 1.9501456250742843e-07, "loss": 1.5377, "step": 4082 }, { "epoch": 0.6157906643541211, "grad_norm": 0.3134379839846514, "learning_rate": 1.9491492048082982e-07, "loss": 1.5443, "step": 4083 }, { "epoch": 0.6159414825427947, "grad_norm": 0.2681812828397381, "learning_rate": 1.9481529643566396e-07, "loss": 1.6049, "step": 4084 }, { "epoch": 0.6160923007314683, "grad_norm": 0.24977055392234673, "learning_rate": 1.9471569039429926e-07, "loss": 1.5978, "step": 4085 }, { "epoch": 0.6162431189201417, "grad_norm": 0.43051972962166635, "learning_rate": 1.9461610237910016e-07, "loss": 1.5634, "step": 4086 }, { "epoch": 0.6163939371088153, "grad_norm": 0.2428402746597223, "learning_rate": 1.9451653241242716e-07, "loss": 1.5482, "step": 4087 }, { "epoch": 0.6165447552974889, "grad_norm": 0.3218770038957548, "learning_rate": 1.9441698051663657e-07, "loss": 1.5731, "step": 4088 }, { "epoch": 0.6166955734861624, "grad_norm": 0.2800037101511921, "learning_rate": 1.9431744671408068e-07, "loss": 1.639, "step": 4089 }, { "epoch": 0.616846391674836, "grad_norm": 0.2654757134978187, "learning_rate": 1.9421793102710777e-07, "loss": 1.5251, "step": 4090 }, { "epoch": 0.6169972098635096, "grad_norm": 0.24389321581221143, "learning_rate": 1.941184334780619e-07, "loss": 1.5301, "step": 4091 }, { "epoch": 0.617148028052183, "grad_norm": 0.24763840870790535, "learning_rate": 1.940189540892833e-07, "loss": 1.5054, "step": 4092 }, { "epoch": 0.6172988462408566, "grad_norm": 0.2496724949164925, "learning_rate": 1.9391949288310794e-07, "loss": 1.5838, "step": 4093 }, { "epoch": 0.6174496644295302, "grad_norm": 0.24786919043003802, "learning_rate": 1.9382004988186778e-07, "loss": 1.527, "step": 4094 }, { "epoch": 0.6176004826182038, "grad_norm": 0.28112801249262676, "learning_rate": 1.9372062510789062e-07, "loss": 1.5374, "step": 4095 }, { "epoch": 0.6177513008068773, "grad_norm": 0.2515386545134396, "learning_rate": 1.9362121858350017e-07, "loss": 1.6068, "step": 4096 }, { "epoch": 0.6179021189955508, "grad_norm": 0.24586907780483525, "learning_rate": 1.9352183033101626e-07, "loss": 1.5445, "step": 4097 }, { "epoch": 0.6180529371842244, "grad_norm": 0.25770929010992843, "learning_rate": 1.934224603727543e-07, "loss": 1.607, "step": 4098 }, { "epoch": 0.618203755372898, "grad_norm": 0.28897863823031006, "learning_rate": 1.9332310873102576e-07, "loss": 1.544, "step": 4099 }, { "epoch": 0.6183545735615715, "grad_norm": 0.24925170298732696, "learning_rate": 1.9322377542813806e-07, "loss": 1.5623, "step": 4100 }, { "epoch": 0.6185053917502451, "grad_norm": 0.2685205241567941, "learning_rate": 1.931244604863943e-07, "loss": 1.5672, "step": 4101 }, { "epoch": 0.6186562099389187, "grad_norm": 0.2731422561837393, "learning_rate": 1.9302516392809377e-07, "loss": 1.525, "step": 4102 }, { "epoch": 0.6188070281275921, "grad_norm": 0.24643168544194732, "learning_rate": 1.9292588577553116e-07, "loss": 1.5496, "step": 4103 }, { "epoch": 0.6189578463162657, "grad_norm": 0.5197430988428763, "learning_rate": 1.9282662605099747e-07, "loss": 1.5146, "step": 4104 }, { "epoch": 0.6191086645049393, "grad_norm": 0.3065588217115942, "learning_rate": 1.9272738477677947e-07, "loss": 1.5909, "step": 4105 }, { "epoch": 0.6192594826936129, "grad_norm": 0.25905693240224775, "learning_rate": 1.9262816197515957e-07, "loss": 1.5024, "step": 4106 }, { "epoch": 0.6194103008822864, "grad_norm": 0.2770648457699646, "learning_rate": 1.9252895766841632e-07, "loss": 1.5263, "step": 4107 }, { "epoch": 0.61956111907096, "grad_norm": 0.26308592325563007, "learning_rate": 1.9242977187882381e-07, "loss": 1.5136, "step": 4108 }, { "epoch": 0.6197119372596335, "grad_norm": 0.24897013505706644, "learning_rate": 1.9233060462865226e-07, "loss": 1.5643, "step": 4109 }, { "epoch": 0.619862755448307, "grad_norm": 0.2551089624531133, "learning_rate": 1.9223145594016753e-07, "loss": 1.5402, "step": 4110 }, { "epoch": 0.6200135736369806, "grad_norm": 0.254746090937665, "learning_rate": 1.9213232583563144e-07, "loss": 1.553, "step": 4111 }, { "epoch": 0.6201643918256542, "grad_norm": 0.25297452968194123, "learning_rate": 1.920332143373016e-07, "loss": 1.6361, "step": 4112 }, { "epoch": 0.6203152100143278, "grad_norm": 0.2642876174112166, "learning_rate": 1.9193412146743137e-07, "loss": 1.5536, "step": 4113 }, { "epoch": 0.6204660282030012, "grad_norm": 0.25329441823211885, "learning_rate": 1.9183504724827005e-07, "loss": 1.6001, "step": 4114 }, { "epoch": 0.6206168463916748, "grad_norm": 0.2529701432326738, "learning_rate": 1.9173599170206266e-07, "loss": 1.6273, "step": 4115 }, { "epoch": 0.6207676645803484, "grad_norm": 0.27508609656579774, "learning_rate": 1.9163695485105003e-07, "loss": 1.529, "step": 4116 }, { "epoch": 0.620918482769022, "grad_norm": 0.2468866471606738, "learning_rate": 1.915379367174688e-07, "loss": 1.583, "step": 4117 }, { "epoch": 0.6210693009576955, "grad_norm": 0.24609628520209548, "learning_rate": 1.9143893732355153e-07, "loss": 1.6009, "step": 4118 }, { "epoch": 0.6212201191463691, "grad_norm": 0.2514840187389571, "learning_rate": 1.9133995669152632e-07, "loss": 1.5842, "step": 4119 }, { "epoch": 0.6213709373350426, "grad_norm": 0.243764288526915, "learning_rate": 1.9124099484361738e-07, "loss": 1.4551, "step": 4120 }, { "epoch": 0.6215217555237161, "grad_norm": 0.24310431183917738, "learning_rate": 1.9114205180204436e-07, "loss": 1.5369, "step": 4121 }, { "epoch": 0.6216725737123897, "grad_norm": 0.24773421290054362, "learning_rate": 1.910431275890229e-07, "loss": 1.5442, "step": 4122 }, { "epoch": 0.6218233919010633, "grad_norm": 0.24121504981746866, "learning_rate": 1.909442222267645e-07, "loss": 1.5791, "step": 4123 }, { "epoch": 0.6219742100897369, "grad_norm": 0.9265375389096069, "learning_rate": 1.9084533573747607e-07, "loss": 1.5504, "step": 4124 }, { "epoch": 0.6221250282784104, "grad_norm": 0.24661482815201358, "learning_rate": 1.907464681433607e-07, "loss": 1.5427, "step": 4125 }, { "epoch": 0.6222758464670839, "grad_norm": 0.25449076998397746, "learning_rate": 1.906476194666169e-07, "loss": 1.5549, "step": 4126 }, { "epoch": 0.6224266646557575, "grad_norm": 0.24234561801686147, "learning_rate": 1.9054878972943917e-07, "loss": 1.6136, "step": 4127 }, { "epoch": 0.622577482844431, "grad_norm": 0.26050689682241945, "learning_rate": 1.9044997895401764e-07, "loss": 1.6193, "step": 4128 }, { "epoch": 0.6227283010331046, "grad_norm": 0.2978038318937061, "learning_rate": 1.9035118716253824e-07, "loss": 1.5952, "step": 4129 }, { "epoch": 0.6228791192217782, "grad_norm": 0.31268001378147847, "learning_rate": 1.9025241437718247e-07, "loss": 1.6071, "step": 4130 }, { "epoch": 0.6230299374104517, "grad_norm": 0.2466418509727293, "learning_rate": 1.901536606201279e-07, "loss": 1.5854, "step": 4131 }, { "epoch": 0.6231807555991252, "grad_norm": 0.2617793842289526, "learning_rate": 1.900549259135475e-07, "loss": 1.5385, "step": 4132 }, { "epoch": 0.6233315737877988, "grad_norm": 0.24814544058042998, "learning_rate": 1.8995621027961005e-07, "loss": 1.5733, "step": 4133 }, { "epoch": 0.6234823919764724, "grad_norm": 0.2599446957413963, "learning_rate": 1.898575137404802e-07, "loss": 1.52, "step": 4134 }, { "epoch": 0.623633210165146, "grad_norm": 0.2858657339108542, "learning_rate": 1.8975883631831812e-07, "loss": 1.6453, "step": 4135 }, { "epoch": 0.6237840283538195, "grad_norm": 0.33921126820227165, "learning_rate": 1.8966017803527977e-07, "loss": 1.5529, "step": 4136 }, { "epoch": 0.623934846542493, "grad_norm": 0.5693270764008028, "learning_rate": 1.8956153891351683e-07, "loss": 1.5517, "step": 4137 }, { "epoch": 0.6240856647311666, "grad_norm": 0.24994044412573277, "learning_rate": 1.894629189751767e-07, "loss": 1.5789, "step": 4138 }, { "epoch": 0.6242364829198401, "grad_norm": 0.24992411867556763, "learning_rate": 1.893643182424023e-07, "loss": 1.5707, "step": 4139 }, { "epoch": 0.6243873011085137, "grad_norm": 0.36508795915231895, "learning_rate": 1.8926573673733243e-07, "loss": 1.5648, "step": 4140 }, { "epoch": 0.6245381192971873, "grad_norm": 0.27187533818843984, "learning_rate": 1.8916717448210157e-07, "loss": 1.5407, "step": 4141 }, { "epoch": 0.6246889374858607, "grad_norm": 0.24719777663478043, "learning_rate": 1.8906863149883972e-07, "loss": 1.5622, "step": 4142 }, { "epoch": 0.6248397556745343, "grad_norm": 0.3215843836818221, "learning_rate": 1.8897010780967275e-07, "loss": 1.5543, "step": 4143 }, { "epoch": 0.6249905738632079, "grad_norm": 0.25889412312839055, "learning_rate": 1.8887160343672205e-07, "loss": 1.5937, "step": 4144 }, { "epoch": 0.6251413920518815, "grad_norm": 0.2486312762659672, "learning_rate": 1.8877311840210464e-07, "loss": 1.5016, "step": 4145 }, { "epoch": 0.625292210240555, "grad_norm": 0.25252796679476264, "learning_rate": 1.8867465272793341e-07, "loss": 1.5647, "step": 4146 }, { "epoch": 0.6254430284292286, "grad_norm": 0.26325625302217504, "learning_rate": 1.8857620643631672e-07, "loss": 1.5759, "step": 4147 }, { "epoch": 0.6255938466179021, "grad_norm": 0.2794617366637707, "learning_rate": 1.8847777954935856e-07, "loss": 1.6256, "step": 4148 }, { "epoch": 0.6257446648065756, "grad_norm": 0.25124112050953085, "learning_rate": 1.8837937208915876e-07, "loss": 1.6167, "step": 4149 }, { "epoch": 0.6258954829952492, "grad_norm": 0.2403652724633617, "learning_rate": 1.8828098407781256e-07, "loss": 1.5463, "step": 4150 }, { "epoch": 0.6260463011839228, "grad_norm": 0.24589197110547903, "learning_rate": 1.8818261553741106e-07, "loss": 1.522, "step": 4151 }, { "epoch": 0.6261971193725964, "grad_norm": 0.2414084395099176, "learning_rate": 1.8808426649004066e-07, "loss": 1.576, "step": 4152 }, { "epoch": 0.6263479375612699, "grad_norm": 0.2621010820780024, "learning_rate": 1.8798593695778376e-07, "loss": 1.4603, "step": 4153 }, { "epoch": 0.6264987557499434, "grad_norm": 0.2530761046314033, "learning_rate": 1.8788762696271815e-07, "loss": 1.5216, "step": 4154 }, { "epoch": 0.626649573938617, "grad_norm": 0.2630464111986687, "learning_rate": 1.8778933652691724e-07, "loss": 1.537, "step": 4155 }, { "epoch": 0.6268003921272906, "grad_norm": 0.24619072070004963, "learning_rate": 1.8769106567245012e-07, "loss": 1.5224, "step": 4156 }, { "epoch": 0.6269512103159641, "grad_norm": 0.2701082424355122, "learning_rate": 1.8759281442138152e-07, "loss": 1.6291, "step": 4157 }, { "epoch": 0.6271020285046377, "grad_norm": 0.23715739982926645, "learning_rate": 1.8749458279577162e-07, "loss": 1.5534, "step": 4158 }, { "epoch": 0.6272528466933112, "grad_norm": 0.2503082038843583, "learning_rate": 1.8739637081767638e-07, "loss": 1.5226, "step": 4159 }, { "epoch": 0.6274036648819847, "grad_norm": 0.25121712160535314, "learning_rate": 1.8729817850914713e-07, "loss": 1.5512, "step": 4160 }, { "epoch": 0.6275544830706583, "grad_norm": 0.2490038140958148, "learning_rate": 1.87200005892231e-07, "loss": 1.5235, "step": 4161 }, { "epoch": 0.6277053012593319, "grad_norm": 0.26388132410554616, "learning_rate": 1.8710185298897057e-07, "loss": 1.5502, "step": 4162 }, { "epoch": 0.6278561194480055, "grad_norm": 0.24615806276388288, "learning_rate": 1.8700371982140407e-07, "loss": 1.5226, "step": 4163 }, { "epoch": 0.628006937636679, "grad_norm": 0.6160809143870036, "learning_rate": 1.869056064115651e-07, "loss": 1.5449, "step": 4164 }, { "epoch": 0.6281577558253525, "grad_norm": 0.3681537783717758, "learning_rate": 1.8680751278148315e-07, "loss": 1.5616, "step": 4165 }, { "epoch": 0.6283085740140261, "grad_norm": 0.2539030267347039, "learning_rate": 1.8670943895318297e-07, "loss": 1.5492, "step": 4166 }, { "epoch": 0.6284593922026996, "grad_norm": 0.24935666844681972, "learning_rate": 1.8661138494868513e-07, "loss": 1.5975, "step": 4167 }, { "epoch": 0.6286102103913732, "grad_norm": 0.2388405326692446, "learning_rate": 1.8651335079000547e-07, "loss": 1.5921, "step": 4168 }, { "epoch": 0.6287610285800468, "grad_norm": 0.24942525861218032, "learning_rate": 1.8641533649915558e-07, "loss": 1.5575, "step": 4169 }, { "epoch": 0.6289118467687204, "grad_norm": 0.281992690958799, "learning_rate": 1.8631734209814254e-07, "loss": 1.5531, "step": 4170 }, { "epoch": 0.6290626649573938, "grad_norm": 0.4165446083848909, "learning_rate": 1.862193676089689e-07, "loss": 1.523, "step": 4171 }, { "epoch": 0.6292134831460674, "grad_norm": 0.270594878229444, "learning_rate": 1.8612141305363282e-07, "loss": 1.4721, "step": 4172 }, { "epoch": 0.629364301334741, "grad_norm": 0.2636488038229057, "learning_rate": 1.8602347845412796e-07, "loss": 1.5209, "step": 4173 }, { "epoch": 0.6295151195234145, "grad_norm": 0.26570010948929257, "learning_rate": 1.859255638324434e-07, "loss": 1.5453, "step": 4174 }, { "epoch": 0.6296659377120881, "grad_norm": 0.30017922416708503, "learning_rate": 1.8582766921056403e-07, "loss": 1.5877, "step": 4175 }, { "epoch": 0.6298167559007616, "grad_norm": 0.24676668094362056, "learning_rate": 1.8572979461046978e-07, "loss": 1.5554, "step": 4176 }, { "epoch": 0.6299675740894352, "grad_norm": 0.23806938270752506, "learning_rate": 1.8563194005413652e-07, "loss": 1.4845, "step": 4177 }, { "epoch": 0.6301183922781087, "grad_norm": 0.24248553109147253, "learning_rate": 1.8553410556353537e-07, "loss": 1.5381, "step": 4178 }, { "epoch": 0.6302692104667823, "grad_norm": 0.2440376958567314, "learning_rate": 1.854362911606331e-07, "loss": 1.5795, "step": 4179 }, { "epoch": 0.6304200286554559, "grad_norm": 0.24707297107098733, "learning_rate": 1.853384968673919e-07, "loss": 1.5483, "step": 4180 }, { "epoch": 0.6305708468441295, "grad_norm": 0.2701950141920131, "learning_rate": 1.852407227057693e-07, "loss": 1.5881, "step": 4181 }, { "epoch": 0.6307216650328029, "grad_norm": 0.25399214732994313, "learning_rate": 1.8514296869771856e-07, "loss": 1.6224, "step": 4182 }, { "epoch": 0.6308724832214765, "grad_norm": 0.25343116507938845, "learning_rate": 1.8504523486518825e-07, "loss": 1.5684, "step": 4183 }, { "epoch": 0.6310233014101501, "grad_norm": 0.25533834657361526, "learning_rate": 1.8494752123012247e-07, "loss": 1.5733, "step": 4184 }, { "epoch": 0.6311741195988236, "grad_norm": 0.41570541171889785, "learning_rate": 1.8484982781446085e-07, "loss": 1.5857, "step": 4185 }, { "epoch": 0.6313249377874972, "grad_norm": 0.25628694395084733, "learning_rate": 1.8475215464013826e-07, "loss": 1.5359, "step": 4186 }, { "epoch": 0.6314757559761707, "grad_norm": 1.398194438395537, "learning_rate": 1.8465450172908535e-07, "loss": 1.547, "step": 4187 }, { "epoch": 0.6316265741648442, "grad_norm": 0.2669251497151142, "learning_rate": 1.8455686910322794e-07, "loss": 1.5542, "step": 4188 }, { "epoch": 0.6317773923535178, "grad_norm": 0.2680685107882268, "learning_rate": 1.8445925678448737e-07, "loss": 1.5726, "step": 4189 }, { "epoch": 0.6319282105421914, "grad_norm": 0.2598035544709078, "learning_rate": 1.8436166479478054e-07, "loss": 1.5905, "step": 4190 }, { "epoch": 0.632079028730865, "grad_norm": 0.3312860252153446, "learning_rate": 1.8426409315601964e-07, "loss": 1.5328, "step": 4191 }, { "epoch": 0.6322298469195385, "grad_norm": 0.24867557124552955, "learning_rate": 1.8416654189011231e-07, "loss": 1.5652, "step": 4192 }, { "epoch": 0.632380665108212, "grad_norm": 0.2539702365191936, "learning_rate": 1.8406901101896172e-07, "loss": 1.5334, "step": 4193 }, { "epoch": 0.6325314832968856, "grad_norm": 0.25319783760576814, "learning_rate": 1.8397150056446637e-07, "loss": 1.5103, "step": 4194 }, { "epoch": 0.6326823014855592, "grad_norm": 0.2725195248103415, "learning_rate": 1.8387401054852015e-07, "loss": 1.5734, "step": 4195 }, { "epoch": 0.6328331196742327, "grad_norm": 0.2389547759149391, "learning_rate": 1.8377654099301242e-07, "loss": 1.5676, "step": 4196 }, { "epoch": 0.6329839378629063, "grad_norm": 0.24308762508937254, "learning_rate": 1.83679091919828e-07, "loss": 1.531, "step": 4197 }, { "epoch": 0.6331347560515799, "grad_norm": 0.23951624981640487, "learning_rate": 1.83581663350847e-07, "loss": 1.4717, "step": 4198 }, { "epoch": 0.6332855742402533, "grad_norm": 0.24921256044384987, "learning_rate": 1.8348425530794497e-07, "loss": 1.4919, "step": 4199 }, { "epoch": 0.6334363924289269, "grad_norm": 0.24498417312046017, "learning_rate": 1.8338686781299283e-07, "loss": 1.5877, "step": 4200 }, { "epoch": 0.6335872106176005, "grad_norm": 0.25287524883896284, "learning_rate": 1.83289500887857e-07, "loss": 1.5059, "step": 4201 }, { "epoch": 0.6337380288062741, "grad_norm": 0.28046838176968375, "learning_rate": 1.8319215455439907e-07, "loss": 1.5184, "step": 4202 }, { "epoch": 0.6338888469949476, "grad_norm": 0.24926218979066564, "learning_rate": 1.8309482883447624e-07, "loss": 1.5998, "step": 4203 }, { "epoch": 0.6340396651836211, "grad_norm": 0.26904621255126415, "learning_rate": 1.8299752374994087e-07, "loss": 1.4524, "step": 4204 }, { "epoch": 0.6341904833722947, "grad_norm": 0.2516115836972916, "learning_rate": 1.829002393226408e-07, "loss": 1.5444, "step": 4205 }, { "epoch": 0.6343413015609682, "grad_norm": 0.25427584168397666, "learning_rate": 1.8280297557441933e-07, "loss": 1.5456, "step": 4206 }, { "epoch": 0.6344921197496418, "grad_norm": 0.2607497963600299, "learning_rate": 1.827057325271148e-07, "loss": 1.5775, "step": 4207 }, { "epoch": 0.6346429379383154, "grad_norm": 0.2713007671030094, "learning_rate": 1.8260851020256135e-07, "loss": 1.5586, "step": 4208 }, { "epoch": 0.634793756126989, "grad_norm": 0.24027781792831884, "learning_rate": 1.82511308622588e-07, "loss": 1.5769, "step": 4209 }, { "epoch": 0.6349445743156624, "grad_norm": 0.24382415193812368, "learning_rate": 1.8241412780901942e-07, "loss": 1.5387, "step": 4210 }, { "epoch": 0.635095392504336, "grad_norm": 0.2487821871282781, "learning_rate": 1.8231696778367561e-07, "loss": 1.6078, "step": 4211 }, { "epoch": 0.6352462106930096, "grad_norm": 0.26504042617639684, "learning_rate": 1.8221982856837177e-07, "loss": 1.6453, "step": 4212 }, { "epoch": 0.6353970288816831, "grad_norm": 0.2900287076542723, "learning_rate": 1.8212271018491836e-07, "loss": 1.5267, "step": 4213 }, { "epoch": 0.6355478470703567, "grad_norm": 0.2729493662176146, "learning_rate": 1.8202561265512155e-07, "loss": 1.5371, "step": 4214 }, { "epoch": 0.6356986652590303, "grad_norm": 0.5427101588791572, "learning_rate": 1.8192853600078234e-07, "loss": 1.5208, "step": 4215 }, { "epoch": 0.6358494834477038, "grad_norm": 0.2438775766503901, "learning_rate": 1.8183148024369736e-07, "loss": 1.5903, "step": 4216 }, { "epoch": 0.6360003016363773, "grad_norm": 0.25030235583666205, "learning_rate": 1.817344454056584e-07, "loss": 1.6066, "step": 4217 }, { "epoch": 0.6361511198250509, "grad_norm": 0.24859572445405265, "learning_rate": 1.8163743150845263e-07, "loss": 1.5843, "step": 4218 }, { "epoch": 0.6363019380137245, "grad_norm": 0.2847060495976388, "learning_rate": 1.815404385738626e-07, "loss": 1.5628, "step": 4219 }, { "epoch": 0.636452756202398, "grad_norm": 0.2545438298708443, "learning_rate": 1.8144346662366582e-07, "loss": 1.4878, "step": 4220 }, { "epoch": 0.6366035743910715, "grad_norm": 0.3434970471598928, "learning_rate": 1.8134651567963555e-07, "loss": 1.6083, "step": 4221 }, { "epoch": 0.6367543925797451, "grad_norm": 0.24761430835041962, "learning_rate": 1.812495857635399e-07, "loss": 1.5511, "step": 4222 }, { "epoch": 0.6369052107684187, "grad_norm": 0.25743559630787793, "learning_rate": 1.8115267689714265e-07, "loss": 1.5761, "step": 4223 }, { "epoch": 0.6370560289570922, "grad_norm": 0.24586976247005674, "learning_rate": 1.810557891022026e-07, "loss": 1.5507, "step": 4224 }, { "epoch": 0.6372068471457658, "grad_norm": 0.24560702102994042, "learning_rate": 1.8095892240047382e-07, "loss": 1.5253, "step": 4225 }, { "epoch": 0.6373576653344394, "grad_norm": 0.35455214235373617, "learning_rate": 1.8086207681370562e-07, "loss": 1.5711, "step": 4226 }, { "epoch": 0.6375084835231128, "grad_norm": 0.2694202111370069, "learning_rate": 1.8076525236364292e-07, "loss": 1.4951, "step": 4227 }, { "epoch": 0.6376593017117864, "grad_norm": 0.2935064832001404, "learning_rate": 1.806684490720255e-07, "loss": 1.58, "step": 4228 }, { "epoch": 0.63781011990046, "grad_norm": 0.25526045148889653, "learning_rate": 1.8057166696058834e-07, "loss": 1.4672, "step": 4229 }, { "epoch": 0.6379609380891336, "grad_norm": 0.2488160762655106, "learning_rate": 1.804749060510622e-07, "loss": 1.536, "step": 4230 }, { "epoch": 0.6381117562778071, "grad_norm": 0.2489581523780075, "learning_rate": 1.803781663651724e-07, "loss": 1.6017, "step": 4231 }, { "epoch": 0.6382625744664806, "grad_norm": 0.3321037514342219, "learning_rate": 1.8028144792463996e-07, "loss": 1.6216, "step": 4232 }, { "epoch": 0.6384133926551542, "grad_norm": 0.24786754870364802, "learning_rate": 1.80184750751181e-07, "loss": 1.6027, "step": 4233 }, { "epoch": 0.6385642108438278, "grad_norm": 0.25282574867184987, "learning_rate": 1.8008807486650679e-07, "loss": 1.5709, "step": 4234 }, { "epoch": 0.6387150290325013, "grad_norm": 0.25331586615858, "learning_rate": 1.7999142029232388e-07, "loss": 1.5655, "step": 4235 }, { "epoch": 0.6388658472211749, "grad_norm": 0.2552052135322833, "learning_rate": 1.7989478705033417e-07, "loss": 1.6102, "step": 4236 }, { "epoch": 0.6390166654098485, "grad_norm": 0.2521213219961141, "learning_rate": 1.7979817516223456e-07, "loss": 1.5544, "step": 4237 }, { "epoch": 0.6391674835985219, "grad_norm": 0.24411593375663035, "learning_rate": 1.7970158464971713e-07, "loss": 1.538, "step": 4238 }, { "epoch": 0.6393183017871955, "grad_norm": 0.25975553327168827, "learning_rate": 1.7960501553446927e-07, "loss": 1.5823, "step": 4239 }, { "epoch": 0.6394691199758691, "grad_norm": 0.41432267503092424, "learning_rate": 1.7950846783817375e-07, "loss": 1.5641, "step": 4240 }, { "epoch": 0.6396199381645427, "grad_norm": 0.2521228127440944, "learning_rate": 1.7941194158250823e-07, "loss": 1.586, "step": 4241 }, { "epoch": 0.6397707563532162, "grad_norm": 0.2808400435101359, "learning_rate": 1.793154367891455e-07, "loss": 1.5899, "step": 4242 }, { "epoch": 0.6399215745418898, "grad_norm": 0.24672901445972867, "learning_rate": 1.79218953479754e-07, "loss": 1.5811, "step": 4243 }, { "epoch": 0.6400723927305633, "grad_norm": 0.2664023244654792, "learning_rate": 1.7912249167599692e-07, "loss": 1.596, "step": 4244 }, { "epoch": 0.6402232109192368, "grad_norm": 0.24914972989068035, "learning_rate": 1.7902605139953258e-07, "loss": 1.5428, "step": 4245 }, { "epoch": 0.6403740291079104, "grad_norm": 0.30448809907106317, "learning_rate": 1.789296326720149e-07, "loss": 1.5993, "step": 4246 }, { "epoch": 0.640524847296584, "grad_norm": 0.2400644326933849, "learning_rate": 1.788332355150925e-07, "loss": 1.5132, "step": 4247 }, { "epoch": 0.6406756654852576, "grad_norm": 0.29287295103545796, "learning_rate": 1.7873685995040939e-07, "loss": 1.5137, "step": 4248 }, { "epoch": 0.640826483673931, "grad_norm": 0.24555063234062374, "learning_rate": 1.7864050599960467e-07, "loss": 1.5487, "step": 4249 }, { "epoch": 0.6409773018626046, "grad_norm": 0.3538453607491625, "learning_rate": 1.785441736843128e-07, "loss": 1.5447, "step": 4250 }, { "epoch": 0.6411281200512782, "grad_norm": 0.26030436621563, "learning_rate": 1.7844786302616299e-07, "loss": 1.501, "step": 4251 }, { "epoch": 0.6412789382399517, "grad_norm": 0.26606952015233887, "learning_rate": 1.7835157404677975e-07, "loss": 1.531, "step": 4252 }, { "epoch": 0.6414297564286253, "grad_norm": 0.2442741412802352, "learning_rate": 1.7825530676778298e-07, "loss": 1.538, "step": 4253 }, { "epoch": 0.6415805746172989, "grad_norm": 0.2766729345972109, "learning_rate": 1.7815906121078734e-07, "loss": 1.5777, "step": 4254 }, { "epoch": 0.6417313928059724, "grad_norm": 0.2633113523056404, "learning_rate": 1.7806283739740275e-07, "loss": 1.6583, "step": 4255 }, { "epoch": 0.6418822109946459, "grad_norm": 0.26819538082082267, "learning_rate": 1.7796663534923417e-07, "loss": 1.4589, "step": 4256 }, { "epoch": 0.6420330291833195, "grad_norm": 0.24879651699590694, "learning_rate": 1.7787045508788208e-07, "loss": 1.4937, "step": 4257 }, { "epoch": 0.6421838473719931, "grad_norm": 0.3620468432092983, "learning_rate": 1.7777429663494149e-07, "loss": 1.5409, "step": 4258 }, { "epoch": 0.6423346655606667, "grad_norm": 0.24575768302139991, "learning_rate": 1.7767816001200265e-07, "loss": 1.5894, "step": 4259 }, { "epoch": 0.6424854837493402, "grad_norm": 0.2455419739713735, "learning_rate": 1.775820452406514e-07, "loss": 1.5973, "step": 4260 }, { "epoch": 0.6426363019380137, "grad_norm": 0.44783851720614026, "learning_rate": 1.7748595234246812e-07, "loss": 1.5683, "step": 4261 }, { "epoch": 0.6427871201266873, "grad_norm": 0.27759236136718557, "learning_rate": 1.7738988133902832e-07, "loss": 1.5398, "step": 4262 }, { "epoch": 0.6429379383153608, "grad_norm": 0.264656048146391, "learning_rate": 1.7729383225190302e-07, "loss": 1.5681, "step": 4263 }, { "epoch": 0.6430887565040344, "grad_norm": 0.39913174201778384, "learning_rate": 1.771978051026579e-07, "loss": 1.5774, "step": 4264 }, { "epoch": 0.643239574692708, "grad_norm": 0.2412047861650146, "learning_rate": 1.7710179991285384e-07, "loss": 1.6106, "step": 4265 }, { "epoch": 0.6433903928813814, "grad_norm": 0.27653888375962127, "learning_rate": 1.7700581670404668e-07, "loss": 1.5251, "step": 4266 }, { "epoch": 0.643541211070055, "grad_norm": 0.2482221340611367, "learning_rate": 1.769098554977877e-07, "loss": 1.571, "step": 4267 }, { "epoch": 0.6436920292587286, "grad_norm": 0.2536941588689048, "learning_rate": 1.7681391631562287e-07, "loss": 1.6051, "step": 4268 }, { "epoch": 0.6438428474474022, "grad_norm": 0.2612095300387124, "learning_rate": 1.7671799917909324e-07, "loss": 1.5267, "step": 4269 }, { "epoch": 0.6439936656360757, "grad_norm": 0.544029058097784, "learning_rate": 1.7662210410973512e-07, "loss": 1.6163, "step": 4270 }, { "epoch": 0.6441444838247493, "grad_norm": 0.30183016903165544, "learning_rate": 1.7652623112907971e-07, "loss": 1.5697, "step": 4271 }, { "epoch": 0.6442953020134228, "grad_norm": 0.26179017106545627, "learning_rate": 1.7643038025865326e-07, "loss": 1.6243, "step": 4272 }, { "epoch": 0.6444461202020964, "grad_norm": 0.2512366290362913, "learning_rate": 1.7633455151997718e-07, "loss": 1.5723, "step": 4273 }, { "epoch": 0.6445969383907699, "grad_norm": 0.2833266668155499, "learning_rate": 1.762387449345677e-07, "loss": 1.6155, "step": 4274 }, { "epoch": 0.6447477565794435, "grad_norm": 0.25350106304489006, "learning_rate": 1.7614296052393618e-07, "loss": 1.5511, "step": 4275 }, { "epoch": 0.6448985747681171, "grad_norm": 0.25067693443556277, "learning_rate": 1.7604719830958913e-07, "loss": 1.566, "step": 4276 }, { "epoch": 0.6450493929567905, "grad_norm": 0.44205510669995635, "learning_rate": 1.759514583130279e-07, "loss": 1.5355, "step": 4277 }, { "epoch": 0.6452002111454641, "grad_norm": 0.25698267380120476, "learning_rate": 1.7585574055574882e-07, "loss": 1.5815, "step": 4278 }, { "epoch": 0.6453510293341377, "grad_norm": 0.2577964193893113, "learning_rate": 1.7576004505924335e-07, "loss": 1.5543, "step": 4279 }, { "epoch": 0.6455018475228113, "grad_norm": 0.693098392046214, "learning_rate": 1.7566437184499806e-07, "loss": 1.6158, "step": 4280 }, { "epoch": 0.6456526657114848, "grad_norm": 0.2592870202788428, "learning_rate": 1.755687209344942e-07, "loss": 1.5987, "step": 4281 }, { "epoch": 0.6458034839001584, "grad_norm": 0.23694233652541807, "learning_rate": 1.7547309234920827e-07, "loss": 1.5027, "step": 4282 }, { "epoch": 0.6459543020888319, "grad_norm": 0.39543519488811646, "learning_rate": 1.7537748611061166e-07, "loss": 1.5788, "step": 4283 }, { "epoch": 0.6461051202775054, "grad_norm": 0.24855769967495914, "learning_rate": 1.752819022401707e-07, "loss": 1.4575, "step": 4284 }, { "epoch": 0.646255938466179, "grad_norm": 0.243320883100979, "learning_rate": 1.751863407593468e-07, "loss": 1.5473, "step": 4285 }, { "epoch": 0.6464067566548526, "grad_norm": 0.2542812940612368, "learning_rate": 1.7509080168959634e-07, "loss": 1.5126, "step": 4286 }, { "epoch": 0.6465575748435262, "grad_norm": 0.2728851797285914, "learning_rate": 1.7499528505237058e-07, "loss": 1.5611, "step": 4287 }, { "epoch": 0.6467083930321997, "grad_norm": 0.24571581123573824, "learning_rate": 1.7489979086911572e-07, "loss": 1.5596, "step": 4288 }, { "epoch": 0.6468592112208732, "grad_norm": 0.25357135141351167, "learning_rate": 1.7480431916127314e-07, "loss": 1.5041, "step": 4289 }, { "epoch": 0.6470100294095468, "grad_norm": 0.25122414364095647, "learning_rate": 1.7470886995027889e-07, "loss": 1.5632, "step": 4290 }, { "epoch": 0.6471608475982203, "grad_norm": 0.2504427056630961, "learning_rate": 1.7461344325756416e-07, "loss": 1.5337, "step": 4291 }, { "epoch": 0.6473116657868939, "grad_norm": 0.2536601641407676, "learning_rate": 1.7451803910455493e-07, "loss": 1.4983, "step": 4292 }, { "epoch": 0.6474624839755675, "grad_norm": 0.3148369931095937, "learning_rate": 1.7442265751267233e-07, "loss": 1.504, "step": 4293 }, { "epoch": 0.647613302164241, "grad_norm": 0.2528144702491195, "learning_rate": 1.7432729850333227e-07, "loss": 1.5981, "step": 4294 }, { "epoch": 0.6477641203529145, "grad_norm": 0.24553333308016317, "learning_rate": 1.7423196209794553e-07, "loss": 1.6014, "step": 4295 }, { "epoch": 0.6479149385415881, "grad_norm": 0.2522726324420554, "learning_rate": 1.7413664831791808e-07, "loss": 1.5271, "step": 4296 }, { "epoch": 0.6480657567302617, "grad_norm": 0.24628252707932746, "learning_rate": 1.7404135718465052e-07, "loss": 1.5046, "step": 4297 }, { "epoch": 0.6482165749189353, "grad_norm": 0.25665776928582656, "learning_rate": 1.7394608871953846e-07, "loss": 1.5659, "step": 4298 }, { "epoch": 0.6483673931076088, "grad_norm": 0.2546847179612797, "learning_rate": 1.7385084294397255e-07, "loss": 1.5515, "step": 4299 }, { "epoch": 0.6485182112962823, "grad_norm": 0.39990758357897654, "learning_rate": 1.7375561987933823e-07, "loss": 1.551, "step": 4300 }, { "epoch": 0.6486690294849559, "grad_norm": 0.2391955745190497, "learning_rate": 1.7366041954701572e-07, "loss": 1.5303, "step": 4301 }, { "epoch": 0.6488198476736294, "grad_norm": 0.25178143314721985, "learning_rate": 1.7356524196838038e-07, "loss": 1.5213, "step": 4302 }, { "epoch": 0.648970665862303, "grad_norm": 0.3185469911409178, "learning_rate": 1.7347008716480239e-07, "loss": 1.5558, "step": 4303 }, { "epoch": 0.6491214840509766, "grad_norm": 0.26510807044531254, "learning_rate": 1.7337495515764667e-07, "loss": 1.6366, "step": 4304 }, { "epoch": 0.6492723022396502, "grad_norm": 0.24514148823357573, "learning_rate": 1.732798459682731e-07, "loss": 1.4934, "step": 4305 }, { "epoch": 0.6494231204283236, "grad_norm": 0.2644675615376364, "learning_rate": 1.7318475961803663e-07, "loss": 1.5726, "step": 4306 }, { "epoch": 0.6495739386169972, "grad_norm": 0.27290084390136327, "learning_rate": 1.7308969612828677e-07, "loss": 1.5, "step": 4307 }, { "epoch": 0.6497247568056708, "grad_norm": 0.40749283237853495, "learning_rate": 1.7299465552036803e-07, "loss": 1.5564, "step": 4308 }, { "epoch": 0.6498755749943443, "grad_norm": 0.24428773072085164, "learning_rate": 1.728996378156199e-07, "loss": 1.5298, "step": 4309 }, { "epoch": 0.6500263931830179, "grad_norm": 0.2644942551318615, "learning_rate": 1.7280464303537662e-07, "loss": 1.5577, "step": 4310 }, { "epoch": 0.6501772113716914, "grad_norm": 0.24525025933050998, "learning_rate": 1.7270967120096713e-07, "loss": 1.5385, "step": 4311 }, { "epoch": 0.650328029560365, "grad_norm": 0.23709008992363353, "learning_rate": 1.7261472233371553e-07, "loss": 1.5163, "step": 4312 }, { "epoch": 0.6504788477490385, "grad_norm": 1.4112663261911442, "learning_rate": 1.7251979645494064e-07, "loss": 1.5637, "step": 4313 }, { "epoch": 0.6506296659377121, "grad_norm": 0.256267568489337, "learning_rate": 1.7242489358595587e-07, "loss": 1.6153, "step": 4314 }, { "epoch": 0.6507804841263857, "grad_norm": 0.2660453656221649, "learning_rate": 1.7233001374806988e-07, "loss": 1.6332, "step": 4315 }, { "epoch": 0.6509313023150592, "grad_norm": 0.46434640586819476, "learning_rate": 1.722351569625859e-07, "loss": 1.6546, "step": 4316 }, { "epoch": 0.6510821205037327, "grad_norm": 0.24542522675788647, "learning_rate": 1.7214032325080202e-07, "loss": 1.5376, "step": 4317 }, { "epoch": 0.6512329386924063, "grad_norm": 0.3163164998179516, "learning_rate": 1.7204551263401108e-07, "loss": 1.5534, "step": 4318 }, { "epoch": 0.6513837568810799, "grad_norm": 0.24978001107360961, "learning_rate": 1.71950725133501e-07, "loss": 1.5299, "step": 4319 }, { "epoch": 0.6515345750697534, "grad_norm": 0.5574009388465192, "learning_rate": 1.7185596077055425e-07, "loss": 1.6008, "step": 4320 }, { "epoch": 0.651685393258427, "grad_norm": 0.25102482710261503, "learning_rate": 1.7176121956644812e-07, "loss": 1.525, "step": 4321 }, { "epoch": 0.6518362114471005, "grad_norm": 0.2556993576030097, "learning_rate": 1.7166650154245492e-07, "loss": 1.5854, "step": 4322 }, { "epoch": 0.651987029635774, "grad_norm": 0.7906151359618411, "learning_rate": 1.7157180671984156e-07, "loss": 1.5772, "step": 4323 }, { "epoch": 0.6521378478244476, "grad_norm": 0.25693223933049275, "learning_rate": 1.7147713511986967e-07, "loss": 1.5045, "step": 4324 }, { "epoch": 0.6522886660131212, "grad_norm": 0.24547260528732132, "learning_rate": 1.713824867637959e-07, "loss": 1.6148, "step": 4325 }, { "epoch": 0.6524394842017948, "grad_norm": 0.24976059592113195, "learning_rate": 1.7128786167287158e-07, "loss": 1.5302, "step": 4326 }, { "epoch": 0.6525903023904683, "grad_norm": 0.256866786888526, "learning_rate": 1.711932598683426e-07, "loss": 1.5909, "step": 4327 }, { "epoch": 0.6527411205791418, "grad_norm": 0.2536906065453723, "learning_rate": 1.710986813714501e-07, "loss": 1.5116, "step": 4328 }, { "epoch": 0.6528919387678154, "grad_norm": 0.2674631416210052, "learning_rate": 1.7100412620342957e-07, "loss": 1.5351, "step": 4329 }, { "epoch": 0.653042756956489, "grad_norm": 0.25525115253926994, "learning_rate": 1.709095943855114e-07, "loss": 1.6358, "step": 4330 }, { "epoch": 0.6531935751451625, "grad_norm": 1.4586056729239687, "learning_rate": 1.7081508593892064e-07, "loss": 1.4906, "step": 4331 }, { "epoch": 0.6533443933338361, "grad_norm": 0.2703883011500383, "learning_rate": 1.7072060088487734e-07, "loss": 1.5704, "step": 4332 }, { "epoch": 0.6534952115225097, "grad_norm": 0.29758567824060134, "learning_rate": 1.7062613924459613e-07, "loss": 1.6118, "step": 4333 }, { "epoch": 0.6536460297111831, "grad_norm": 0.3897839427934797, "learning_rate": 1.7053170103928627e-07, "loss": 1.6127, "step": 4334 }, { "epoch": 0.6537968478998567, "grad_norm": 0.25338403351835703, "learning_rate": 1.7043728629015207e-07, "loss": 1.6034, "step": 4335 }, { "epoch": 0.6539476660885303, "grad_norm": 0.24361921837387265, "learning_rate": 1.7034289501839234e-07, "loss": 1.5631, "step": 4336 }, { "epoch": 0.6540984842772039, "grad_norm": 0.23947647600801733, "learning_rate": 1.702485272452005e-07, "loss": 1.4588, "step": 4337 }, { "epoch": 0.6542493024658774, "grad_norm": 0.25680951317289713, "learning_rate": 1.7015418299176507e-07, "loss": 1.5945, "step": 4338 }, { "epoch": 0.6544001206545509, "grad_norm": 0.25021590941728317, "learning_rate": 1.7005986227926902e-07, "loss": 1.5184, "step": 4339 }, { "epoch": 0.6545509388432245, "grad_norm": 0.2856791616026467, "learning_rate": 1.6996556512889e-07, "loss": 1.4966, "step": 4340 }, { "epoch": 0.654701757031898, "grad_norm": 0.27081040180416743, "learning_rate": 1.6987129156180063e-07, "loss": 1.4834, "step": 4341 }, { "epoch": 0.6548525752205716, "grad_norm": 0.25935752153647074, "learning_rate": 1.6977704159916801e-07, "loss": 1.5971, "step": 4342 }, { "epoch": 0.6550033934092452, "grad_norm": 0.24553600502404302, "learning_rate": 1.6968281526215393e-07, "loss": 1.5911, "step": 4343 }, { "epoch": 0.6551542115979188, "grad_norm": 0.2558518932840195, "learning_rate": 1.6958861257191497e-07, "loss": 1.5687, "step": 4344 }, { "epoch": 0.6553050297865922, "grad_norm": 0.2589811311294846, "learning_rate": 1.6949443354960247e-07, "loss": 1.5707, "step": 4345 }, { "epoch": 0.6554558479752658, "grad_norm": 0.24884607307303297, "learning_rate": 1.6940027821636228e-07, "loss": 1.5615, "step": 4346 }, { "epoch": 0.6556066661639394, "grad_norm": 0.2647618922190364, "learning_rate": 1.6930614659333498e-07, "loss": 1.5405, "step": 4347 }, { "epoch": 0.6557574843526129, "grad_norm": 0.2623678103868431, "learning_rate": 1.69212038701656e-07, "loss": 1.539, "step": 4348 }, { "epoch": 0.6559083025412865, "grad_norm": 0.25316236256720226, "learning_rate": 1.691179545624552e-07, "loss": 1.5687, "step": 4349 }, { "epoch": 0.6560591207299601, "grad_norm": 0.25834994094591196, "learning_rate": 1.6902389419685716e-07, "loss": 1.5376, "step": 4350 }, { "epoch": 0.6562099389186336, "grad_norm": 0.2547110661971648, "learning_rate": 1.6892985762598133e-07, "loss": 1.5755, "step": 4351 }, { "epoch": 0.6563607571073071, "grad_norm": 0.3476854271744261, "learning_rate": 1.6883584487094154e-07, "loss": 1.5526, "step": 4352 }, { "epoch": 0.6565115752959807, "grad_norm": 0.2509408636464211, "learning_rate": 1.687418559528463e-07, "loss": 1.4926, "step": 4353 }, { "epoch": 0.6566623934846543, "grad_norm": 0.24533886050655487, "learning_rate": 1.6864789089279913e-07, "loss": 1.5253, "step": 4354 }, { "epoch": 0.6568132116733278, "grad_norm": 0.2696381455151316, "learning_rate": 1.6855394971189778e-07, "loss": 1.5332, "step": 4355 }, { "epoch": 0.6569640298620013, "grad_norm": 0.24429394764520096, "learning_rate": 1.6846003243123472e-07, "loss": 1.5204, "step": 4356 }, { "epoch": 0.6571148480506749, "grad_norm": 0.2594783788516782, "learning_rate": 1.6836613907189712e-07, "loss": 1.557, "step": 4357 }, { "epoch": 0.6572656662393485, "grad_norm": 0.2717059267374578, "learning_rate": 1.6827226965496686e-07, "loss": 1.5728, "step": 4358 }, { "epoch": 0.657416484428022, "grad_norm": 0.24778293229119935, "learning_rate": 1.6817842420152035e-07, "loss": 1.5528, "step": 4359 }, { "epoch": 0.6575673026166956, "grad_norm": 0.2553800911389374, "learning_rate": 1.6808460273262847e-07, "loss": 1.5626, "step": 4360 }, { "epoch": 0.6577181208053692, "grad_norm": 0.27268535161135665, "learning_rate": 1.6799080526935713e-07, "loss": 1.5631, "step": 4361 }, { "epoch": 0.6578689389940426, "grad_norm": 0.2548385837394829, "learning_rate": 1.6789703183276647e-07, "loss": 1.5719, "step": 4362 }, { "epoch": 0.6580197571827162, "grad_norm": 0.26768870655081467, "learning_rate": 1.6780328244391123e-07, "loss": 1.544, "step": 4363 }, { "epoch": 0.6581705753713898, "grad_norm": 0.23771140979603766, "learning_rate": 1.6770955712384105e-07, "loss": 1.5666, "step": 4364 }, { "epoch": 0.6583213935600634, "grad_norm": 0.2412574229422365, "learning_rate": 1.6761585589359997e-07, "loss": 1.5838, "step": 4365 }, { "epoch": 0.6584722117487369, "grad_norm": 0.26441662184861997, "learning_rate": 1.6752217877422665e-07, "loss": 1.537, "step": 4366 }, { "epoch": 0.6586230299374104, "grad_norm": 0.24615839066654882, "learning_rate": 1.6742852578675415e-07, "loss": 1.5528, "step": 4367 }, { "epoch": 0.658773848126084, "grad_norm": 0.24312429921517675, "learning_rate": 1.6733489695221054e-07, "loss": 1.4885, "step": 4368 }, { "epoch": 0.6589246663147575, "grad_norm": 0.25986900550707687, "learning_rate": 1.6724129229161815e-07, "loss": 1.5759, "step": 4369 }, { "epoch": 0.6590754845034311, "grad_norm": 0.2617579223876186, "learning_rate": 1.671477118259938e-07, "loss": 1.5496, "step": 4370 }, { "epoch": 0.6592263026921047, "grad_norm": 0.2621628735602423, "learning_rate": 1.6705415557634926e-07, "loss": 1.4728, "step": 4371 }, { "epoch": 0.6593771208807783, "grad_norm": 0.25976189558002166, "learning_rate": 1.6696062356369052e-07, "loss": 1.6121, "step": 4372 }, { "epoch": 0.6595279390694517, "grad_norm": 0.25191569383887047, "learning_rate": 1.6686711580901816e-07, "loss": 1.5294, "step": 4373 }, { "epoch": 0.6596787572581253, "grad_norm": 1.6830369004072032, "learning_rate": 1.6677363233332753e-07, "loss": 1.5476, "step": 4374 }, { "epoch": 0.6598295754467989, "grad_norm": 0.24972879055668815, "learning_rate": 1.6668017315760837e-07, "loss": 1.5387, "step": 4375 }, { "epoch": 0.6599803936354725, "grad_norm": 0.2529044244935057, "learning_rate": 1.665867383028449e-07, "loss": 1.5556, "step": 4376 }, { "epoch": 0.660131211824146, "grad_norm": 0.24237828487076207, "learning_rate": 1.6649332779001606e-07, "loss": 1.5094, "step": 4377 }, { "epoch": 0.6602820300128196, "grad_norm": 0.2516651522330598, "learning_rate": 1.663999416400952e-07, "loss": 1.5412, "step": 4378 }, { "epoch": 0.6604328482014931, "grad_norm": 0.2472022069889934, "learning_rate": 1.6630657987405027e-07, "loss": 1.5465, "step": 4379 }, { "epoch": 0.6605836663901666, "grad_norm": 0.42366911674571833, "learning_rate": 1.6621324251284355e-07, "loss": 1.5475, "step": 4380 }, { "epoch": 0.6607344845788402, "grad_norm": 0.24492696411215548, "learning_rate": 1.6611992957743214e-07, "loss": 1.5285, "step": 4381 }, { "epoch": 0.6608853027675138, "grad_norm": 0.2567745155121796, "learning_rate": 1.6602664108876757e-07, "loss": 1.4723, "step": 4382 }, { "epoch": 0.6610361209561874, "grad_norm": 0.2482074703553842, "learning_rate": 1.659333770677956e-07, "loss": 1.6355, "step": 4383 }, { "epoch": 0.6611869391448608, "grad_norm": 0.24325576127420598, "learning_rate": 1.658401375354569e-07, "loss": 1.5417, "step": 4384 }, { "epoch": 0.6613377573335344, "grad_norm": 0.24726061523357257, "learning_rate": 1.6574692251268645e-07, "loss": 1.5517, "step": 4385 }, { "epoch": 0.661488575522208, "grad_norm": 0.24787383673380384, "learning_rate": 1.6565373202041364e-07, "loss": 1.5424, "step": 4386 }, { "epoch": 0.6616393937108815, "grad_norm": 0.2546839322501766, "learning_rate": 1.6556056607956254e-07, "loss": 1.5875, "step": 4387 }, { "epoch": 0.6617902118995551, "grad_norm": 0.2530227398838417, "learning_rate": 1.654674247110516e-07, "loss": 1.5844, "step": 4388 }, { "epoch": 0.6619410300882287, "grad_norm": 0.2472382108263328, "learning_rate": 1.6537430793579367e-07, "loss": 1.558, "step": 4389 }, { "epoch": 0.6620918482769022, "grad_norm": 0.2467916743913005, "learning_rate": 1.6528121577469633e-07, "loss": 1.6053, "step": 4390 }, { "epoch": 0.6622426664655757, "grad_norm": 0.2806261435847341, "learning_rate": 1.6518814824866144e-07, "loss": 1.5899, "step": 4391 }, { "epoch": 0.6623934846542493, "grad_norm": 0.2902261400426562, "learning_rate": 1.6509510537858537e-07, "loss": 1.5363, "step": 4392 }, { "epoch": 0.6625443028429229, "grad_norm": 0.2708674758772053, "learning_rate": 1.650020871853588e-07, "loss": 1.5422, "step": 4393 }, { "epoch": 0.6626951210315964, "grad_norm": 0.24276755358204255, "learning_rate": 1.649090936898672e-07, "loss": 1.5363, "step": 4394 }, { "epoch": 0.66284593922027, "grad_norm": 0.24804347418554512, "learning_rate": 1.6481612491299036e-07, "loss": 1.5663, "step": 4395 }, { "epoch": 0.6629967574089435, "grad_norm": 0.25096976238548924, "learning_rate": 1.647231808756022e-07, "loss": 1.5343, "step": 4396 }, { "epoch": 0.6631475755976171, "grad_norm": 0.24496843643929367, "learning_rate": 1.6463026159857172e-07, "loss": 1.5362, "step": 4397 }, { "epoch": 0.6632983937862906, "grad_norm": 0.25238739350256006, "learning_rate": 1.645373671027618e-07, "loss": 1.6324, "step": 4398 }, { "epoch": 0.6634492119749642, "grad_norm": 0.25179695590369966, "learning_rate": 1.6444449740902987e-07, "loss": 1.5586, "step": 4399 }, { "epoch": 0.6636000301636378, "grad_norm": 0.24998595205338636, "learning_rate": 1.6435165253822813e-07, "loss": 1.5956, "step": 4400 }, { "epoch": 0.6637508483523112, "grad_norm": 0.2984166708920905, "learning_rate": 1.6425883251120285e-07, "loss": 1.5744, "step": 4401 }, { "epoch": 0.6639016665409848, "grad_norm": 0.24872797484434014, "learning_rate": 1.6416603734879469e-07, "loss": 1.5456, "step": 4402 }, { "epoch": 0.6640524847296584, "grad_norm": 0.31371587804843853, "learning_rate": 1.6407326707183907e-07, "loss": 1.5451, "step": 4403 }, { "epoch": 0.664203302918332, "grad_norm": 0.25024145507022, "learning_rate": 1.6398052170116556e-07, "loss": 1.5041, "step": 4404 }, { "epoch": 0.6643541211070055, "grad_norm": 0.252617873762467, "learning_rate": 1.6388780125759816e-07, "loss": 1.5515, "step": 4405 }, { "epoch": 0.6645049392956791, "grad_norm": 0.2579091788972358, "learning_rate": 1.6379510576195524e-07, "loss": 1.549, "step": 4406 }, { "epoch": 0.6646557574843526, "grad_norm": 0.2406197129149719, "learning_rate": 1.6370243523504977e-07, "loss": 1.5341, "step": 4407 }, { "epoch": 0.6648065756730261, "grad_norm": 0.2543884142132946, "learning_rate": 1.6360978969768902e-07, "loss": 1.5269, "step": 4408 }, { "epoch": 0.6649573938616997, "grad_norm": 0.2531155660170818, "learning_rate": 1.6351716917067438e-07, "loss": 1.604, "step": 4409 }, { "epoch": 0.6651082120503733, "grad_norm": 0.25200196374912637, "learning_rate": 1.6342457367480207e-07, "loss": 1.5982, "step": 4410 }, { "epoch": 0.6652590302390469, "grad_norm": 0.2594260257387867, "learning_rate": 1.6333200323086244e-07, "loss": 1.5579, "step": 4411 }, { "epoch": 0.6654098484277203, "grad_norm": 0.2555082936930443, "learning_rate": 1.632394578596401e-07, "loss": 1.4944, "step": 4412 }, { "epoch": 0.6655606666163939, "grad_norm": 0.26174160023642556, "learning_rate": 1.6314693758191442e-07, "loss": 1.5242, "step": 4413 }, { "epoch": 0.6657114848050675, "grad_norm": 0.24320823578852807, "learning_rate": 1.6305444241845875e-07, "loss": 1.6164, "step": 4414 }, { "epoch": 0.665862302993741, "grad_norm": 0.23972350200655893, "learning_rate": 1.629619723900409e-07, "loss": 1.5389, "step": 4415 }, { "epoch": 0.6660131211824146, "grad_norm": 0.24819399239616022, "learning_rate": 1.628695275174232e-07, "loss": 1.539, "step": 4416 }, { "epoch": 0.6661639393710882, "grad_norm": 0.2738680504735075, "learning_rate": 1.6277710782136223e-07, "loss": 1.5313, "step": 4417 }, { "epoch": 0.6663147575597617, "grad_norm": 0.29736483013703435, "learning_rate": 1.6268471332260882e-07, "loss": 1.5923, "step": 4418 }, { "epoch": 0.6664655757484352, "grad_norm": 0.2691164164594476, "learning_rate": 1.6259234404190815e-07, "loss": 1.5661, "step": 4419 }, { "epoch": 0.6666163939371088, "grad_norm": 0.24407951769351988, "learning_rate": 1.6250000000000006e-07, "loss": 1.5182, "step": 4420 }, { "epoch": 0.6667672121257824, "grad_norm": 0.3036247360046378, "learning_rate": 1.6240768121761826e-07, "loss": 1.5337, "step": 4421 }, { "epoch": 0.666918030314456, "grad_norm": 0.2681579943835327, "learning_rate": 1.6231538771549104e-07, "loss": 1.5705, "step": 4422 }, { "epoch": 0.6670688485031295, "grad_norm": 0.25816631047547445, "learning_rate": 1.6222311951434108e-07, "loss": 1.563, "step": 4423 }, { "epoch": 0.667219666691803, "grad_norm": 0.25013242748167597, "learning_rate": 1.6213087663488522e-07, "loss": 1.5577, "step": 4424 }, { "epoch": 0.6673704848804766, "grad_norm": 0.2455927102819475, "learning_rate": 1.620386590978346e-07, "loss": 1.536, "step": 4425 }, { "epoch": 0.6675213030691501, "grad_norm": 0.24801865500769413, "learning_rate": 1.6194646692389484e-07, "loss": 1.4987, "step": 4426 }, { "epoch": 0.6676721212578237, "grad_norm": 0.24911440013001487, "learning_rate": 1.6185430013376582e-07, "loss": 1.5705, "step": 4427 }, { "epoch": 0.6678229394464973, "grad_norm": 0.2628882747468611, "learning_rate": 1.6176215874814146e-07, "loss": 1.5372, "step": 4428 }, { "epoch": 0.6679737576351708, "grad_norm": 0.2595634659919045, "learning_rate": 1.6167004278771037e-07, "loss": 1.5658, "step": 4429 }, { "epoch": 0.6681245758238443, "grad_norm": 0.29775413439795473, "learning_rate": 1.6157795227315524e-07, "loss": 1.5422, "step": 4430 }, { "epoch": 0.6682753940125179, "grad_norm": 0.2690934522371737, "learning_rate": 1.6148588722515304e-07, "loss": 1.5789, "step": 4431 }, { "epoch": 0.6684262122011915, "grad_norm": 0.25994584944910876, "learning_rate": 1.6139384766437497e-07, "loss": 1.5574, "step": 4432 }, { "epoch": 0.668577030389865, "grad_norm": 0.49385878510095477, "learning_rate": 1.6130183361148672e-07, "loss": 1.6128, "step": 4433 }, { "epoch": 0.6687278485785386, "grad_norm": 0.26612645215918884, "learning_rate": 1.6120984508714814e-07, "loss": 1.588, "step": 4434 }, { "epoch": 0.6688786667672121, "grad_norm": 0.2466698877353191, "learning_rate": 1.6111788211201317e-07, "loss": 1.5513, "step": 4435 }, { "epoch": 0.6690294849558857, "grad_norm": 0.3837798363225113, "learning_rate": 1.6102594470673031e-07, "loss": 1.5866, "step": 4436 }, { "epoch": 0.6691803031445592, "grad_norm": 0.24367217671826055, "learning_rate": 1.6093403289194225e-07, "loss": 1.5122, "step": 4437 }, { "epoch": 0.6693311213332328, "grad_norm": 0.24662484557225653, "learning_rate": 1.608421466882856e-07, "loss": 1.5657, "step": 4438 }, { "epoch": 0.6694819395219064, "grad_norm": 0.24653542481933985, "learning_rate": 1.607502861163918e-07, "loss": 1.55, "step": 4439 }, { "epoch": 0.66963275771058, "grad_norm": 0.26100371071233147, "learning_rate": 1.606584511968861e-07, "loss": 1.5384, "step": 4440 }, { "epoch": 0.6697835758992534, "grad_norm": 0.2510751090311021, "learning_rate": 1.6056664195038797e-07, "loss": 1.5858, "step": 4441 }, { "epoch": 0.669934394087927, "grad_norm": 0.25432632198219307, "learning_rate": 1.6047485839751148e-07, "loss": 1.5679, "step": 4442 }, { "epoch": 0.6700852122766006, "grad_norm": 0.2613752320293682, "learning_rate": 1.6038310055886467e-07, "loss": 1.6188, "step": 4443 }, { "epoch": 0.6702360304652741, "grad_norm": 0.25383575216431087, "learning_rate": 1.6029136845504974e-07, "loss": 1.5303, "step": 4444 }, { "epoch": 0.6703868486539477, "grad_norm": 0.28151295313508107, "learning_rate": 1.601996621066632e-07, "loss": 1.5927, "step": 4445 }, { "epoch": 0.6705376668426212, "grad_norm": 0.26884315365474487, "learning_rate": 1.6010798153429597e-07, "loss": 1.5429, "step": 4446 }, { "epoch": 0.6706884850312947, "grad_norm": 0.25801191861820594, "learning_rate": 1.6001632675853295e-07, "loss": 1.627, "step": 4447 }, { "epoch": 0.6708393032199683, "grad_norm": 0.2493698982634603, "learning_rate": 1.5992469779995312e-07, "loss": 1.5584, "step": 4448 }, { "epoch": 0.6709901214086419, "grad_norm": 0.2488284571200544, "learning_rate": 1.5983309467913008e-07, "loss": 1.5873, "step": 4449 }, { "epoch": 0.6711409395973155, "grad_norm": 0.25185762106130766, "learning_rate": 1.597415174166314e-07, "loss": 1.6337, "step": 4450 }, { "epoch": 0.671291757785989, "grad_norm": 0.24107801326174988, "learning_rate": 1.5964996603301857e-07, "loss": 1.5663, "step": 4451 }, { "epoch": 0.6714425759746625, "grad_norm": 0.23335337442589293, "learning_rate": 1.5955844054884788e-07, "loss": 1.6478, "step": 4452 }, { "epoch": 0.6715933941633361, "grad_norm": 0.24953066525510237, "learning_rate": 1.594669409846693e-07, "loss": 1.6057, "step": 4453 }, { "epoch": 0.6717442123520097, "grad_norm": 0.2537367589132101, "learning_rate": 1.5937546736102708e-07, "loss": 1.6144, "step": 4454 }, { "epoch": 0.6718950305406832, "grad_norm": 0.26427627828933026, "learning_rate": 1.5928401969845977e-07, "loss": 1.593, "step": 4455 }, { "epoch": 0.6720458487293568, "grad_norm": 0.252361012839834, "learning_rate": 1.591925980175002e-07, "loss": 1.5249, "step": 4456 }, { "epoch": 0.6721966669180303, "grad_norm": 0.24555805311657805, "learning_rate": 1.5910120233867501e-07, "loss": 1.5458, "step": 4457 }, { "epoch": 0.6723474851067038, "grad_norm": 0.2606785058051602, "learning_rate": 1.5900983268250518e-07, "loss": 1.6235, "step": 4458 }, { "epoch": 0.6724983032953774, "grad_norm": 0.24333960694058768, "learning_rate": 1.58918489069506e-07, "loss": 1.5034, "step": 4459 }, { "epoch": 0.672649121484051, "grad_norm": 0.305832013139834, "learning_rate": 1.5882717152018666e-07, "loss": 1.4752, "step": 4460 }, { "epoch": 0.6727999396727246, "grad_norm": 0.25273299194813803, "learning_rate": 1.5873588005505067e-07, "loss": 1.6172, "step": 4461 }, { "epoch": 0.6729507578613981, "grad_norm": 0.24424501947561303, "learning_rate": 1.586446146945955e-07, "loss": 1.6006, "step": 4462 }, { "epoch": 0.6731015760500716, "grad_norm": 0.24893274250145397, "learning_rate": 1.5855337545931308e-07, "loss": 1.5378, "step": 4463 }, { "epoch": 0.6732523942387452, "grad_norm": 2.34854981138626, "learning_rate": 1.5846216236968913e-07, "loss": 1.5606, "step": 4464 }, { "epoch": 0.6734032124274187, "grad_norm": 0.25952343083126733, "learning_rate": 1.5837097544620364e-07, "loss": 1.5542, "step": 4465 }, { "epoch": 0.6735540306160923, "grad_norm": 0.24166214284987536, "learning_rate": 1.5827981470933083e-07, "loss": 1.4965, "step": 4466 }, { "epoch": 0.6737048488047659, "grad_norm": 0.2623262912298193, "learning_rate": 1.581886801795389e-07, "loss": 1.5544, "step": 4467 }, { "epoch": 0.6738556669934395, "grad_norm": 0.2812617083911704, "learning_rate": 1.5809757187729013e-07, "loss": 1.574, "step": 4468 }, { "epoch": 0.6740064851821129, "grad_norm": 0.5291270399283328, "learning_rate": 1.580064898230411e-07, "loss": 1.5888, "step": 4469 }, { "epoch": 0.6741573033707865, "grad_norm": 0.40802473392185296, "learning_rate": 1.5791543403724233e-07, "loss": 1.6316, "step": 4470 }, { "epoch": 0.6743081215594601, "grad_norm": 0.2843350286941599, "learning_rate": 1.5782440454033847e-07, "loss": 1.5192, "step": 4471 }, { "epoch": 0.6744589397481336, "grad_norm": 0.41370144878011966, "learning_rate": 1.5773340135276836e-07, "loss": 1.5708, "step": 4472 }, { "epoch": 0.6746097579368072, "grad_norm": 0.2621786014549776, "learning_rate": 1.576424244949648e-07, "loss": 1.6857, "step": 4473 }, { "epoch": 0.6747605761254807, "grad_norm": 0.2539712941682984, "learning_rate": 1.575514739873548e-07, "loss": 1.5274, "step": 4474 }, { "epoch": 0.6749113943141543, "grad_norm": 0.262397966494765, "learning_rate": 1.5746054985035929e-07, "loss": 1.5127, "step": 4475 }, { "epoch": 0.6750622125028278, "grad_norm": 0.2988519798684835, "learning_rate": 1.5736965210439355e-07, "loss": 1.5159, "step": 4476 }, { "epoch": 0.6752130306915014, "grad_norm": 0.2625430928739007, "learning_rate": 1.5727878076986667e-07, "loss": 1.5121, "step": 4477 }, { "epoch": 0.675363848880175, "grad_norm": 0.2693026704994772, "learning_rate": 1.5718793586718187e-07, "loss": 1.5429, "step": 4478 }, { "epoch": 0.6755146670688486, "grad_norm": 0.24741617489781206, "learning_rate": 1.5709711741673658e-07, "loss": 1.6317, "step": 4479 }, { "epoch": 0.675665485257522, "grad_norm": 0.24554929918868298, "learning_rate": 1.5700632543892216e-07, "loss": 1.5224, "step": 4480 }, { "epoch": 0.6758163034461956, "grad_norm": 0.2531089314030115, "learning_rate": 1.5691555995412397e-07, "loss": 1.5566, "step": 4481 }, { "epoch": 0.6759671216348692, "grad_norm": 0.270813179835107, "learning_rate": 1.5682482098272166e-07, "loss": 1.5137, "step": 4482 }, { "epoch": 0.6761179398235427, "grad_norm": 0.2531790176228573, "learning_rate": 1.5673410854508862e-07, "loss": 1.5078, "step": 4483 }, { "epoch": 0.6762687580122163, "grad_norm": 0.24704562757704143, "learning_rate": 1.5664342266159253e-07, "loss": 1.5327, "step": 4484 }, { "epoch": 0.6764195762008899, "grad_norm": 0.31259419416536566, "learning_rate": 1.5655276335259493e-07, "loss": 1.5217, "step": 4485 }, { "epoch": 0.6765703943895633, "grad_norm": 0.29975032453330397, "learning_rate": 1.5646213063845157e-07, "loss": 1.5935, "step": 4486 }, { "epoch": 0.6767212125782369, "grad_norm": 0.25402666044063815, "learning_rate": 1.563715245395121e-07, "loss": 1.4853, "step": 4487 }, { "epoch": 0.6768720307669105, "grad_norm": 0.8082627996462705, "learning_rate": 1.562809450761201e-07, "loss": 1.5272, "step": 4488 }, { "epoch": 0.6770228489555841, "grad_norm": 0.2522359601460765, "learning_rate": 1.5619039226861351e-07, "loss": 1.5272, "step": 4489 }, { "epoch": 0.6771736671442576, "grad_norm": 0.2432005732759633, "learning_rate": 1.5609986613732402e-07, "loss": 1.5519, "step": 4490 }, { "epoch": 0.6773244853329311, "grad_norm": 0.2530184344932088, "learning_rate": 1.5600936670257718e-07, "loss": 1.6198, "step": 4491 }, { "epoch": 0.6774753035216047, "grad_norm": 0.2382296864813892, "learning_rate": 1.5591889398469302e-07, "loss": 1.5021, "step": 4492 }, { "epoch": 0.6776261217102783, "grad_norm": 0.2503873698254462, "learning_rate": 1.558284480039852e-07, "loss": 1.5769, "step": 4493 }, { "epoch": 0.6777769398989518, "grad_norm": 0.6478262856588054, "learning_rate": 1.5573802878076136e-07, "loss": 1.5224, "step": 4494 }, { "epoch": 0.6779277580876254, "grad_norm": 0.25990690300475844, "learning_rate": 1.556476363353234e-07, "loss": 1.5723, "step": 4495 }, { "epoch": 0.678078576276299, "grad_norm": 0.2762708223326884, "learning_rate": 1.555572706879671e-07, "loss": 1.462, "step": 4496 }, { "epoch": 0.6782293944649724, "grad_norm": 0.2563697519669219, "learning_rate": 1.5546693185898203e-07, "loss": 1.5847, "step": 4497 }, { "epoch": 0.678380212653646, "grad_norm": 0.28862133225427156, "learning_rate": 1.5537661986865187e-07, "loss": 1.5384, "step": 4498 }, { "epoch": 0.6785310308423196, "grad_norm": 0.24185984802208632, "learning_rate": 1.5528633473725443e-07, "loss": 1.6157, "step": 4499 }, { "epoch": 0.6786818490309932, "grad_norm": 0.24390163793301547, "learning_rate": 1.5519607648506133e-07, "loss": 1.5652, "step": 4500 }, { "epoch": 0.6788326672196667, "grad_norm": 0.25433487014020545, "learning_rate": 1.55105845132338e-07, "loss": 1.5813, "step": 4501 }, { "epoch": 0.6789834854083402, "grad_norm": 0.24768666349044535, "learning_rate": 1.550156406993443e-07, "loss": 1.5257, "step": 4502 }, { "epoch": 0.6791343035970138, "grad_norm": 0.25434925804032016, "learning_rate": 1.5492546320633355e-07, "loss": 1.5431, "step": 4503 }, { "epoch": 0.6792851217856873, "grad_norm": 0.25432855324045334, "learning_rate": 1.5483531267355315e-07, "loss": 1.5667, "step": 4504 }, { "epoch": 0.6794359399743609, "grad_norm": 0.29078320294039717, "learning_rate": 1.5474518912124474e-07, "loss": 1.5261, "step": 4505 }, { "epoch": 0.6795867581630345, "grad_norm": 0.2534399553146919, "learning_rate": 1.546550925696436e-07, "loss": 1.5477, "step": 4506 }, { "epoch": 0.6797375763517081, "grad_norm": 0.26475011017631184, "learning_rate": 1.5456502303897893e-07, "loss": 1.4716, "step": 4507 }, { "epoch": 0.6798883945403815, "grad_norm": 0.25023873514497286, "learning_rate": 1.544749805494741e-07, "loss": 1.5263, "step": 4508 }, { "epoch": 0.6800392127290551, "grad_norm": 0.4316287456279205, "learning_rate": 1.5438496512134625e-07, "loss": 1.5983, "step": 4509 }, { "epoch": 0.6801900309177287, "grad_norm": 0.2489370624450571, "learning_rate": 1.5429497677480638e-07, "loss": 1.4934, "step": 4510 }, { "epoch": 0.6803408491064022, "grad_norm": 0.3669846777729833, "learning_rate": 1.5420501553005948e-07, "loss": 1.5831, "step": 4511 }, { "epoch": 0.6804916672950758, "grad_norm": 0.2648456804985748, "learning_rate": 1.541150814073046e-07, "loss": 1.6421, "step": 4512 }, { "epoch": 0.6806424854837494, "grad_norm": 0.26796561522886525, "learning_rate": 1.5402517442673454e-07, "loss": 1.5622, "step": 4513 }, { "epoch": 0.6807933036724229, "grad_norm": 0.2525029359985612, "learning_rate": 1.539352946085359e-07, "loss": 1.5529, "step": 4514 }, { "epoch": 0.6809441218610964, "grad_norm": 0.3438247994299546, "learning_rate": 1.538454419728895e-07, "loss": 1.5502, "step": 4515 }, { "epoch": 0.68109494004977, "grad_norm": 0.3565079090528708, "learning_rate": 1.5375561653996977e-07, "loss": 1.5665, "step": 4516 }, { "epoch": 0.6812457582384436, "grad_norm": 0.2502136419414264, "learning_rate": 1.5366581832994508e-07, "loss": 1.5328, "step": 4517 }, { "epoch": 0.6813965764271172, "grad_norm": 0.25637007238519177, "learning_rate": 1.535760473629779e-07, "loss": 1.5678, "step": 4518 }, { "epoch": 0.6815473946157906, "grad_norm": 0.2542868537875946, "learning_rate": 1.5348630365922432e-07, "loss": 1.5661, "step": 4519 }, { "epoch": 0.6816982128044642, "grad_norm": 0.24491684976074926, "learning_rate": 1.5339658723883443e-07, "loss": 1.5301, "step": 4520 }, { "epoch": 0.6818490309931378, "grad_norm": 0.2532871996773783, "learning_rate": 1.533068981219522e-07, "loss": 1.5442, "step": 4521 }, { "epoch": 0.6819998491818113, "grad_norm": 0.2969754695815701, "learning_rate": 1.5321723632871546e-07, "loss": 1.5914, "step": 4522 }, { "epoch": 0.6821506673704849, "grad_norm": 0.23776153267951033, "learning_rate": 1.5312760187925586e-07, "loss": 1.5446, "step": 4523 }, { "epoch": 0.6823014855591585, "grad_norm": 0.2567751502907507, "learning_rate": 1.5303799479369888e-07, "loss": 1.633, "step": 4524 }, { "epoch": 0.682452303747832, "grad_norm": 0.24803655120611262, "learning_rate": 1.5294841509216412e-07, "loss": 1.606, "step": 4525 }, { "epoch": 0.6826031219365055, "grad_norm": 0.2502764130264785, "learning_rate": 1.5285886279476462e-07, "loss": 1.5575, "step": 4526 }, { "epoch": 0.6827539401251791, "grad_norm": 0.2552962726477028, "learning_rate": 1.5276933792160752e-07, "loss": 1.5266, "step": 4527 }, { "epoch": 0.6829047583138527, "grad_norm": 0.2618369174454743, "learning_rate": 1.5267984049279389e-07, "loss": 1.5161, "step": 4528 }, { "epoch": 0.6830555765025262, "grad_norm": 0.2469464741877154, "learning_rate": 1.5259037052841844e-07, "loss": 1.5408, "step": 4529 }, { "epoch": 0.6832063946911997, "grad_norm": 0.252624797216419, "learning_rate": 1.525009280485697e-07, "loss": 1.5406, "step": 4530 }, { "epoch": 0.6833572128798733, "grad_norm": 0.25393176829144687, "learning_rate": 1.5241151307333022e-07, "loss": 1.5946, "step": 4531 }, { "epoch": 0.6835080310685469, "grad_norm": 0.26189789836181204, "learning_rate": 1.5232212562277625e-07, "loss": 1.6088, "step": 4532 }, { "epoch": 0.6836588492572204, "grad_norm": 0.2542125862503045, "learning_rate": 1.522327657169778e-07, "loss": 1.546, "step": 4533 }, { "epoch": 0.683809667445894, "grad_norm": 0.29066485291620314, "learning_rate": 1.5214343337599885e-07, "loss": 1.4975, "step": 4534 }, { "epoch": 0.6839604856345676, "grad_norm": 0.24255544535163215, "learning_rate": 1.5205412861989713e-07, "loss": 1.558, "step": 4535 }, { "epoch": 0.684111303823241, "grad_norm": 0.4483495544033751, "learning_rate": 1.5196485146872406e-07, "loss": 1.6128, "step": 4536 }, { "epoch": 0.6842621220119146, "grad_norm": 0.30525871600786114, "learning_rate": 1.5187560194252496e-07, "loss": 1.5494, "step": 4537 }, { "epoch": 0.6844129402005882, "grad_norm": 0.23836452613561476, "learning_rate": 1.517863800613391e-07, "loss": 1.5943, "step": 4538 }, { "epoch": 0.6845637583892618, "grad_norm": 0.2501342378997237, "learning_rate": 1.516971858451993e-07, "loss": 1.5817, "step": 4539 }, { "epoch": 0.6847145765779353, "grad_norm": 0.3276380893394134, "learning_rate": 1.5160801931413215e-07, "loss": 1.5141, "step": 4540 }, { "epoch": 0.6848653947666089, "grad_norm": 0.2615657297936482, "learning_rate": 1.5151888048815833e-07, "loss": 1.5306, "step": 4541 }, { "epoch": 0.6850162129552824, "grad_norm": 0.26231711553037756, "learning_rate": 1.5142976938729198e-07, "loss": 1.602, "step": 4542 }, { "epoch": 0.6851670311439559, "grad_norm": 0.28429576765124964, "learning_rate": 1.5134068603154113e-07, "loss": 1.5316, "step": 4543 }, { "epoch": 0.6853178493326295, "grad_norm": 0.31239091894016446, "learning_rate": 1.5125163044090767e-07, "loss": 1.5024, "step": 4544 }, { "epoch": 0.6854686675213031, "grad_norm": 0.24401945059379362, "learning_rate": 1.5116260263538713e-07, "loss": 1.5675, "step": 4545 }, { "epoch": 0.6856194857099767, "grad_norm": 0.27257654534470044, "learning_rate": 1.5107360263496878e-07, "loss": 1.6108, "step": 4546 }, { "epoch": 0.6857703038986501, "grad_norm": 0.2516205155581599, "learning_rate": 1.5098463045963584e-07, "loss": 1.5333, "step": 4547 }, { "epoch": 0.6859211220873237, "grad_norm": 0.23781179235887603, "learning_rate": 1.508956861293651e-07, "loss": 1.5339, "step": 4548 }, { "epoch": 0.6860719402759973, "grad_norm": 0.2520970638415164, "learning_rate": 1.508067696641271e-07, "loss": 1.5188, "step": 4549 }, { "epoch": 0.6862227584646708, "grad_norm": 0.24736223711973349, "learning_rate": 1.507178810838862e-07, "loss": 1.5573, "step": 4550 }, { "epoch": 0.6863735766533444, "grad_norm": 0.2552180571114534, "learning_rate": 1.5062902040860053e-07, "loss": 1.5815, "step": 4551 }, { "epoch": 0.686524394842018, "grad_norm": 0.25558268884153396, "learning_rate": 1.5054018765822185e-07, "loss": 1.5228, "step": 4552 }, { "epoch": 0.6866752130306915, "grad_norm": 0.26345469434362906, "learning_rate": 1.5045138285269566e-07, "loss": 1.5266, "step": 4553 }, { "epoch": 0.686826031219365, "grad_norm": 0.2730774921258875, "learning_rate": 1.5036260601196136e-07, "loss": 1.5572, "step": 4554 }, { "epoch": 0.6869768494080386, "grad_norm": 0.2652309412430363, "learning_rate": 1.5027385715595187e-07, "loss": 1.5395, "step": 4555 }, { "epoch": 0.6871276675967122, "grad_norm": 0.508357645050542, "learning_rate": 1.5018513630459372e-07, "loss": 1.5837, "step": 4556 }, { "epoch": 0.6872784857853858, "grad_norm": 0.2385243692438645, "learning_rate": 1.5009644347780765e-07, "loss": 1.5148, "step": 4557 }, { "epoch": 0.6874293039740593, "grad_norm": 0.25014088917331895, "learning_rate": 1.5000777869550753e-07, "loss": 1.5505, "step": 4558 }, { "epoch": 0.6875801221627328, "grad_norm": 0.3648206648953276, "learning_rate": 1.4991914197760124e-07, "loss": 1.5474, "step": 4559 }, { "epoch": 0.6877309403514064, "grad_norm": 0.24635446185295545, "learning_rate": 1.498305333439904e-07, "loss": 1.5118, "step": 4560 }, { "epoch": 0.6878817585400799, "grad_norm": 0.24884860378935486, "learning_rate": 1.4974195281457017e-07, "loss": 1.5844, "step": 4561 }, { "epoch": 0.6880325767287535, "grad_norm": 0.25932085760973167, "learning_rate": 1.4965340040922943e-07, "loss": 1.5296, "step": 4562 }, { "epoch": 0.6881833949174271, "grad_norm": 0.24962208314785847, "learning_rate": 1.4956487614785074e-07, "loss": 1.6032, "step": 4563 }, { "epoch": 0.6883342131061005, "grad_norm": 0.3078533657792895, "learning_rate": 1.4947638005031048e-07, "loss": 1.5403, "step": 4564 }, { "epoch": 0.6884850312947741, "grad_norm": 0.24532524033327652, "learning_rate": 1.4938791213647862e-07, "loss": 1.5853, "step": 4565 }, { "epoch": 0.6886358494834477, "grad_norm": 0.25105566983553357, "learning_rate": 1.4929947242621857e-07, "loss": 1.5751, "step": 4566 }, { "epoch": 0.6887866676721213, "grad_norm": 0.2600457386477789, "learning_rate": 1.4921106093938784e-07, "loss": 1.6206, "step": 4567 }, { "epoch": 0.6889374858607948, "grad_norm": 0.2549953407241225, "learning_rate": 1.4912267769583733e-07, "loss": 1.5523, "step": 4568 }, { "epoch": 0.6890883040494684, "grad_norm": 0.2686116638084607, "learning_rate": 1.4903432271541156e-07, "loss": 1.533, "step": 4569 }, { "epoch": 0.6892391222381419, "grad_norm": 0.25694000871757877, "learning_rate": 1.4894599601794895e-07, "loss": 1.558, "step": 4570 }, { "epoch": 0.6893899404268155, "grad_norm": 0.25707164553738177, "learning_rate": 1.488576976232813e-07, "loss": 1.5897, "step": 4571 }, { "epoch": 0.689540758615489, "grad_norm": 0.25449614050111674, "learning_rate": 1.4876942755123413e-07, "loss": 1.6029, "step": 4572 }, { "epoch": 0.6896915768041626, "grad_norm": 0.25345246247083175, "learning_rate": 1.4868118582162682e-07, "loss": 1.6331, "step": 4573 }, { "epoch": 0.6898423949928362, "grad_norm": 0.2548159998153135, "learning_rate": 1.4859297245427207e-07, "loss": 1.5195, "step": 4574 }, { "epoch": 0.6899932131815096, "grad_norm": 0.2506490687717234, "learning_rate": 1.4850478746897637e-07, "loss": 1.5914, "step": 4575 }, { "epoch": 0.6901440313701832, "grad_norm": 0.30149312313645693, "learning_rate": 1.4841663088553979e-07, "loss": 1.557, "step": 4576 }, { "epoch": 0.6902948495588568, "grad_norm": 0.28041762145112953, "learning_rate": 1.483285027237562e-07, "loss": 1.5974, "step": 4577 }, { "epoch": 0.6904456677475304, "grad_norm": 0.2530644352677591, "learning_rate": 1.482404030034128e-07, "loss": 1.4708, "step": 4578 }, { "epoch": 0.6905964859362039, "grad_norm": 0.26767673143594023, "learning_rate": 1.4815233174429047e-07, "loss": 1.5615, "step": 4579 }, { "epoch": 0.6907473041248775, "grad_norm": 0.27807131741426094, "learning_rate": 1.4806428896616396e-07, "loss": 1.6307, "step": 4580 }, { "epoch": 0.690898122313551, "grad_norm": 0.2559874783430606, "learning_rate": 1.4797627468880136e-07, "loss": 1.5253, "step": 4581 }, { "epoch": 0.6910489405022245, "grad_norm": 0.36174431325638784, "learning_rate": 1.4788828893196438e-07, "loss": 1.5906, "step": 4582 }, { "epoch": 0.6911997586908981, "grad_norm": 0.26659067323633806, "learning_rate": 1.4780033171540844e-07, "loss": 1.6182, "step": 4583 }, { "epoch": 0.6913505768795717, "grad_norm": 0.26001690653683646, "learning_rate": 1.477124030588826e-07, "loss": 1.5281, "step": 4584 }, { "epoch": 0.6915013950682453, "grad_norm": 0.2669507511015986, "learning_rate": 1.4762450298212924e-07, "loss": 1.6036, "step": 4585 }, { "epoch": 0.6916522132569188, "grad_norm": 0.27047938435631036, "learning_rate": 1.4753663150488443e-07, "loss": 1.5513, "step": 4586 }, { "epoch": 0.6918030314455923, "grad_norm": 0.24458341620251461, "learning_rate": 1.4744878864687815e-07, "loss": 1.5949, "step": 4587 }, { "epoch": 0.6919538496342659, "grad_norm": 0.332652850278529, "learning_rate": 1.4736097442783346e-07, "loss": 1.5386, "step": 4588 }, { "epoch": 0.6921046678229394, "grad_norm": 0.25264908157249905, "learning_rate": 1.4727318886746724e-07, "loss": 1.5539, "step": 4589 }, { "epoch": 0.692255486011613, "grad_norm": 0.24265317520106885, "learning_rate": 1.4718543198548996e-07, "loss": 1.5467, "step": 4590 }, { "epoch": 0.6924063042002866, "grad_norm": 0.23714676531560258, "learning_rate": 1.470977038016056e-07, "loss": 1.5504, "step": 4591 }, { "epoch": 0.6925571223889601, "grad_norm": 0.2573756180396614, "learning_rate": 1.4701000433551158e-07, "loss": 1.6083, "step": 4592 }, { "epoch": 0.6927079405776336, "grad_norm": 0.24096449503649983, "learning_rate": 1.4692233360689915e-07, "loss": 1.615, "step": 4593 }, { "epoch": 0.6928587587663072, "grad_norm": 0.24723077453611716, "learning_rate": 1.4683469163545287e-07, "loss": 1.5822, "step": 4594 }, { "epoch": 0.6930095769549808, "grad_norm": 0.2565972275948653, "learning_rate": 1.4674707844085082e-07, "loss": 1.5817, "step": 4595 }, { "epoch": 0.6931603951436544, "grad_norm": 0.25394017061617835, "learning_rate": 1.4665949404276485e-07, "loss": 1.6008, "step": 4596 }, { "epoch": 0.6933112133323279, "grad_norm": 0.24890272248525033, "learning_rate": 1.4657193846086018e-07, "loss": 1.5577, "step": 4597 }, { "epoch": 0.6934620315210014, "grad_norm": 0.25471972550028754, "learning_rate": 1.4648441171479558e-07, "loss": 1.487, "step": 4598 }, { "epoch": 0.693612849709675, "grad_norm": 0.24686979043513557, "learning_rate": 1.4639691382422324e-07, "loss": 1.5725, "step": 4599 }, { "epoch": 0.6937636678983485, "grad_norm": 0.3777801149755808, "learning_rate": 1.4630944480878916e-07, "loss": 1.5033, "step": 4600 }, { "epoch": 0.6939144860870221, "grad_norm": 0.24966962480364918, "learning_rate": 1.4622200468813257e-07, "loss": 1.5242, "step": 4601 }, { "epoch": 0.6940653042756957, "grad_norm": 0.24990942126298826, "learning_rate": 1.4613459348188635e-07, "loss": 1.601, "step": 4602 }, { "epoch": 0.6942161224643693, "grad_norm": 0.28309591473883483, "learning_rate": 1.460472112096769e-07, "loss": 1.5873, "step": 4603 }, { "epoch": 0.6943669406530427, "grad_norm": 0.240701977509555, "learning_rate": 1.4595985789112408e-07, "loss": 1.5156, "step": 4604 }, { "epoch": 0.6945177588417163, "grad_norm": 0.25553715074585126, "learning_rate": 1.458725335458411e-07, "loss": 1.5409, "step": 4605 }, { "epoch": 0.6946685770303899, "grad_norm": 0.26737231559406555, "learning_rate": 1.4578523819343508e-07, "loss": 1.4752, "step": 4606 }, { "epoch": 0.6948193952190634, "grad_norm": 0.24931512224638422, "learning_rate": 1.456979718535062e-07, "loss": 1.5891, "step": 4607 }, { "epoch": 0.694970213407737, "grad_norm": 0.24713254615524075, "learning_rate": 1.4561073454564826e-07, "loss": 1.5321, "step": 4608 }, { "epoch": 0.6951210315964105, "grad_norm": 0.24274431420453124, "learning_rate": 1.4552352628944876e-07, "loss": 1.5375, "step": 4609 }, { "epoch": 0.695271849785084, "grad_norm": 0.24327648509344132, "learning_rate": 1.454363471044883e-07, "loss": 1.5397, "step": 4610 }, { "epoch": 0.6954226679737576, "grad_norm": 0.42859160895656556, "learning_rate": 1.4534919701034128e-07, "loss": 1.6108, "step": 4611 }, { "epoch": 0.6955734861624312, "grad_norm": 0.2491065048758073, "learning_rate": 1.4526207602657532e-07, "loss": 1.5006, "step": 4612 }, { "epoch": 0.6957243043511048, "grad_norm": 0.2692117787960512, "learning_rate": 1.451749841727517e-07, "loss": 1.6145, "step": 4613 }, { "epoch": 0.6958751225397783, "grad_norm": 0.2536910675646695, "learning_rate": 1.4508792146842514e-07, "loss": 1.5425, "step": 4614 }, { "epoch": 0.6960259407284518, "grad_norm": 0.3287075484496632, "learning_rate": 1.4500088793314348e-07, "loss": 1.5928, "step": 4615 }, { "epoch": 0.6961767589171254, "grad_norm": 0.2407439602141349, "learning_rate": 1.4491388358644866e-07, "loss": 1.5594, "step": 4616 }, { "epoch": 0.696327577105799, "grad_norm": 0.26256589661588803, "learning_rate": 1.4482690844787543e-07, "loss": 1.5984, "step": 4617 }, { "epoch": 0.6964783952944725, "grad_norm": 0.32159675502824847, "learning_rate": 1.4473996253695224e-07, "loss": 1.5943, "step": 4618 }, { "epoch": 0.6966292134831461, "grad_norm": 0.24326918175647477, "learning_rate": 1.4465304587320115e-07, "loss": 1.5593, "step": 4619 }, { "epoch": 0.6967800316718196, "grad_norm": 0.2537403703578292, "learning_rate": 1.4456615847613738e-07, "loss": 1.5685, "step": 4620 }, { "epoch": 0.6969308498604931, "grad_norm": 0.3058001984394793, "learning_rate": 1.4447930036526963e-07, "loss": 1.5538, "step": 4621 }, { "epoch": 0.6970816680491667, "grad_norm": 0.26862945706499347, "learning_rate": 1.4439247156010014e-07, "loss": 1.6163, "step": 4622 }, { "epoch": 0.6972324862378403, "grad_norm": 0.2518795793224312, "learning_rate": 1.4430567208012458e-07, "loss": 1.6093, "step": 4623 }, { "epoch": 0.6973833044265139, "grad_norm": 0.24554849766932724, "learning_rate": 1.4421890194483184e-07, "loss": 1.5258, "step": 4624 }, { "epoch": 0.6975341226151874, "grad_norm": 0.2518564872582991, "learning_rate": 1.441321611737043e-07, "loss": 1.5754, "step": 4625 }, { "epoch": 0.6976849408038609, "grad_norm": 0.25178904071540303, "learning_rate": 1.4404544978621797e-07, "loss": 1.5468, "step": 4626 }, { "epoch": 0.6978357589925345, "grad_norm": 0.26285735665220644, "learning_rate": 1.4395876780184194e-07, "loss": 1.5592, "step": 4627 }, { "epoch": 0.697986577181208, "grad_norm": 0.24616375825149645, "learning_rate": 1.438721152400388e-07, "loss": 1.558, "step": 4628 }, { "epoch": 0.6981373953698816, "grad_norm": 0.27139617239702973, "learning_rate": 1.4378549212026475e-07, "loss": 1.5659, "step": 4629 }, { "epoch": 0.6982882135585552, "grad_norm": 0.26339115341264374, "learning_rate": 1.4369889846196914e-07, "loss": 1.617, "step": 4630 }, { "epoch": 0.6984390317472288, "grad_norm": 0.2499043349546843, "learning_rate": 1.4361233428459462e-07, "loss": 1.5837, "step": 4631 }, { "epoch": 0.6985898499359022, "grad_norm": 0.2678042961990252, "learning_rate": 1.4352579960757755e-07, "loss": 1.5559, "step": 4632 }, { "epoch": 0.6987406681245758, "grad_norm": 0.2550113462690431, "learning_rate": 1.4343929445034746e-07, "loss": 1.5235, "step": 4633 }, { "epoch": 0.6988914863132494, "grad_norm": 0.27815081080646575, "learning_rate": 1.4335281883232713e-07, "loss": 1.522, "step": 4634 }, { "epoch": 0.699042304501923, "grad_norm": 0.25392747519123826, "learning_rate": 1.4326637277293308e-07, "loss": 1.55, "step": 4635 }, { "epoch": 0.6991931226905965, "grad_norm": 0.24728466238713195, "learning_rate": 1.431799562915748e-07, "loss": 1.6582, "step": 4636 }, { "epoch": 0.69934394087927, "grad_norm": 0.23783857063910752, "learning_rate": 1.4309356940765538e-07, "loss": 1.523, "step": 4637 }, { "epoch": 0.6994947590679436, "grad_norm": 0.27091284652224773, "learning_rate": 1.430072121405711e-07, "loss": 1.5445, "step": 4638 }, { "epoch": 0.6996455772566171, "grad_norm": 0.2635842887417288, "learning_rate": 1.4292088450971178e-07, "loss": 1.5408, "step": 4639 }, { "epoch": 0.6997963954452907, "grad_norm": 0.25774520157377045, "learning_rate": 1.428345865344605e-07, "loss": 1.5434, "step": 4640 }, { "epoch": 0.6999472136339643, "grad_norm": 0.29453802949194896, "learning_rate": 1.427483182341936e-07, "loss": 1.5267, "step": 4641 }, { "epoch": 0.7000980318226379, "grad_norm": 0.24766186104982402, "learning_rate": 1.4266207962828082e-07, "loss": 1.5256, "step": 4642 }, { "epoch": 0.7002488500113113, "grad_norm": 0.26043474078314627, "learning_rate": 1.4257587073608531e-07, "loss": 1.6133, "step": 4643 }, { "epoch": 0.7003996681999849, "grad_norm": 0.2755242564839165, "learning_rate": 1.424896915769634e-07, "loss": 1.4794, "step": 4644 }, { "epoch": 0.7005504863886585, "grad_norm": 0.25911969816313285, "learning_rate": 1.4240354217026491e-07, "loss": 1.5828, "step": 4645 }, { "epoch": 0.700701304577332, "grad_norm": 0.25111418115042655, "learning_rate": 1.423174225353328e-07, "loss": 1.5815, "step": 4646 }, { "epoch": 0.7008521227660056, "grad_norm": 0.2484232812334775, "learning_rate": 1.4223133269150344e-07, "loss": 1.6199, "step": 4647 }, { "epoch": 0.7010029409546792, "grad_norm": 0.2466588001579759, "learning_rate": 1.421452726581066e-07, "loss": 1.6041, "step": 4648 }, { "epoch": 0.7011537591433527, "grad_norm": 0.2512178840310022, "learning_rate": 1.4205924245446522e-07, "loss": 1.5315, "step": 4649 }, { "epoch": 0.7013045773320262, "grad_norm": 0.25511669845907525, "learning_rate": 1.4197324209989557e-07, "loss": 1.5644, "step": 4650 }, { "epoch": 0.7014553955206998, "grad_norm": 0.2580678072784764, "learning_rate": 1.4188727161370717e-07, "loss": 1.5675, "step": 4651 }, { "epoch": 0.7016062137093734, "grad_norm": 0.24206855496981436, "learning_rate": 1.4180133101520302e-07, "loss": 1.5505, "step": 4652 }, { "epoch": 0.701757031898047, "grad_norm": 0.255627073851271, "learning_rate": 1.4171542032367922e-07, "loss": 1.5633, "step": 4653 }, { "epoch": 0.7019078500867204, "grad_norm": 0.24664312577496256, "learning_rate": 1.4162953955842516e-07, "loss": 1.5686, "step": 4654 }, { "epoch": 0.702058668275394, "grad_norm": 0.24303956624318634, "learning_rate": 1.4154368873872374e-07, "loss": 1.5745, "step": 4655 }, { "epoch": 0.7022094864640676, "grad_norm": 0.24684430634144772, "learning_rate": 1.4145786788385086e-07, "loss": 1.5441, "step": 4656 }, { "epoch": 0.7023603046527411, "grad_norm": 0.2917308163037401, "learning_rate": 1.4137207701307577e-07, "loss": 1.6054, "step": 4657 }, { "epoch": 0.7025111228414147, "grad_norm": 0.25098386031111397, "learning_rate": 1.412863161456611e-07, "loss": 1.5738, "step": 4658 }, { "epoch": 0.7026619410300883, "grad_norm": 0.2529208126338307, "learning_rate": 1.4120058530086269e-07, "loss": 1.5741, "step": 4659 }, { "epoch": 0.7028127592187617, "grad_norm": 0.24101650356689916, "learning_rate": 1.411148844979294e-07, "loss": 1.5409, "step": 4660 }, { "epoch": 0.7029635774074353, "grad_norm": 0.238650413776694, "learning_rate": 1.4102921375610382e-07, "loss": 1.5352, "step": 4661 }, { "epoch": 0.7031143955961089, "grad_norm": 0.2689066687683917, "learning_rate": 1.4094357309462134e-07, "loss": 1.5955, "step": 4662 }, { "epoch": 0.7032652137847825, "grad_norm": 0.2706659048411641, "learning_rate": 1.4085796253271093e-07, "loss": 1.562, "step": 4663 }, { "epoch": 0.703416031973456, "grad_norm": 0.23829039855906392, "learning_rate": 1.4077238208959445e-07, "loss": 1.5374, "step": 4664 }, { "epoch": 0.7035668501621295, "grad_norm": 0.26114976173614274, "learning_rate": 1.4068683178448747e-07, "loss": 1.5966, "step": 4665 }, { "epoch": 0.7037176683508031, "grad_norm": 0.25768827402905975, "learning_rate": 1.4060131163659838e-07, "loss": 1.5849, "step": 4666 }, { "epoch": 0.7038684865394766, "grad_norm": 0.4189391320885793, "learning_rate": 1.4051582166512894e-07, "loss": 1.5402, "step": 4667 }, { "epoch": 0.7040193047281502, "grad_norm": 0.2483133632121748, "learning_rate": 1.4043036188927406e-07, "loss": 1.6295, "step": 4668 }, { "epoch": 0.7041701229168238, "grad_norm": 0.2385855023535277, "learning_rate": 1.4034493232822214e-07, "loss": 1.4939, "step": 4669 }, { "epoch": 0.7043209411054974, "grad_norm": 0.26169111052317984, "learning_rate": 1.4025953300115452e-07, "loss": 1.5273, "step": 4670 }, { "epoch": 0.7044717592941708, "grad_norm": 0.2585477916676268, "learning_rate": 1.4017416392724567e-07, "loss": 1.5145, "step": 4671 }, { "epoch": 0.7046225774828444, "grad_norm": 0.26120948412477474, "learning_rate": 1.400888251256637e-07, "loss": 1.5265, "step": 4672 }, { "epoch": 0.704773395671518, "grad_norm": 0.24901225629669874, "learning_rate": 1.4000351661556953e-07, "loss": 1.6174, "step": 4673 }, { "epoch": 0.7049242138601916, "grad_norm": 0.24153572524708944, "learning_rate": 1.3991823841611734e-07, "loss": 1.5245, "step": 4674 }, { "epoch": 0.7050750320488651, "grad_norm": 0.5307711694492537, "learning_rate": 1.398329905464547e-07, "loss": 1.5671, "step": 4675 }, { "epoch": 0.7052258502375387, "grad_norm": 0.2579480575203586, "learning_rate": 1.397477730257222e-07, "loss": 1.5647, "step": 4676 }, { "epoch": 0.7053766684262122, "grad_norm": 0.2639878244090322, "learning_rate": 1.3966258587305356e-07, "loss": 1.5617, "step": 4677 }, { "epoch": 0.7055274866148857, "grad_norm": 0.24542207545138842, "learning_rate": 1.395774291075759e-07, "loss": 1.5895, "step": 4678 }, { "epoch": 0.7056783048035593, "grad_norm": 0.2591097894616853, "learning_rate": 1.394923027484094e-07, "loss": 1.5643, "step": 4679 }, { "epoch": 0.7058291229922329, "grad_norm": 0.24816792443713048, "learning_rate": 1.3940720681466732e-07, "loss": 1.6382, "step": 4680 }, { "epoch": 0.7059799411809065, "grad_norm": 0.24660088521437482, "learning_rate": 1.3932214132545617e-07, "loss": 1.528, "step": 4681 }, { "epoch": 0.7061307593695799, "grad_norm": 0.2550707894427213, "learning_rate": 1.392371062998757e-07, "loss": 1.5442, "step": 4682 }, { "epoch": 0.7062815775582535, "grad_norm": 0.25198459987988236, "learning_rate": 1.3915210175701875e-07, "loss": 1.563, "step": 4683 }, { "epoch": 0.7064323957469271, "grad_norm": 0.2737898364630557, "learning_rate": 1.390671277159712e-07, "loss": 1.5757, "step": 4684 }, { "epoch": 0.7065832139356006, "grad_norm": 0.25991857160488113, "learning_rate": 1.389821841958123e-07, "loss": 1.525, "step": 4685 }, { "epoch": 0.7067340321242742, "grad_norm": 0.26009297654677155, "learning_rate": 1.3889727121561436e-07, "loss": 1.5795, "step": 4686 }, { "epoch": 0.7068848503129478, "grad_norm": 0.2547506888702494, "learning_rate": 1.3881238879444273e-07, "loss": 1.5742, "step": 4687 }, { "epoch": 0.7070356685016213, "grad_norm": 0.24743037937256265, "learning_rate": 1.3872753695135607e-07, "loss": 1.5968, "step": 4688 }, { "epoch": 0.7071864866902948, "grad_norm": 0.24740696964137768, "learning_rate": 1.386427157054061e-07, "loss": 1.4845, "step": 4689 }, { "epoch": 0.7073373048789684, "grad_norm": 0.24466560698641626, "learning_rate": 1.385579250756375e-07, "loss": 1.5184, "step": 4690 }, { "epoch": 0.707488123067642, "grad_norm": 0.24718704325105698, "learning_rate": 1.3847316508108846e-07, "loss": 1.5223, "step": 4691 }, { "epoch": 0.7076389412563155, "grad_norm": 0.32800916216340986, "learning_rate": 1.3838843574078997e-07, "loss": 1.5204, "step": 4692 }, { "epoch": 0.7077897594449891, "grad_norm": 0.2868302931470831, "learning_rate": 1.383037370737662e-07, "loss": 1.5735, "step": 4693 }, { "epoch": 0.7079405776336626, "grad_norm": 0.2463838919775735, "learning_rate": 1.3821906909903444e-07, "loss": 1.5848, "step": 4694 }, { "epoch": 0.7080913958223362, "grad_norm": 0.24133956966287837, "learning_rate": 1.3813443183560524e-07, "loss": 1.6289, "step": 4695 }, { "epoch": 0.7082422140110097, "grad_norm": 0.24378465938684218, "learning_rate": 1.3804982530248205e-07, "loss": 1.4916, "step": 4696 }, { "epoch": 0.7083930321996833, "grad_norm": 0.24630088740305392, "learning_rate": 1.3796524951866142e-07, "loss": 1.5481, "step": 4697 }, { "epoch": 0.7085438503883569, "grad_norm": 0.2656040531784327, "learning_rate": 1.3788070450313328e-07, "loss": 1.5553, "step": 4698 }, { "epoch": 0.7086946685770303, "grad_norm": 0.2511559138227458, "learning_rate": 1.3779619027488027e-07, "loss": 1.5458, "step": 4699 }, { "epoch": 0.7088454867657039, "grad_norm": 0.25184229086367793, "learning_rate": 1.3771170685287835e-07, "loss": 1.5723, "step": 4700 }, { "epoch": 0.7089963049543775, "grad_norm": 0.2576597364739768, "learning_rate": 1.3762725425609656e-07, "loss": 1.441, "step": 4701 }, { "epoch": 0.7091471231430511, "grad_norm": 0.2554761744844474, "learning_rate": 1.375428325034969e-07, "loss": 1.5775, "step": 4702 }, { "epoch": 0.7092979413317246, "grad_norm": 0.24178754208210249, "learning_rate": 1.3745844161403454e-07, "loss": 1.5885, "step": 4703 }, { "epoch": 0.7094487595203982, "grad_norm": 0.2518623994941124, "learning_rate": 1.373740816066577e-07, "loss": 1.5964, "step": 4704 }, { "epoch": 0.7095995777090717, "grad_norm": 0.2502385809004188, "learning_rate": 1.3728975250030767e-07, "loss": 1.5666, "step": 4705 }, { "epoch": 0.7097503958977452, "grad_norm": 0.2431014721828363, "learning_rate": 1.3720545431391877e-07, "loss": 1.486, "step": 4706 }, { "epoch": 0.7099012140864188, "grad_norm": 0.3343324071561751, "learning_rate": 1.3712118706641833e-07, "loss": 1.5682, "step": 4707 }, { "epoch": 0.7100520322750924, "grad_norm": 0.24824475114384603, "learning_rate": 1.3703695077672695e-07, "loss": 1.6006, "step": 4708 }, { "epoch": 0.710202850463766, "grad_norm": 0.27672391262975443, "learning_rate": 1.36952745463758e-07, "loss": 1.5638, "step": 4709 }, { "epoch": 0.7103536686524394, "grad_norm": 0.25077329250405694, "learning_rate": 1.3686857114641805e-07, "loss": 1.5158, "step": 4710 }, { "epoch": 0.710504486841113, "grad_norm": 0.3031627755783418, "learning_rate": 1.3678442784360678e-07, "loss": 1.6167, "step": 4711 }, { "epoch": 0.7106553050297866, "grad_norm": 0.24626767774868308, "learning_rate": 1.3670031557421674e-07, "loss": 1.5919, "step": 4712 }, { "epoch": 0.7108061232184602, "grad_norm": 0.24588592958535221, "learning_rate": 1.3661623435713348e-07, "loss": 1.5194, "step": 4713 }, { "epoch": 0.7109569414071337, "grad_norm": 0.2478471317333664, "learning_rate": 1.3653218421123588e-07, "loss": 1.5131, "step": 4714 }, { "epoch": 0.7111077595958073, "grad_norm": 0.275953448782174, "learning_rate": 1.3644816515539552e-07, "loss": 1.5677, "step": 4715 }, { "epoch": 0.7112585777844808, "grad_norm": 0.25129579415503, "learning_rate": 1.3636417720847722e-07, "loss": 1.5241, "step": 4716 }, { "epoch": 0.7114093959731543, "grad_norm": 0.25414855356921834, "learning_rate": 1.3628022038933857e-07, "loss": 1.5067, "step": 4717 }, { "epoch": 0.7115602141618279, "grad_norm": 0.24728432666433134, "learning_rate": 1.3619629471683048e-07, "loss": 1.6078, "step": 4718 }, { "epoch": 0.7117110323505015, "grad_norm": 0.2449076505767851, "learning_rate": 1.361124002097966e-07, "loss": 1.5409, "step": 4719 }, { "epoch": 0.7118618505391751, "grad_norm": 0.24870336121142433, "learning_rate": 1.360285368870737e-07, "loss": 1.6059, "step": 4720 }, { "epoch": 0.7120126687278486, "grad_norm": 0.2504405644553221, "learning_rate": 1.3594470476749158e-07, "loss": 1.6057, "step": 4721 }, { "epoch": 0.7121634869165221, "grad_norm": 0.27633694696709254, "learning_rate": 1.3586090386987305e-07, "loss": 1.5701, "step": 4722 }, { "epoch": 0.7123143051051957, "grad_norm": 0.2649768585335686, "learning_rate": 1.3577713421303365e-07, "loss": 1.5804, "step": 4723 }, { "epoch": 0.7124651232938692, "grad_norm": 0.26194802890270397, "learning_rate": 1.3569339581578238e-07, "loss": 1.5762, "step": 4724 }, { "epoch": 0.7126159414825428, "grad_norm": 0.2464197300271867, "learning_rate": 1.3560968869692075e-07, "loss": 1.5432, "step": 4725 }, { "epoch": 0.7127667596712164, "grad_norm": 1.1086502037535637, "learning_rate": 1.355260128752434e-07, "loss": 1.6048, "step": 4726 }, { "epoch": 0.7129175778598899, "grad_norm": 0.30833236238085443, "learning_rate": 1.354423683695382e-07, "loss": 1.5721, "step": 4727 }, { "epoch": 0.7130683960485634, "grad_norm": 0.3791106109638253, "learning_rate": 1.3535875519858568e-07, "loss": 1.5757, "step": 4728 }, { "epoch": 0.713219214237237, "grad_norm": 0.260077268877803, "learning_rate": 1.3527517338115942e-07, "loss": 1.5573, "step": 4729 }, { "epoch": 0.7133700324259106, "grad_norm": 0.24628343108179016, "learning_rate": 1.3519162293602587e-07, "loss": 1.5816, "step": 4730 }, { "epoch": 0.7135208506145841, "grad_norm": 0.24016223821112434, "learning_rate": 1.3510810388194476e-07, "loss": 1.4971, "step": 4731 }, { "epoch": 0.7136716688032577, "grad_norm": 0.2519599764404375, "learning_rate": 1.3502461623766838e-07, "loss": 1.5589, "step": 4732 }, { "epoch": 0.7138224869919312, "grad_norm": 0.2675908029027234, "learning_rate": 1.3494116002194217e-07, "loss": 1.5877, "step": 4733 }, { "epoch": 0.7139733051806048, "grad_norm": 0.310853199221443, "learning_rate": 1.3485773525350451e-07, "loss": 1.5517, "step": 4734 }, { "epoch": 0.7141241233692783, "grad_norm": 0.25426694369992525, "learning_rate": 1.3477434195108672e-07, "loss": 1.581, "step": 4735 }, { "epoch": 0.7142749415579519, "grad_norm": 0.24852694461858268, "learning_rate": 1.346909801334129e-07, "loss": 1.4889, "step": 4736 }, { "epoch": 0.7144257597466255, "grad_norm": 0.26412638082902884, "learning_rate": 1.346076498192004e-07, "loss": 1.5109, "step": 4737 }, { "epoch": 0.714576577935299, "grad_norm": 0.3346778331399256, "learning_rate": 1.3452435102715918e-07, "loss": 1.6408, "step": 4738 }, { "epoch": 0.7147273961239725, "grad_norm": 0.24639182747383537, "learning_rate": 1.3444108377599212e-07, "loss": 1.5529, "step": 4739 }, { "epoch": 0.7148782143126461, "grad_norm": 0.25655433508352976, "learning_rate": 1.3435784808439542e-07, "loss": 1.5435, "step": 4740 }, { "epoch": 0.7150290325013197, "grad_norm": 0.2717890791231324, "learning_rate": 1.3427464397105777e-07, "loss": 1.5816, "step": 4741 }, { "epoch": 0.7151798506899932, "grad_norm": 0.24936196683161244, "learning_rate": 1.3419147145466092e-07, "loss": 1.5409, "step": 4742 }, { "epoch": 0.7153306688786668, "grad_norm": 0.27944856658442013, "learning_rate": 1.3410833055387943e-07, "loss": 1.5387, "step": 4743 }, { "epoch": 0.7154814870673403, "grad_norm": 0.2568016870564878, "learning_rate": 1.3402522128738104e-07, "loss": 1.4872, "step": 4744 }, { "epoch": 0.7156323052560138, "grad_norm": 0.27254404256969117, "learning_rate": 1.339421436738261e-07, "loss": 1.6064, "step": 4745 }, { "epoch": 0.7157831234446874, "grad_norm": 0.274410169098975, "learning_rate": 1.3385909773186788e-07, "loss": 1.5203, "step": 4746 }, { "epoch": 0.715933941633361, "grad_norm": 0.2880226154380716, "learning_rate": 1.3377608348015278e-07, "loss": 1.5141, "step": 4747 }, { "epoch": 0.7160847598220346, "grad_norm": 0.2594462924354836, "learning_rate": 1.3369310093731983e-07, "loss": 1.4899, "step": 4748 }, { "epoch": 0.7162355780107081, "grad_norm": 0.30696420741370845, "learning_rate": 1.33610150122001e-07, "loss": 1.5421, "step": 4749 }, { "epoch": 0.7163863961993816, "grad_norm": 0.2618505672279436, "learning_rate": 1.335272310528212e-07, "loss": 1.538, "step": 4750 }, { "epoch": 0.7165372143880552, "grad_norm": 0.462287871540447, "learning_rate": 1.3344434374839822e-07, "loss": 1.5347, "step": 4751 }, { "epoch": 0.7166880325767288, "grad_norm": 0.24555954596563262, "learning_rate": 1.3336148822734253e-07, "loss": 1.4691, "step": 4752 }, { "epoch": 0.7168388507654023, "grad_norm": 0.35095253187412284, "learning_rate": 1.3327866450825782e-07, "loss": 1.5425, "step": 4753 }, { "epoch": 0.7169896689540759, "grad_norm": 0.2502740043949193, "learning_rate": 1.3319587260974025e-07, "loss": 1.5323, "step": 4754 }, { "epoch": 0.7171404871427494, "grad_norm": 0.2507176225991556, "learning_rate": 1.331131125503791e-07, "loss": 1.5308, "step": 4755 }, { "epoch": 0.7172913053314229, "grad_norm": 0.2819272418342701, "learning_rate": 1.330303843487563e-07, "loss": 1.5434, "step": 4756 }, { "epoch": 0.7174421235200965, "grad_norm": 0.2509324282898924, "learning_rate": 1.3294768802344692e-07, "loss": 1.5538, "step": 4757 }, { "epoch": 0.7175929417087701, "grad_norm": 0.24737899237106037, "learning_rate": 1.328650235930186e-07, "loss": 1.4796, "step": 4758 }, { "epoch": 0.7177437598974437, "grad_norm": 0.26206163566396135, "learning_rate": 1.327823910760318e-07, "loss": 1.516, "step": 4759 }, { "epoch": 0.7178945780861172, "grad_norm": 0.6149272723503363, "learning_rate": 1.3269979049104014e-07, "loss": 1.585, "step": 4760 }, { "epoch": 0.7180453962747907, "grad_norm": 0.25192326864076015, "learning_rate": 1.3261722185658974e-07, "loss": 1.5538, "step": 4761 }, { "epoch": 0.7181962144634643, "grad_norm": 0.24960088406697814, "learning_rate": 1.3253468519121963e-07, "loss": 1.5544, "step": 4762 }, { "epoch": 0.7183470326521378, "grad_norm": 0.2752348350354927, "learning_rate": 1.324521805134618e-07, "loss": 1.5641, "step": 4763 }, { "epoch": 0.7184978508408114, "grad_norm": 0.2498284925081655, "learning_rate": 1.3236970784184088e-07, "loss": 1.6076, "step": 4764 }, { "epoch": 0.718648669029485, "grad_norm": 0.24993634033255005, "learning_rate": 1.3228726719487437e-07, "loss": 1.6529, "step": 4765 }, { "epoch": 0.7187994872181586, "grad_norm": 0.24857935421463165, "learning_rate": 1.3220485859107267e-07, "loss": 1.5523, "step": 4766 }, { "epoch": 0.718950305406832, "grad_norm": 0.27442449103397504, "learning_rate": 1.321224820489389e-07, "loss": 1.6139, "step": 4767 }, { "epoch": 0.7191011235955056, "grad_norm": 0.2631059019473054, "learning_rate": 1.320401375869689e-07, "loss": 1.5582, "step": 4768 }, { "epoch": 0.7192519417841792, "grad_norm": 0.2751035641520621, "learning_rate": 1.3195782522365145e-07, "loss": 1.5074, "step": 4769 }, { "epoch": 0.7194027599728527, "grad_norm": 0.254206834439695, "learning_rate": 1.3187554497746813e-07, "loss": 1.5125, "step": 4770 }, { "epoch": 0.7195535781615263, "grad_norm": 0.26443731820936195, "learning_rate": 1.3179329686689323e-07, "loss": 1.4735, "step": 4771 }, { "epoch": 0.7197043963501998, "grad_norm": 0.27480922400219626, "learning_rate": 1.3171108091039374e-07, "loss": 1.5646, "step": 4772 }, { "epoch": 0.7198552145388734, "grad_norm": 0.2668415492180347, "learning_rate": 1.316288971264297e-07, "loss": 1.5483, "step": 4773 }, { "epoch": 0.7200060327275469, "grad_norm": 0.27113682079889084, "learning_rate": 1.3154674553345368e-07, "loss": 1.5722, "step": 4774 }, { "epoch": 0.7201568509162205, "grad_norm": 0.24571186963288189, "learning_rate": 1.31464626149911e-07, "loss": 1.5348, "step": 4775 }, { "epoch": 0.7203076691048941, "grad_norm": 0.33501354542952616, "learning_rate": 1.313825389942401e-07, "loss": 1.5666, "step": 4776 }, { "epoch": 0.7204584872935677, "grad_norm": 0.2708282584987015, "learning_rate": 1.3130048408487177e-07, "loss": 1.5743, "step": 4777 }, { "epoch": 0.7206093054822411, "grad_norm": 0.2560038596933287, "learning_rate": 1.3121846144022963e-07, "loss": 1.5117, "step": 4778 }, { "epoch": 0.7207601236709147, "grad_norm": 0.2460218550726402, "learning_rate": 1.3113647107873042e-07, "loss": 1.5944, "step": 4779 }, { "epoch": 0.7209109418595883, "grad_norm": 0.2540661878105851, "learning_rate": 1.310545130187832e-07, "loss": 1.5551, "step": 4780 }, { "epoch": 0.7210617600482618, "grad_norm": 0.26535813628475846, "learning_rate": 1.3097258727878996e-07, "loss": 1.5921, "step": 4781 }, { "epoch": 0.7212125782369354, "grad_norm": 0.25510044653757036, "learning_rate": 1.3089069387714534e-07, "loss": 1.5921, "step": 4782 }, { "epoch": 0.721363396425609, "grad_norm": 0.258672877029231, "learning_rate": 1.30808832832237e-07, "loss": 1.5307, "step": 4783 }, { "epoch": 0.7215142146142824, "grad_norm": 0.2515153158348181, "learning_rate": 1.3072700416244492e-07, "loss": 1.564, "step": 4784 }, { "epoch": 0.721665032802956, "grad_norm": 0.25595079191995307, "learning_rate": 1.3064520788614202e-07, "loss": 1.5595, "step": 4785 }, { "epoch": 0.7218158509916296, "grad_norm": 0.2503056974253421, "learning_rate": 1.3056344402169418e-07, "loss": 1.5445, "step": 4786 }, { "epoch": 0.7219666691803032, "grad_norm": 0.23401901746378476, "learning_rate": 1.3048171258745955e-07, "loss": 1.5035, "step": 4787 }, { "epoch": 0.7221174873689767, "grad_norm": 0.2654912607361534, "learning_rate": 1.3040001360178925e-07, "loss": 1.6015, "step": 4788 }, { "epoch": 0.7222683055576502, "grad_norm": 0.3003362992871323, "learning_rate": 1.3031834708302715e-07, "loss": 1.5482, "step": 4789 }, { "epoch": 0.7224191237463238, "grad_norm": 0.2459141520442908, "learning_rate": 1.3023671304950976e-07, "loss": 1.5025, "step": 4790 }, { "epoch": 0.7225699419349974, "grad_norm": 0.2769796726755768, "learning_rate": 1.3015511151956614e-07, "loss": 1.5705, "step": 4791 }, { "epoch": 0.7227207601236709, "grad_norm": 0.2628715158476944, "learning_rate": 1.3007354251151844e-07, "loss": 1.5736, "step": 4792 }, { "epoch": 0.7228715783123445, "grad_norm": 0.232517209113232, "learning_rate": 1.299920060436812e-07, "loss": 1.5009, "step": 4793 }, { "epoch": 0.7230223965010181, "grad_norm": 0.250920557316729, "learning_rate": 1.2991050213436166e-07, "loss": 1.5482, "step": 4794 }, { "epoch": 0.7231732146896915, "grad_norm": 0.2692022742934645, "learning_rate": 1.298290308018598e-07, "loss": 1.4868, "step": 4795 }, { "epoch": 0.7233240328783651, "grad_norm": 0.2830417321359642, "learning_rate": 1.2974759206446845e-07, "loss": 1.5895, "step": 4796 }, { "epoch": 0.7234748510670387, "grad_norm": 0.25235529470522844, "learning_rate": 1.296661859404729e-07, "loss": 1.5513, "step": 4797 }, { "epoch": 0.7236256692557123, "grad_norm": 0.26965882097816113, "learning_rate": 1.2958481244815111e-07, "loss": 1.5082, "step": 4798 }, { "epoch": 0.7237764874443858, "grad_norm": 0.37111415105454276, "learning_rate": 1.29503471605774e-07, "loss": 1.4878, "step": 4799 }, { "epoch": 0.7239273056330593, "grad_norm": 0.3637365626005936, "learning_rate": 1.2942216343160478e-07, "loss": 1.5448, "step": 4800 }, { "epoch": 0.7240781238217329, "grad_norm": 0.25419613760280135, "learning_rate": 1.2934088794389954e-07, "loss": 1.5455, "step": 4801 }, { "epoch": 0.7242289420104064, "grad_norm": 0.24881891337070325, "learning_rate": 1.2925964516090703e-07, "loss": 1.5603, "step": 4802 }, { "epoch": 0.72437976019908, "grad_norm": 0.2655133874520916, "learning_rate": 1.2917843510086866e-07, "loss": 1.61, "step": 4803 }, { "epoch": 0.7245305783877536, "grad_norm": 0.24981081472759925, "learning_rate": 1.2909725778201842e-07, "loss": 1.5664, "step": 4804 }, { "epoch": 0.7246813965764272, "grad_norm": 0.2618097018011796, "learning_rate": 1.2901611322258285e-07, "loss": 1.5684, "step": 4805 }, { "epoch": 0.7248322147651006, "grad_norm": 0.2461547827592507, "learning_rate": 1.2893500144078148e-07, "loss": 1.5833, "step": 4806 }, { "epoch": 0.7249830329537742, "grad_norm": 0.3153575492477137, "learning_rate": 1.2885392245482618e-07, "loss": 1.6003, "step": 4807 }, { "epoch": 0.7251338511424478, "grad_norm": 0.2555928410426974, "learning_rate": 1.287728762829214e-07, "loss": 1.5403, "step": 4808 }, { "epoch": 0.7252846693311213, "grad_norm": 0.30337841154326795, "learning_rate": 1.2869186294326463e-07, "loss": 1.5782, "step": 4809 }, { "epoch": 0.7254354875197949, "grad_norm": 0.3306463957817853, "learning_rate": 1.286108824540456e-07, "loss": 1.5603, "step": 4810 }, { "epoch": 0.7255863057084685, "grad_norm": 0.2454194997307586, "learning_rate": 1.2852993483344664e-07, "loss": 1.5024, "step": 4811 }, { "epoch": 0.725737123897142, "grad_norm": 0.26104941804853266, "learning_rate": 1.284490200996431e-07, "loss": 1.5667, "step": 4812 }, { "epoch": 0.7258879420858155, "grad_norm": 0.25637271753239255, "learning_rate": 1.2836813827080263e-07, "loss": 1.5568, "step": 4813 }, { "epoch": 0.7260387602744891, "grad_norm": 0.2648634274855605, "learning_rate": 1.2828728936508537e-07, "loss": 1.5988, "step": 4814 }, { "epoch": 0.7261895784631627, "grad_norm": 0.2612922500483929, "learning_rate": 1.282064734006445e-07, "loss": 1.5545, "step": 4815 }, { "epoch": 0.7263403966518363, "grad_norm": 0.28560409625042155, "learning_rate": 1.2812569039562548e-07, "loss": 1.6042, "step": 4816 }, { "epoch": 0.7264912148405097, "grad_norm": 0.2518002809513442, "learning_rate": 1.2804494036816636e-07, "loss": 1.4589, "step": 4817 }, { "epoch": 0.7266420330291833, "grad_norm": 0.24302328381493488, "learning_rate": 1.279642233363979e-07, "loss": 1.6153, "step": 4818 }, { "epoch": 0.7267928512178569, "grad_norm": 0.27424878212349574, "learning_rate": 1.2788353931844347e-07, "loss": 1.6389, "step": 4819 }, { "epoch": 0.7269436694065304, "grad_norm": 0.2432496072800537, "learning_rate": 1.2780288833241905e-07, "loss": 1.5211, "step": 4820 }, { "epoch": 0.727094487595204, "grad_norm": 0.24526076941733485, "learning_rate": 1.277222703964329e-07, "loss": 1.5853, "step": 4821 }, { "epoch": 0.7272453057838776, "grad_norm": 0.2758571399825683, "learning_rate": 1.276416855285864e-07, "loss": 1.517, "step": 4822 }, { "epoch": 0.727396123972551, "grad_norm": 0.2523044002321618, "learning_rate": 1.27561133746973e-07, "loss": 1.5637, "step": 4823 }, { "epoch": 0.7275469421612246, "grad_norm": 0.2448839969754353, "learning_rate": 1.274806150696789e-07, "loss": 1.5458, "step": 4824 }, { "epoch": 0.7276977603498982, "grad_norm": 0.26831978639492615, "learning_rate": 1.2740012951478304e-07, "loss": 1.5429, "step": 4825 }, { "epoch": 0.7278485785385718, "grad_norm": 0.3017575431328631, "learning_rate": 1.2731967710035667e-07, "loss": 1.5537, "step": 4826 }, { "epoch": 0.7279993967272453, "grad_norm": 0.25548910611664444, "learning_rate": 1.2723925784446366e-07, "loss": 1.5139, "step": 4827 }, { "epoch": 0.7281502149159189, "grad_norm": 0.2505657459733443, "learning_rate": 1.271588717651606e-07, "loss": 1.4835, "step": 4828 }, { "epoch": 0.7283010331045924, "grad_norm": 0.2570799187421738, "learning_rate": 1.2707851888049646e-07, "loss": 1.5586, "step": 4829 }, { "epoch": 0.728451851293266, "grad_norm": 0.24539688464693027, "learning_rate": 1.2699819920851274e-07, "loss": 1.5456, "step": 4830 }, { "epoch": 0.7286026694819395, "grad_norm": 0.25000991554635804, "learning_rate": 1.2691791276724353e-07, "loss": 1.5591, "step": 4831 }, { "epoch": 0.7287534876706131, "grad_norm": 0.24951798023113553, "learning_rate": 1.2683765957471563e-07, "loss": 1.5319, "step": 4832 }, { "epoch": 0.7289043058592867, "grad_norm": 0.3061178431026231, "learning_rate": 1.2675743964894808e-07, "loss": 1.5712, "step": 4833 }, { "epoch": 0.7290551240479601, "grad_norm": 0.26458100141929913, "learning_rate": 1.2667725300795256e-07, "loss": 1.6257, "step": 4834 }, { "epoch": 0.7292059422366337, "grad_norm": 0.3171487655534102, "learning_rate": 1.265970996697335e-07, "loss": 1.5573, "step": 4835 }, { "epoch": 0.7293567604253073, "grad_norm": 0.25811069191209124, "learning_rate": 1.2651697965228747e-07, "loss": 1.5435, "step": 4836 }, { "epoch": 0.7295075786139809, "grad_norm": 0.2565389245468662, "learning_rate": 1.2643689297360377e-07, "loss": 1.5934, "step": 4837 }, { "epoch": 0.7296583968026544, "grad_norm": 0.2633714387416033, "learning_rate": 1.263568396516643e-07, "loss": 1.5242, "step": 4838 }, { "epoch": 0.729809214991328, "grad_norm": 0.2553323568346628, "learning_rate": 1.262768197044433e-07, "loss": 1.576, "step": 4839 }, { "epoch": 0.7299600331800015, "grad_norm": 0.3301487630751055, "learning_rate": 1.2619683314990747e-07, "loss": 1.5358, "step": 4840 }, { "epoch": 0.730110851368675, "grad_norm": 0.25633908493026725, "learning_rate": 1.2611688000601637e-07, "loss": 1.5936, "step": 4841 }, { "epoch": 0.7302616695573486, "grad_norm": 0.24960387878766888, "learning_rate": 1.2603696029072158e-07, "loss": 1.533, "step": 4842 }, { "epoch": 0.7304124877460222, "grad_norm": 0.3349952372116707, "learning_rate": 1.2595707402196753e-07, "loss": 1.5558, "step": 4843 }, { "epoch": 0.7305633059346958, "grad_norm": 0.26827627120663905, "learning_rate": 1.258772212176909e-07, "loss": 1.5489, "step": 4844 }, { "epoch": 0.7307141241233692, "grad_norm": 0.26202005410067175, "learning_rate": 1.2579740189582114e-07, "loss": 1.6241, "step": 4845 }, { "epoch": 0.7308649423120428, "grad_norm": 0.23730845402549805, "learning_rate": 1.257176160742799e-07, "loss": 1.5512, "step": 4846 }, { "epoch": 0.7310157605007164, "grad_norm": 0.24628036544363902, "learning_rate": 1.2563786377098134e-07, "loss": 1.5274, "step": 4847 }, { "epoch": 0.73116657868939, "grad_norm": 0.3359114463154828, "learning_rate": 1.2555814500383234e-07, "loss": 1.5514, "step": 4848 }, { "epoch": 0.7313173968780635, "grad_norm": 0.2545087267482347, "learning_rate": 1.2547845979073202e-07, "loss": 1.5457, "step": 4849 }, { "epoch": 0.7314682150667371, "grad_norm": 0.2385362845552149, "learning_rate": 1.2539880814957195e-07, "loss": 1.6271, "step": 4850 }, { "epoch": 0.7316190332554106, "grad_norm": 0.2630604683576181, "learning_rate": 1.2531919009823638e-07, "loss": 1.5229, "step": 4851 }, { "epoch": 0.7317698514440841, "grad_norm": 0.2413971760254369, "learning_rate": 1.2523960565460183e-07, "loss": 1.6014, "step": 4852 }, { "epoch": 0.7319206696327577, "grad_norm": 0.35491120802072545, "learning_rate": 1.2516005483653728e-07, "loss": 1.5709, "step": 4853 }, { "epoch": 0.7320714878214313, "grad_norm": 0.23795666653362516, "learning_rate": 1.2508053766190428e-07, "loss": 1.6314, "step": 4854 }, { "epoch": 0.7322223060101049, "grad_norm": 0.3201093612781402, "learning_rate": 1.2500105414855672e-07, "loss": 1.6001, "step": 4855 }, { "epoch": 0.7323731241987784, "grad_norm": 0.2570636957802805, "learning_rate": 1.2492160431434095e-07, "loss": 1.5927, "step": 4856 }, { "epoch": 0.7325239423874519, "grad_norm": 0.27672499262231987, "learning_rate": 1.248421881770957e-07, "loss": 1.5212, "step": 4857 }, { "epoch": 0.7326747605761255, "grad_norm": 0.2881789131100355, "learning_rate": 1.2476280575465236e-07, "loss": 1.5669, "step": 4858 }, { "epoch": 0.732825578764799, "grad_norm": 0.2512900593411515, "learning_rate": 1.2468345706483454e-07, "loss": 1.5854, "step": 4859 }, { "epoch": 0.7329763969534726, "grad_norm": 0.24430168831316637, "learning_rate": 1.246041421254582e-07, "loss": 1.6113, "step": 4860 }, { "epoch": 0.7331272151421462, "grad_norm": 0.24967757059348708, "learning_rate": 1.2452486095433207e-07, "loss": 1.566, "step": 4861 }, { "epoch": 0.7332780333308196, "grad_norm": 0.2460779172665949, "learning_rate": 1.2444561356925696e-07, "loss": 1.5459, "step": 4862 }, { "epoch": 0.7334288515194932, "grad_norm": 0.24547609377071855, "learning_rate": 1.2436639998802617e-07, "loss": 1.5637, "step": 4863 }, { "epoch": 0.7335796697081668, "grad_norm": 0.25436731808135615, "learning_rate": 1.2428722022842558e-07, "loss": 1.5807, "step": 4864 }, { "epoch": 0.7337304878968404, "grad_norm": 0.31255660965857957, "learning_rate": 1.242080743082333e-07, "loss": 1.6048, "step": 4865 }, { "epoch": 0.7338813060855139, "grad_norm": 0.24347193523993743, "learning_rate": 1.241289622452198e-07, "loss": 1.609, "step": 4866 }, { "epoch": 0.7340321242741875, "grad_norm": 0.24839489010233448, "learning_rate": 1.240498840571482e-07, "loss": 1.5773, "step": 4867 }, { "epoch": 0.734182942462861, "grad_norm": 0.25858908645724965, "learning_rate": 1.239708397617738e-07, "loss": 1.5358, "step": 4868 }, { "epoch": 0.7343337606515346, "grad_norm": 0.25517992246543114, "learning_rate": 1.2389182937684424e-07, "loss": 1.4918, "step": 4869 }, { "epoch": 0.7344845788402081, "grad_norm": 0.2739609838242563, "learning_rate": 1.2381285292009975e-07, "loss": 1.5208, "step": 4870 }, { "epoch": 0.7346353970288817, "grad_norm": 0.2588505484162014, "learning_rate": 1.2373391040927288e-07, "loss": 1.5892, "step": 4871 }, { "epoch": 0.7347862152175553, "grad_norm": 0.2467211738657118, "learning_rate": 1.236550018620885e-07, "loss": 1.5012, "step": 4872 }, { "epoch": 0.7349370334062288, "grad_norm": 0.2505211482849064, "learning_rate": 1.2357612729626386e-07, "loss": 1.5041, "step": 4873 }, { "epoch": 0.7350878515949023, "grad_norm": 0.23844532247069256, "learning_rate": 1.2349728672950852e-07, "loss": 1.5505, "step": 4874 }, { "epoch": 0.7352386697835759, "grad_norm": 0.25889367117582207, "learning_rate": 1.2341848017952463e-07, "loss": 1.528, "step": 4875 }, { "epoch": 0.7353894879722495, "grad_norm": 0.2632746655943858, "learning_rate": 1.2333970766400645e-07, "loss": 1.6027, "step": 4876 }, { "epoch": 0.735540306160923, "grad_norm": 0.26375667831932015, "learning_rate": 1.232609692006407e-07, "loss": 1.5342, "step": 4877 }, { "epoch": 0.7356911243495966, "grad_norm": 0.2636727335172054, "learning_rate": 1.2318226480710657e-07, "loss": 1.5426, "step": 4878 }, { "epoch": 0.7358419425382701, "grad_norm": 0.2391237416136801, "learning_rate": 1.231035945010754e-07, "loss": 1.5567, "step": 4879 }, { "epoch": 0.7359927607269436, "grad_norm": 0.25500269314353563, "learning_rate": 1.2302495830021088e-07, "loss": 1.573, "step": 4880 }, { "epoch": 0.7361435789156172, "grad_norm": 0.24175138586985964, "learning_rate": 1.2294635622216932e-07, "loss": 1.4994, "step": 4881 }, { "epoch": 0.7362943971042908, "grad_norm": 0.2729612009679162, "learning_rate": 1.2286778828459906e-07, "loss": 1.594, "step": 4882 }, { "epoch": 0.7364452152929644, "grad_norm": 0.33025350959697874, "learning_rate": 1.2278925450514088e-07, "loss": 1.5365, "step": 4883 }, { "epoch": 0.7365960334816379, "grad_norm": 0.24528978937390591, "learning_rate": 1.2271075490142795e-07, "loss": 1.527, "step": 4884 }, { "epoch": 0.7367468516703114, "grad_norm": 0.24587315632508772, "learning_rate": 1.2263228949108575e-07, "loss": 1.5086, "step": 4885 }, { "epoch": 0.736897669858985, "grad_norm": 0.24091396943889565, "learning_rate": 1.2255385829173198e-07, "loss": 1.5683, "step": 4886 }, { "epoch": 0.7370484880476585, "grad_norm": 0.26317210667279417, "learning_rate": 1.2247546132097664e-07, "loss": 1.5471, "step": 4887 }, { "epoch": 0.7371993062363321, "grad_norm": 0.45806378602570813, "learning_rate": 1.2239709859642233e-07, "loss": 1.5186, "step": 4888 }, { "epoch": 0.7373501244250057, "grad_norm": 0.2603858399133861, "learning_rate": 1.223187701356637e-07, "loss": 1.5954, "step": 4889 }, { "epoch": 0.7375009426136792, "grad_norm": 0.2440198533434365, "learning_rate": 1.2224047595628764e-07, "loss": 1.5374, "step": 4890 }, { "epoch": 0.7376517608023527, "grad_norm": 0.8741362887191947, "learning_rate": 1.2216221607587366e-07, "loss": 1.559, "step": 4891 }, { "epoch": 0.7378025789910263, "grad_norm": 0.24756208490889292, "learning_rate": 1.2208399051199331e-07, "loss": 1.5677, "step": 4892 }, { "epoch": 0.7379533971796999, "grad_norm": 0.3713112296550961, "learning_rate": 1.2200579928221041e-07, "loss": 1.5299, "step": 4893 }, { "epoch": 0.7381042153683735, "grad_norm": 0.24478060913393343, "learning_rate": 1.2192764240408133e-07, "loss": 1.5512, "step": 4894 }, { "epoch": 0.738255033557047, "grad_norm": 0.24810953221364054, "learning_rate": 1.218495198951545e-07, "loss": 1.6259, "step": 4895 }, { "epoch": 0.7384058517457205, "grad_norm": 0.3194982608501394, "learning_rate": 1.2177143177297062e-07, "loss": 1.4957, "step": 4896 }, { "epoch": 0.7385566699343941, "grad_norm": 0.23913340094219115, "learning_rate": 1.2169337805506286e-07, "loss": 1.5925, "step": 4897 }, { "epoch": 0.7387074881230676, "grad_norm": 0.2401729841984577, "learning_rate": 1.2161535875895652e-07, "loss": 1.5484, "step": 4898 }, { "epoch": 0.7388583063117412, "grad_norm": 0.2560492767790417, "learning_rate": 1.215373739021692e-07, "loss": 1.571, "step": 4899 }, { "epoch": 0.7390091245004148, "grad_norm": 0.2533453774118272, "learning_rate": 1.214594235022107e-07, "loss": 1.5562, "step": 4900 }, { "epoch": 0.7391599426890884, "grad_norm": 0.24954588639369182, "learning_rate": 1.2138150757658327e-07, "loss": 1.5795, "step": 4901 }, { "epoch": 0.7393107608777618, "grad_norm": 0.3961490119640425, "learning_rate": 1.2130362614278125e-07, "loss": 1.5271, "step": 4902 }, { "epoch": 0.7394615790664354, "grad_norm": 0.24917330164657575, "learning_rate": 1.2122577921829124e-07, "loss": 1.5983, "step": 4903 }, { "epoch": 0.739612397255109, "grad_norm": 0.2498110920325101, "learning_rate": 1.211479668205923e-07, "loss": 1.6059, "step": 4904 }, { "epoch": 0.7397632154437825, "grad_norm": 0.404323150035056, "learning_rate": 1.210701889671554e-07, "loss": 1.5374, "step": 4905 }, { "epoch": 0.7399140336324561, "grad_norm": 0.25752710970601544, "learning_rate": 1.20992445675444e-07, "loss": 1.5844, "step": 4906 }, { "epoch": 0.7400648518211296, "grad_norm": 0.26327930214752393, "learning_rate": 1.209147369629138e-07, "loss": 1.5387, "step": 4907 }, { "epoch": 0.7402156700098032, "grad_norm": 0.26222434865951005, "learning_rate": 1.208370628470126e-07, "loss": 1.5743, "step": 4908 }, { "epoch": 0.7403664881984767, "grad_norm": 0.2464825262906466, "learning_rate": 1.2075942334518043e-07, "loss": 1.5022, "step": 4909 }, { "epoch": 0.7405173063871503, "grad_norm": 0.24335101661561845, "learning_rate": 1.206818184748498e-07, "loss": 1.5018, "step": 4910 }, { "epoch": 0.7406681245758239, "grad_norm": 0.25616286972518576, "learning_rate": 1.2060424825344513e-07, "loss": 1.5709, "step": 4911 }, { "epoch": 0.7408189427644974, "grad_norm": 0.2556399833872158, "learning_rate": 1.2052671269838326e-07, "loss": 1.4909, "step": 4912 }, { "epoch": 0.7409697609531709, "grad_norm": 0.27448818647694906, "learning_rate": 1.2044921182707306e-07, "loss": 1.5212, "step": 4913 }, { "epoch": 0.7411205791418445, "grad_norm": 0.27039827192529786, "learning_rate": 1.203717456569159e-07, "loss": 1.4644, "step": 4914 }, { "epoch": 0.7412713973305181, "grad_norm": 0.2517974792155157, "learning_rate": 1.202943142053051e-07, "loss": 1.5436, "step": 4915 }, { "epoch": 0.7414222155191916, "grad_norm": 0.2514834079126791, "learning_rate": 1.2021691748962622e-07, "loss": 1.5402, "step": 4916 }, { "epoch": 0.7415730337078652, "grad_norm": 0.2475307711527151, "learning_rate": 1.2013955552725722e-07, "loss": 1.5476, "step": 4917 }, { "epoch": 0.7417238518965388, "grad_norm": 0.25423670019438954, "learning_rate": 1.2006222833556805e-07, "loss": 1.6096, "step": 4918 }, { "epoch": 0.7418746700852122, "grad_norm": 0.27267555069412147, "learning_rate": 1.1998493593192084e-07, "loss": 1.5231, "step": 4919 }, { "epoch": 0.7420254882738858, "grad_norm": 0.26121014975893625, "learning_rate": 1.1990767833367011e-07, "loss": 1.5762, "step": 4920 }, { "epoch": 0.7421763064625594, "grad_norm": 0.2595423583273133, "learning_rate": 1.1983045555816236e-07, "loss": 1.5498, "step": 4921 }, { "epoch": 0.742327124651233, "grad_norm": 0.2541893990545665, "learning_rate": 1.1975326762273634e-07, "loss": 1.5345, "step": 4922 }, { "epoch": 0.7424779428399065, "grad_norm": 0.24344151161699468, "learning_rate": 1.1967611454472306e-07, "loss": 1.52, "step": 4923 }, { "epoch": 0.74262876102858, "grad_norm": 0.24929965046703223, "learning_rate": 1.1959899634144562e-07, "loss": 1.575, "step": 4924 }, { "epoch": 0.7427795792172536, "grad_norm": 0.26155719723606896, "learning_rate": 1.1952191303021926e-07, "loss": 1.5896, "step": 4925 }, { "epoch": 0.7429303974059271, "grad_norm": 0.26977541332904076, "learning_rate": 1.1944486462835139e-07, "loss": 1.5467, "step": 4926 }, { "epoch": 0.7430812155946007, "grad_norm": 0.27565973767509366, "learning_rate": 1.1936785115314173e-07, "loss": 1.5633, "step": 4927 }, { "epoch": 0.7432320337832743, "grad_norm": 0.25178798058837637, "learning_rate": 1.1929087262188203e-07, "loss": 1.5646, "step": 4928 }, { "epoch": 0.7433828519719479, "grad_norm": 0.2527947496709433, "learning_rate": 1.192139290518561e-07, "loss": 1.5178, "step": 4929 }, { "epoch": 0.7435336701606213, "grad_norm": 0.2510570946797427, "learning_rate": 1.1913702046034014e-07, "loss": 1.5101, "step": 4930 }, { "epoch": 0.7436844883492949, "grad_norm": 0.25647391759096383, "learning_rate": 1.1906014686460233e-07, "loss": 1.6167, "step": 4931 }, { "epoch": 0.7438353065379685, "grad_norm": 0.5134223535051595, "learning_rate": 1.1898330828190293e-07, "loss": 1.567, "step": 4932 }, { "epoch": 0.743986124726642, "grad_norm": 0.3244622011687557, "learning_rate": 1.1890650472949463e-07, "loss": 1.5749, "step": 4933 }, { "epoch": 0.7441369429153156, "grad_norm": 0.2515454757116323, "learning_rate": 1.1882973622462195e-07, "loss": 1.5309, "step": 4934 }, { "epoch": 0.7442877611039891, "grad_norm": 0.2450180040181453, "learning_rate": 1.1875300278452166e-07, "loss": 1.5503, "step": 4935 }, { "epoch": 0.7444385792926627, "grad_norm": 0.30533582464679626, "learning_rate": 1.1867630442642259e-07, "loss": 1.5621, "step": 4936 }, { "epoch": 0.7445893974813362, "grad_norm": 0.26862067115477706, "learning_rate": 1.185996411675459e-07, "loss": 1.5826, "step": 4937 }, { "epoch": 0.7447402156700098, "grad_norm": 0.24082233081492946, "learning_rate": 1.1852301302510466e-07, "loss": 1.533, "step": 4938 }, { "epoch": 0.7448910338586834, "grad_norm": 0.2594698908196981, "learning_rate": 1.1844642001630402e-07, "loss": 1.5032, "step": 4939 }, { "epoch": 0.745041852047357, "grad_norm": 0.2559417207583824, "learning_rate": 1.1836986215834152e-07, "loss": 1.5699, "step": 4940 }, { "epoch": 0.7451926702360304, "grad_norm": 0.2812017700416851, "learning_rate": 1.1829333946840654e-07, "loss": 1.494, "step": 4941 }, { "epoch": 0.745343488424704, "grad_norm": 0.2468609173351973, "learning_rate": 1.1821685196368056e-07, "loss": 1.5215, "step": 4942 }, { "epoch": 0.7454943066133776, "grad_norm": 0.24703477600416315, "learning_rate": 1.1814039966133743e-07, "loss": 1.5627, "step": 4943 }, { "epoch": 0.7456451248020511, "grad_norm": 0.26930351840059186, "learning_rate": 1.1806398257854283e-07, "loss": 1.6403, "step": 4944 }, { "epoch": 0.7457959429907247, "grad_norm": 0.2484494879334176, "learning_rate": 1.1798760073245456e-07, "loss": 1.5543, "step": 4945 }, { "epoch": 0.7459467611793983, "grad_norm": 0.2598991579530039, "learning_rate": 1.179112541402227e-07, "loss": 1.5641, "step": 4946 }, { "epoch": 0.7460975793680718, "grad_norm": 0.2499149483355594, "learning_rate": 1.178349428189892e-07, "loss": 1.6063, "step": 4947 }, { "epoch": 0.7462483975567453, "grad_norm": 0.26364334144722557, "learning_rate": 1.177586667858882e-07, "loss": 1.5886, "step": 4948 }, { "epoch": 0.7463992157454189, "grad_norm": 0.24273355169903088, "learning_rate": 1.176824260580458e-07, "loss": 1.5432, "step": 4949 }, { "epoch": 0.7465500339340925, "grad_norm": 0.248398952345595, "learning_rate": 1.1760622065258045e-07, "loss": 1.5106, "step": 4950 }, { "epoch": 0.746700852122766, "grad_norm": 0.2591335359742776, "learning_rate": 1.1753005058660235e-07, "loss": 1.6749, "step": 4951 }, { "epoch": 0.7468516703114395, "grad_norm": 0.2685687249343771, "learning_rate": 1.1745391587721384e-07, "loss": 1.5833, "step": 4952 }, { "epoch": 0.7470024885001131, "grad_norm": 0.260740723925062, "learning_rate": 1.1737781654150954e-07, "loss": 1.5123, "step": 4953 }, { "epoch": 0.7471533066887867, "grad_norm": 0.254406185396711, "learning_rate": 1.1730175259657592e-07, "loss": 1.6124, "step": 4954 }, { "epoch": 0.7473041248774602, "grad_norm": 0.249995359524058, "learning_rate": 1.1722572405949142e-07, "loss": 1.6095, "step": 4955 }, { "epoch": 0.7474549430661338, "grad_norm": 0.40428727869443287, "learning_rate": 1.1714973094732685e-07, "loss": 1.5825, "step": 4956 }, { "epoch": 0.7476057612548074, "grad_norm": 0.2707533573954845, "learning_rate": 1.170737732771448e-07, "loss": 1.5751, "step": 4957 }, { "epoch": 0.7477565794434808, "grad_norm": 0.24719276947553667, "learning_rate": 1.1699785106599988e-07, "loss": 1.4995, "step": 4958 }, { "epoch": 0.7479073976321544, "grad_norm": 0.37578501917362633, "learning_rate": 1.1692196433093901e-07, "loss": 1.535, "step": 4959 }, { "epoch": 0.748058215820828, "grad_norm": 0.2815482979032689, "learning_rate": 1.1684611308900089e-07, "loss": 1.5078, "step": 4960 }, { "epoch": 0.7482090340095016, "grad_norm": 0.23479697436304425, "learning_rate": 1.1677029735721633e-07, "loss": 1.5504, "step": 4961 }, { "epoch": 0.7483598521981751, "grad_norm": 0.2573652262763795, "learning_rate": 1.1669451715260808e-07, "loss": 1.5825, "step": 4962 }, { "epoch": 0.7485106703868487, "grad_norm": 0.2531541201062456, "learning_rate": 1.1661877249219119e-07, "loss": 1.5803, "step": 4963 }, { "epoch": 0.7486614885755222, "grad_norm": 0.23667926303795242, "learning_rate": 1.1654306339297243e-07, "loss": 1.4844, "step": 4964 }, { "epoch": 0.7488123067641957, "grad_norm": 0.2478635136891246, "learning_rate": 1.1646738987195066e-07, "loss": 1.6326, "step": 4965 }, { "epoch": 0.7489631249528693, "grad_norm": 0.2392680276111686, "learning_rate": 1.1639175194611691e-07, "loss": 1.517, "step": 4966 }, { "epoch": 0.7491139431415429, "grad_norm": 0.28320944578932267, "learning_rate": 1.1631614963245402e-07, "loss": 1.6226, "step": 4967 }, { "epoch": 0.7492647613302165, "grad_norm": 0.24608625349818133, "learning_rate": 1.1624058294793687e-07, "loss": 1.5865, "step": 4968 }, { "epoch": 0.7494155795188899, "grad_norm": 0.24516006928790682, "learning_rate": 1.1616505190953247e-07, "loss": 1.5116, "step": 4969 }, { "epoch": 0.7495663977075635, "grad_norm": 0.24647472487682642, "learning_rate": 1.1608955653419973e-07, "loss": 1.5507, "step": 4970 }, { "epoch": 0.7497172158962371, "grad_norm": 0.24875229566089416, "learning_rate": 1.1601409683888945e-07, "loss": 1.4911, "step": 4971 }, { "epoch": 0.7498680340849107, "grad_norm": 0.23487532169749784, "learning_rate": 1.159386728405447e-07, "loss": 1.5221, "step": 4972 }, { "epoch": 0.7500188522735842, "grad_norm": 0.25496571236082377, "learning_rate": 1.1586328455610026e-07, "loss": 1.5835, "step": 4973 }, { "epoch": 0.7501696704622578, "grad_norm": 0.2596088183079915, "learning_rate": 1.1578793200248302e-07, "loss": 1.5166, "step": 4974 }, { "epoch": 0.7503204886509313, "grad_norm": 0.27419554174727, "learning_rate": 1.1571261519661174e-07, "loss": 1.5739, "step": 4975 }, { "epoch": 0.7504713068396048, "grad_norm": 0.2543888042401833, "learning_rate": 1.1563733415539738e-07, "loss": 1.559, "step": 4976 }, { "epoch": 0.7506221250282784, "grad_norm": 0.25937322181867734, "learning_rate": 1.1556208889574265e-07, "loss": 1.5478, "step": 4977 }, { "epoch": 0.750772943216952, "grad_norm": 0.28582306238223393, "learning_rate": 1.1548687943454221e-07, "loss": 1.5822, "step": 4978 }, { "epoch": 0.7509237614056256, "grad_norm": 0.24300248886885564, "learning_rate": 1.15411705788683e-07, "loss": 1.521, "step": 4979 }, { "epoch": 0.751074579594299, "grad_norm": 0.2466945390160691, "learning_rate": 1.1533656797504351e-07, "loss": 1.6157, "step": 4980 }, { "epoch": 0.7512253977829726, "grad_norm": 0.5555991220101545, "learning_rate": 1.1526146601049439e-07, "loss": 1.5467, "step": 4981 }, { "epoch": 0.7513762159716462, "grad_norm": 0.25727078930771363, "learning_rate": 1.1518639991189831e-07, "loss": 1.5622, "step": 4982 }, { "epoch": 0.7515270341603197, "grad_norm": 0.2451247504649221, "learning_rate": 1.1511136969610975e-07, "loss": 1.5494, "step": 4983 }, { "epoch": 0.7516778523489933, "grad_norm": 0.2447929104257427, "learning_rate": 1.1503637537997508e-07, "loss": 1.5519, "step": 4984 }, { "epoch": 0.7518286705376669, "grad_norm": 0.24481084601332812, "learning_rate": 1.1496141698033284e-07, "loss": 1.5932, "step": 4985 }, { "epoch": 0.7519794887263404, "grad_norm": 0.3341531795440303, "learning_rate": 1.1488649451401336e-07, "loss": 1.5481, "step": 4986 }, { "epoch": 0.7521303069150139, "grad_norm": 0.24060085308458376, "learning_rate": 1.1481160799783889e-07, "loss": 1.6076, "step": 4987 }, { "epoch": 0.7522811251036875, "grad_norm": 0.2607819323107068, "learning_rate": 1.1473675744862353e-07, "loss": 1.6102, "step": 4988 }, { "epoch": 0.7524319432923611, "grad_norm": 0.4569374987845713, "learning_rate": 1.1466194288317359e-07, "loss": 1.5122, "step": 4989 }, { "epoch": 0.7525827614810346, "grad_norm": 0.2642753303846815, "learning_rate": 1.1458716431828701e-07, "loss": 1.6151, "step": 4990 }, { "epoch": 0.7527335796697082, "grad_norm": 0.24128226500742275, "learning_rate": 1.1451242177075373e-07, "loss": 1.5247, "step": 4991 }, { "epoch": 0.7528843978583817, "grad_norm": 0.2464668191518985, "learning_rate": 1.1443771525735575e-07, "loss": 1.574, "step": 4992 }, { "epoch": 0.7530352160470553, "grad_norm": 0.24845291887234755, "learning_rate": 1.1436304479486679e-07, "loss": 1.5154, "step": 4993 }, { "epoch": 0.7531860342357288, "grad_norm": 0.2611909477100272, "learning_rate": 1.1428841040005243e-07, "loss": 1.6241, "step": 4994 }, { "epoch": 0.7533368524244024, "grad_norm": 0.25661868482566025, "learning_rate": 1.142138120896705e-07, "loss": 1.5354, "step": 4995 }, { "epoch": 0.753487670613076, "grad_norm": 0.2798025246675307, "learning_rate": 1.1413924988047034e-07, "loss": 1.5309, "step": 4996 }, { "epoch": 0.7536384888017494, "grad_norm": 0.27912914616880197, "learning_rate": 1.1406472378919329e-07, "loss": 1.5718, "step": 4997 }, { "epoch": 0.753789306990423, "grad_norm": 0.24277232163790605, "learning_rate": 1.1399023383257277e-07, "loss": 1.5643, "step": 4998 }, { "epoch": 0.7539401251790966, "grad_norm": 0.3617782813224853, "learning_rate": 1.1391578002733388e-07, "loss": 1.5847, "step": 4999 }, { "epoch": 0.7540909433677702, "grad_norm": 0.26007769529671915, "learning_rate": 1.1384136239019368e-07, "loss": 1.5742, "step": 5000 }, { "epoch": 0.7542417615564437, "grad_norm": 0.26579375940669586, "learning_rate": 1.13766980937861e-07, "loss": 1.5151, "step": 5001 }, { "epoch": 0.7543925797451173, "grad_norm": 0.2720902322803907, "learning_rate": 1.1369263568703679e-07, "loss": 1.5097, "step": 5002 }, { "epoch": 0.7545433979337908, "grad_norm": 0.2511906572314177, "learning_rate": 1.1361832665441365e-07, "loss": 1.5099, "step": 5003 }, { "epoch": 0.7546942161224643, "grad_norm": 0.24154146744530233, "learning_rate": 1.1354405385667607e-07, "loss": 1.5492, "step": 5004 }, { "epoch": 0.7548450343111379, "grad_norm": 0.2572353015729826, "learning_rate": 1.1346981731050058e-07, "loss": 1.6045, "step": 5005 }, { "epoch": 0.7549958524998115, "grad_norm": 0.25829188720635793, "learning_rate": 1.133956170325554e-07, "loss": 1.5816, "step": 5006 }, { "epoch": 0.7551466706884851, "grad_norm": 0.2774856071266561, "learning_rate": 1.1332145303950055e-07, "loss": 1.5602, "step": 5007 }, { "epoch": 0.7552974888771586, "grad_norm": 0.2721287551465047, "learning_rate": 1.1324732534798817e-07, "loss": 1.5472, "step": 5008 }, { "epoch": 0.7554483070658321, "grad_norm": 0.2436616598675525, "learning_rate": 1.1317323397466201e-07, "loss": 1.6173, "step": 5009 }, { "epoch": 0.7555991252545057, "grad_norm": 0.25715980408099304, "learning_rate": 1.1309917893615767e-07, "loss": 1.5151, "step": 5010 }, { "epoch": 0.7557499434431793, "grad_norm": 0.2464845622169951, "learning_rate": 1.1302516024910279e-07, "loss": 1.6045, "step": 5011 }, { "epoch": 0.7559007616318528, "grad_norm": 0.2672272217687471, "learning_rate": 1.1295117793011667e-07, "loss": 1.5588, "step": 5012 }, { "epoch": 0.7560515798205264, "grad_norm": 0.2424124619486496, "learning_rate": 1.1287723199581049e-07, "loss": 1.6073, "step": 5013 }, { "epoch": 0.7562023980091999, "grad_norm": 0.2584854770320498, "learning_rate": 1.128033224627872e-07, "loss": 1.5845, "step": 5014 }, { "epoch": 0.7563532161978734, "grad_norm": 0.2836166456867082, "learning_rate": 1.1272944934764175e-07, "loss": 1.6881, "step": 5015 }, { "epoch": 0.756504034386547, "grad_norm": 0.24450066706031187, "learning_rate": 1.1265561266696079e-07, "loss": 1.5112, "step": 5016 }, { "epoch": 0.7566548525752206, "grad_norm": 0.2352581205448825, "learning_rate": 1.125818124373227e-07, "loss": 1.5355, "step": 5017 }, { "epoch": 0.7568056707638942, "grad_norm": 0.24222512360340281, "learning_rate": 1.1250804867529792e-07, "loss": 1.5726, "step": 5018 }, { "epoch": 0.7569564889525677, "grad_norm": 0.26113164150923096, "learning_rate": 1.1243432139744849e-07, "loss": 1.5462, "step": 5019 }, { "epoch": 0.7571073071412412, "grad_norm": 0.24774383004217806, "learning_rate": 1.1236063062032832e-07, "loss": 1.6045, "step": 5020 }, { "epoch": 0.7572581253299148, "grad_norm": 0.2895263852510736, "learning_rate": 1.1228697636048318e-07, "loss": 1.5447, "step": 5021 }, { "epoch": 0.7574089435185883, "grad_norm": 0.24985005566487262, "learning_rate": 1.1221335863445062e-07, "loss": 1.54, "step": 5022 }, { "epoch": 0.7575597617072619, "grad_norm": 0.30047437792071696, "learning_rate": 1.1213977745875986e-07, "loss": 1.5912, "step": 5023 }, { "epoch": 0.7577105798959355, "grad_norm": 0.2559631034377228, "learning_rate": 1.1206623284993205e-07, "loss": 1.576, "step": 5024 }, { "epoch": 0.757861398084609, "grad_norm": 0.24488841104514433, "learning_rate": 1.1199272482448016e-07, "loss": 1.5611, "step": 5025 }, { "epoch": 0.7580122162732825, "grad_norm": 0.2506081363400735, "learning_rate": 1.1191925339890885e-07, "loss": 1.5457, "step": 5026 }, { "epoch": 0.7581630344619561, "grad_norm": 0.2548257218134441, "learning_rate": 1.1184581858971454e-07, "loss": 1.5363, "step": 5027 }, { "epoch": 0.7583138526506297, "grad_norm": 0.2599352467154265, "learning_rate": 1.1177242041338558e-07, "loss": 1.5705, "step": 5028 }, { "epoch": 0.7584646708393032, "grad_norm": 0.2992245833249952, "learning_rate": 1.1169905888640197e-07, "loss": 1.5595, "step": 5029 }, { "epoch": 0.7586154890279768, "grad_norm": 0.26473812364540256, "learning_rate": 1.1162573402523539e-07, "loss": 1.5335, "step": 5030 }, { "epoch": 0.7587663072166503, "grad_norm": 0.24633036947358974, "learning_rate": 1.115524458463496e-07, "loss": 1.516, "step": 5031 }, { "epoch": 0.7589171254053239, "grad_norm": 0.2593413769776572, "learning_rate": 1.1147919436619982e-07, "loss": 1.6015, "step": 5032 }, { "epoch": 0.7590679435939974, "grad_norm": 0.2752753592917604, "learning_rate": 1.114059796012331e-07, "loss": 1.5842, "step": 5033 }, { "epoch": 0.759218761782671, "grad_norm": 0.2764460999783728, "learning_rate": 1.1133280156788841e-07, "loss": 1.5341, "step": 5034 }, { "epoch": 0.7593695799713446, "grad_norm": 0.24793911244162387, "learning_rate": 1.1125966028259628e-07, "loss": 1.6194, "step": 5035 }, { "epoch": 0.7595203981600182, "grad_norm": 0.26031083680762124, "learning_rate": 1.1118655576177906e-07, "loss": 1.5888, "step": 5036 }, { "epoch": 0.7596712163486916, "grad_norm": 0.2483042361062299, "learning_rate": 1.1111348802185078e-07, "loss": 1.5365, "step": 5037 }, { "epoch": 0.7598220345373652, "grad_norm": 0.247028271129063, "learning_rate": 1.1104045707921743e-07, "loss": 1.5584, "step": 5038 }, { "epoch": 0.7599728527260388, "grad_norm": 0.27686028713949173, "learning_rate": 1.1096746295027647e-07, "loss": 1.5153, "step": 5039 }, { "epoch": 0.7601236709147123, "grad_norm": 0.2546933802016954, "learning_rate": 1.1089450565141718e-07, "loss": 1.6124, "step": 5040 }, { "epoch": 0.7602744891033859, "grad_norm": 0.29014179257294564, "learning_rate": 1.1082158519902072e-07, "loss": 1.5824, "step": 5041 }, { "epoch": 0.7604253072920594, "grad_norm": 0.2604184056232571, "learning_rate": 1.1074870160945976e-07, "loss": 1.6372, "step": 5042 }, { "epoch": 0.760576125480733, "grad_norm": 0.25121624505515716, "learning_rate": 1.1067585489909878e-07, "loss": 1.5824, "step": 5043 }, { "epoch": 0.7607269436694065, "grad_norm": 0.282952493155054, "learning_rate": 1.1060304508429406e-07, "loss": 1.6134, "step": 5044 }, { "epoch": 0.7608777618580801, "grad_norm": 0.46881777025757665, "learning_rate": 1.1053027218139351e-07, "loss": 1.4651, "step": 5045 }, { "epoch": 0.7610285800467537, "grad_norm": 0.24818367502647792, "learning_rate": 1.1045753620673667e-07, "loss": 1.5725, "step": 5046 }, { "epoch": 0.7611793982354272, "grad_norm": 0.24944519143597316, "learning_rate": 1.1038483717665503e-07, "loss": 1.5695, "step": 5047 }, { "epoch": 0.7613302164241007, "grad_norm": 0.2537350616836102, "learning_rate": 1.1031217510747156e-07, "loss": 1.5754, "step": 5048 }, { "epoch": 0.7614810346127743, "grad_norm": 0.2466758581154355, "learning_rate": 1.1023955001550103e-07, "loss": 1.5672, "step": 5049 }, { "epoch": 0.7616318528014479, "grad_norm": 0.274456611738853, "learning_rate": 1.1016696191704979e-07, "loss": 1.5828, "step": 5050 }, { "epoch": 0.7617826709901214, "grad_norm": 0.24776615534578775, "learning_rate": 1.1009441082841614e-07, "loss": 1.6203, "step": 5051 }, { "epoch": 0.761933489178795, "grad_norm": 0.27968409507398106, "learning_rate": 1.1002189676588985e-07, "loss": 1.5833, "step": 5052 }, { "epoch": 0.7620843073674686, "grad_norm": 0.2570880513767808, "learning_rate": 1.0994941974575237e-07, "loss": 1.5976, "step": 5053 }, { "epoch": 0.762235125556142, "grad_norm": 0.5737033159122047, "learning_rate": 1.0987697978427702e-07, "loss": 1.5633, "step": 5054 }, { "epoch": 0.7623859437448156, "grad_norm": 0.247452268839846, "learning_rate": 1.0980457689772862e-07, "loss": 1.5988, "step": 5055 }, { "epoch": 0.7625367619334892, "grad_norm": 0.2438038408104602, "learning_rate": 1.0973221110236363e-07, "loss": 1.5329, "step": 5056 }, { "epoch": 0.7626875801221628, "grad_norm": 0.2730328055823197, "learning_rate": 1.096598824144305e-07, "loss": 1.5276, "step": 5057 }, { "epoch": 0.7628383983108363, "grad_norm": 0.2570461385203547, "learning_rate": 1.0958759085016896e-07, "loss": 1.5214, "step": 5058 }, { "epoch": 0.7629892164995098, "grad_norm": 0.24501258526815628, "learning_rate": 1.095153364258106e-07, "loss": 1.5642, "step": 5059 }, { "epoch": 0.7631400346881834, "grad_norm": 0.25496296302770444, "learning_rate": 1.0944311915757869e-07, "loss": 1.5868, "step": 5060 }, { "epoch": 0.7632908528768569, "grad_norm": 0.25572422863875494, "learning_rate": 1.0937093906168807e-07, "loss": 1.5427, "step": 5061 }, { "epoch": 0.7634416710655305, "grad_norm": 0.29023173011170667, "learning_rate": 1.0929879615434529e-07, "loss": 1.5807, "step": 5062 }, { "epoch": 0.7635924892542041, "grad_norm": 0.26293886464282606, "learning_rate": 1.0922669045174846e-07, "loss": 1.5797, "step": 5063 }, { "epoch": 0.7637433074428777, "grad_norm": 0.3782063987669869, "learning_rate": 1.0915462197008754e-07, "loss": 1.5318, "step": 5064 }, { "epoch": 0.7638941256315511, "grad_norm": 0.28029250684341045, "learning_rate": 1.0908259072554397e-07, "loss": 1.5944, "step": 5065 }, { "epoch": 0.7640449438202247, "grad_norm": 0.25338732580070844, "learning_rate": 1.0901059673429072e-07, "loss": 1.5392, "step": 5066 }, { "epoch": 0.7641957620088983, "grad_norm": 0.2500040273740219, "learning_rate": 1.0893864001249275e-07, "loss": 1.5127, "step": 5067 }, { "epoch": 0.7643465801975718, "grad_norm": 0.2629491004831539, "learning_rate": 1.0886672057630634e-07, "loss": 1.5428, "step": 5068 }, { "epoch": 0.7644973983862454, "grad_norm": 0.2555813459761103, "learning_rate": 1.0879483844187939e-07, "loss": 1.5993, "step": 5069 }, { "epoch": 0.7646482165749189, "grad_norm": 0.2511057797810466, "learning_rate": 1.0872299362535173e-07, "loss": 1.5218, "step": 5070 }, { "epoch": 0.7647990347635925, "grad_norm": 0.24639035790289063, "learning_rate": 1.0865118614285451e-07, "loss": 1.5672, "step": 5071 }, { "epoch": 0.764949852952266, "grad_norm": 0.25237923393152023, "learning_rate": 1.0857941601051059e-07, "loss": 1.5934, "step": 5072 }, { "epoch": 0.7651006711409396, "grad_norm": 0.24358021773191016, "learning_rate": 1.085076832444345e-07, "loss": 1.5214, "step": 5073 }, { "epoch": 0.7652514893296132, "grad_norm": 0.2757877174872806, "learning_rate": 1.0843598786073233e-07, "loss": 1.5243, "step": 5074 }, { "epoch": 0.7654023075182868, "grad_norm": 0.24732749051442404, "learning_rate": 1.0836432987550176e-07, "loss": 1.5443, "step": 5075 }, { "epoch": 0.7655531257069602, "grad_norm": 0.2519313151788913, "learning_rate": 1.0829270930483204e-07, "loss": 1.5767, "step": 5076 }, { "epoch": 0.7657039438956338, "grad_norm": 0.35168934252187173, "learning_rate": 1.0822112616480412e-07, "loss": 1.5573, "step": 5077 }, { "epoch": 0.7658547620843074, "grad_norm": 0.2848108080477447, "learning_rate": 1.081495804714906e-07, "loss": 1.5374, "step": 5078 }, { "epoch": 0.7660055802729809, "grad_norm": 0.455667879865247, "learning_rate": 1.0807807224095547e-07, "loss": 1.6263, "step": 5079 }, { "epoch": 0.7661563984616545, "grad_norm": 0.25405772398517595, "learning_rate": 1.0800660148925436e-07, "loss": 1.5841, "step": 5080 }, { "epoch": 0.7663072166503281, "grad_norm": 0.253593517656124, "learning_rate": 1.0793516823243468e-07, "loss": 1.5692, "step": 5081 }, { "epoch": 0.7664580348390015, "grad_norm": 0.3045819421057285, "learning_rate": 1.078637724865352e-07, "loss": 1.5518, "step": 5082 }, { "epoch": 0.7666088530276751, "grad_norm": 0.4177817143813259, "learning_rate": 1.0779241426758627e-07, "loss": 1.5759, "step": 5083 }, { "epoch": 0.7667596712163487, "grad_norm": 0.24682410044797307, "learning_rate": 1.0772109359161003e-07, "loss": 1.5823, "step": 5084 }, { "epoch": 0.7669104894050223, "grad_norm": 0.2693160890115367, "learning_rate": 1.0764981047461996e-07, "loss": 1.591, "step": 5085 }, { "epoch": 0.7670613075936958, "grad_norm": 0.2505752972828048, "learning_rate": 1.0757856493262116e-07, "loss": 1.594, "step": 5086 }, { "epoch": 0.7672121257823693, "grad_norm": 0.3321816099078353, "learning_rate": 1.0750735698161045e-07, "loss": 1.5276, "step": 5087 }, { "epoch": 0.7673629439710429, "grad_norm": 0.4899980373642501, "learning_rate": 1.0743618663757602e-07, "loss": 1.5406, "step": 5088 }, { "epoch": 0.7675137621597165, "grad_norm": 0.30735719382349896, "learning_rate": 1.0736505391649762e-07, "loss": 1.5763, "step": 5089 }, { "epoch": 0.76766458034839, "grad_norm": 0.24252898756360913, "learning_rate": 1.0729395883434675e-07, "loss": 1.5664, "step": 5090 }, { "epoch": 0.7678153985370636, "grad_norm": 0.26556494556336524, "learning_rate": 1.0722290140708628e-07, "loss": 1.6136, "step": 5091 }, { "epoch": 0.7679662167257372, "grad_norm": 0.25334258561603834, "learning_rate": 1.0715188165067066e-07, "loss": 1.6102, "step": 5092 }, { "epoch": 0.7681170349144106, "grad_norm": 0.25168076140626744, "learning_rate": 1.0708089958104576e-07, "loss": 1.5315, "step": 5093 }, { "epoch": 0.7682678531030842, "grad_norm": 0.24437736466101306, "learning_rate": 1.0700995521414938e-07, "loss": 1.5378, "step": 5094 }, { "epoch": 0.7684186712917578, "grad_norm": 0.2706173777643933, "learning_rate": 1.0693904856591043e-07, "loss": 1.5875, "step": 5095 }, { "epoch": 0.7685694894804314, "grad_norm": 0.26091900532545664, "learning_rate": 1.068681796522495e-07, "loss": 1.6139, "step": 5096 }, { "epoch": 0.7687203076691049, "grad_norm": 0.2668382964154832, "learning_rate": 1.0679734848907885e-07, "loss": 1.588, "step": 5097 }, { "epoch": 0.7688711258577785, "grad_norm": 0.24883145644711072, "learning_rate": 1.0672655509230205e-07, "loss": 1.4768, "step": 5098 }, { "epoch": 0.769021944046452, "grad_norm": 0.27166435863492955, "learning_rate": 1.0665579947781425e-07, "loss": 1.5197, "step": 5099 }, { "epoch": 0.7691727622351255, "grad_norm": 0.2772294693142326, "learning_rate": 1.0658508166150224e-07, "loss": 1.523, "step": 5100 }, { "epoch": 0.7693235804237991, "grad_norm": 0.26345014775586284, "learning_rate": 1.0651440165924419e-07, "loss": 1.6231, "step": 5101 }, { "epoch": 0.7694743986124727, "grad_norm": 0.2625428479171914, "learning_rate": 1.0644375948690975e-07, "loss": 1.5537, "step": 5102 }, { "epoch": 0.7696252168011463, "grad_norm": 0.24545151495665093, "learning_rate": 1.0637315516036027e-07, "loss": 1.6497, "step": 5103 }, { "epoch": 0.7697760349898197, "grad_norm": 0.29316282299242524, "learning_rate": 1.063025886954484e-07, "loss": 1.591, "step": 5104 }, { "epoch": 0.7699268531784933, "grad_norm": 0.2483624076119689, "learning_rate": 1.0623206010801837e-07, "loss": 1.552, "step": 5105 }, { "epoch": 0.7700776713671669, "grad_norm": 0.25246809965954947, "learning_rate": 1.0616156941390586e-07, "loss": 1.572, "step": 5106 }, { "epoch": 0.7702284895558404, "grad_norm": 0.24407033874182604, "learning_rate": 1.0609111662893822e-07, "loss": 1.5878, "step": 5107 }, { "epoch": 0.770379307744514, "grad_norm": 0.24774824490308997, "learning_rate": 1.0602070176893404e-07, "loss": 1.5668, "step": 5108 }, { "epoch": 0.7705301259331876, "grad_norm": 0.24146342902464263, "learning_rate": 1.059503248497035e-07, "loss": 1.5541, "step": 5109 }, { "epoch": 0.7706809441218611, "grad_norm": 0.23805835167014303, "learning_rate": 1.0587998588704834e-07, "loss": 1.495, "step": 5110 }, { "epoch": 0.7708317623105346, "grad_norm": 0.42185474556300606, "learning_rate": 1.0580968489676167e-07, "loss": 1.5745, "step": 5111 }, { "epoch": 0.7709825804992082, "grad_norm": 0.2408951418800673, "learning_rate": 1.0573942189462803e-07, "loss": 1.5365, "step": 5112 }, { "epoch": 0.7711333986878818, "grad_norm": 0.42443576706367386, "learning_rate": 1.0566919689642368e-07, "loss": 1.545, "step": 5113 }, { "epoch": 0.7712842168765554, "grad_norm": 0.26429294303783457, "learning_rate": 1.0559900991791606e-07, "loss": 1.4837, "step": 5114 }, { "epoch": 0.7714350350652288, "grad_norm": 0.2831171957305455, "learning_rate": 1.0552886097486416e-07, "loss": 1.5854, "step": 5115 }, { "epoch": 0.7715858532539024, "grad_norm": 0.25376435672152076, "learning_rate": 1.0545875008301861e-07, "loss": 1.5179, "step": 5116 }, { "epoch": 0.771736671442576, "grad_norm": 0.24585108407865713, "learning_rate": 1.0538867725812123e-07, "loss": 1.5157, "step": 5117 }, { "epoch": 0.7718874896312495, "grad_norm": 0.2667675570241291, "learning_rate": 1.0531864251590542e-07, "loss": 1.5122, "step": 5118 }, { "epoch": 0.7720383078199231, "grad_norm": 0.25634356723734913, "learning_rate": 1.0524864587209598e-07, "loss": 1.6761, "step": 5119 }, { "epoch": 0.7721891260085967, "grad_norm": 0.40152559169486846, "learning_rate": 1.051786873424093e-07, "loss": 1.6062, "step": 5120 }, { "epoch": 0.7723399441972701, "grad_norm": 0.5199439778760333, "learning_rate": 1.0510876694255308e-07, "loss": 1.5619, "step": 5121 }, { "epoch": 0.7724907623859437, "grad_norm": 0.2580699708718366, "learning_rate": 1.0503888468822639e-07, "loss": 1.5517, "step": 5122 }, { "epoch": 0.7726415805746173, "grad_norm": 0.2545315956830306, "learning_rate": 1.0496904059511994e-07, "loss": 1.5236, "step": 5123 }, { "epoch": 0.7727923987632909, "grad_norm": 0.24856695616944577, "learning_rate": 1.0489923467891576e-07, "loss": 1.5601, "step": 5124 }, { "epoch": 0.7729432169519644, "grad_norm": 0.25293373694071336, "learning_rate": 1.0482946695528719e-07, "loss": 1.5245, "step": 5125 }, { "epoch": 0.773094035140638, "grad_norm": 0.24443089481446287, "learning_rate": 1.0475973743989927e-07, "loss": 1.5907, "step": 5126 }, { "epoch": 0.7732448533293115, "grad_norm": 0.272129843089619, "learning_rate": 1.0469004614840823e-07, "loss": 1.5011, "step": 5127 }, { "epoch": 0.773395671517985, "grad_norm": 0.2619860150616335, "learning_rate": 1.0462039309646177e-07, "loss": 1.6184, "step": 5128 }, { "epoch": 0.7735464897066586, "grad_norm": 0.24823666256911225, "learning_rate": 1.0455077829969911e-07, "loss": 1.5534, "step": 5129 }, { "epoch": 0.7736973078953322, "grad_norm": 0.27543584201099786, "learning_rate": 1.0448120177375075e-07, "loss": 1.5553, "step": 5130 }, { "epoch": 0.7738481260840058, "grad_norm": 0.2454650406977061, "learning_rate": 1.0441166353423864e-07, "loss": 1.5141, "step": 5131 }, { "epoch": 0.7739989442726792, "grad_norm": 0.24311302458080847, "learning_rate": 1.0434216359677609e-07, "loss": 1.5771, "step": 5132 }, { "epoch": 0.7741497624613528, "grad_norm": 0.2510070298633273, "learning_rate": 1.0427270197696798e-07, "loss": 1.5942, "step": 5133 }, { "epoch": 0.7743005806500264, "grad_norm": 0.24700988295710286, "learning_rate": 1.0420327869041043e-07, "loss": 1.5162, "step": 5134 }, { "epoch": 0.7744513988387, "grad_norm": 0.9863476661457141, "learning_rate": 1.0413389375269089e-07, "loss": 1.5606, "step": 5135 }, { "epoch": 0.7746022170273735, "grad_norm": 0.2501003593389527, "learning_rate": 1.0406454717938842e-07, "loss": 1.5663, "step": 5136 }, { "epoch": 0.7747530352160471, "grad_norm": 0.2686564153867668, "learning_rate": 1.0399523898607329e-07, "loss": 1.5506, "step": 5137 }, { "epoch": 0.7749038534047206, "grad_norm": 0.2520650053807712, "learning_rate": 1.0392596918830716e-07, "loss": 1.5425, "step": 5138 }, { "epoch": 0.7750546715933941, "grad_norm": 0.25532459334127455, "learning_rate": 1.0385673780164326e-07, "loss": 1.5156, "step": 5139 }, { "epoch": 0.7752054897820677, "grad_norm": 0.25637021843688246, "learning_rate": 1.0378754484162597e-07, "loss": 1.5837, "step": 5140 }, { "epoch": 0.7753563079707413, "grad_norm": 0.2615259072232839, "learning_rate": 1.0371839032379104e-07, "loss": 1.5349, "step": 5141 }, { "epoch": 0.7755071261594149, "grad_norm": 0.24496714282901963, "learning_rate": 1.036492742636658e-07, "loss": 1.552, "step": 5142 }, { "epoch": 0.7756579443480883, "grad_norm": 0.2613104991310393, "learning_rate": 1.0358019667676879e-07, "loss": 1.5723, "step": 5143 }, { "epoch": 0.7758087625367619, "grad_norm": 0.2639473884716577, "learning_rate": 1.0351115757860992e-07, "loss": 1.4878, "step": 5144 }, { "epoch": 0.7759595807254355, "grad_norm": 0.2808735969070876, "learning_rate": 1.0344215698469044e-07, "loss": 1.6, "step": 5145 }, { "epoch": 0.776110398914109, "grad_norm": 0.2491555953757118, "learning_rate": 1.0337319491050307e-07, "loss": 1.5844, "step": 5146 }, { "epoch": 0.7762612171027826, "grad_norm": 0.25991931785708977, "learning_rate": 1.0330427137153175e-07, "loss": 1.5831, "step": 5147 }, { "epoch": 0.7764120352914562, "grad_norm": 0.2742991791455993, "learning_rate": 1.0323538638325183e-07, "loss": 1.5945, "step": 5148 }, { "epoch": 0.7765628534801297, "grad_norm": 0.25883039506044575, "learning_rate": 1.0316653996113003e-07, "loss": 1.5559, "step": 5149 }, { "epoch": 0.7767136716688032, "grad_norm": 0.2655681031376377, "learning_rate": 1.0309773212062436e-07, "loss": 1.5658, "step": 5150 }, { "epoch": 0.7768644898574768, "grad_norm": 0.300007684282825, "learning_rate": 1.030289628771841e-07, "loss": 1.5492, "step": 5151 }, { "epoch": 0.7770153080461504, "grad_norm": 0.25078131996972297, "learning_rate": 1.0296023224625012e-07, "loss": 1.561, "step": 5152 }, { "epoch": 0.777166126234824, "grad_norm": 0.2518685712142501, "learning_rate": 1.0289154024325433e-07, "loss": 1.5263, "step": 5153 }, { "epoch": 0.7773169444234975, "grad_norm": 0.2369554858026924, "learning_rate": 1.0282288688362012e-07, "loss": 1.5365, "step": 5154 }, { "epoch": 0.777467762612171, "grad_norm": 0.2523849726536303, "learning_rate": 1.0275427218276209e-07, "loss": 1.6622, "step": 5155 }, { "epoch": 0.7776185808008446, "grad_norm": 0.3445991104335428, "learning_rate": 1.0268569615608635e-07, "loss": 1.6526, "step": 5156 }, { "epoch": 0.7777693989895181, "grad_norm": 0.32051184261339966, "learning_rate": 1.0261715881899019e-07, "loss": 1.5817, "step": 5157 }, { "epoch": 0.7779202171781917, "grad_norm": 0.4438827106717608, "learning_rate": 1.0254866018686214e-07, "loss": 1.5408, "step": 5158 }, { "epoch": 0.7780710353668653, "grad_norm": 0.2504474562113185, "learning_rate": 1.0248020027508231e-07, "loss": 1.5919, "step": 5159 }, { "epoch": 0.7782218535555387, "grad_norm": 0.2487038606727896, "learning_rate": 1.024117790990218e-07, "loss": 1.6189, "step": 5160 }, { "epoch": 0.7783726717442123, "grad_norm": 0.24337959048412816, "learning_rate": 1.0234339667404318e-07, "loss": 1.6063, "step": 5161 }, { "epoch": 0.7785234899328859, "grad_norm": 0.2789888112152276, "learning_rate": 1.0227505301550037e-07, "loss": 1.4946, "step": 5162 }, { "epoch": 0.7786743081215595, "grad_norm": 0.3383774441602223, "learning_rate": 1.0220674813873847e-07, "loss": 1.5146, "step": 5163 }, { "epoch": 0.778825126310233, "grad_norm": 0.41575331098938056, "learning_rate": 1.0213848205909386e-07, "loss": 1.5281, "step": 5164 }, { "epoch": 0.7789759444989066, "grad_norm": 0.2470574744341523, "learning_rate": 1.0207025479189435e-07, "loss": 1.5721, "step": 5165 }, { "epoch": 0.7791267626875801, "grad_norm": 0.24746981011226615, "learning_rate": 1.0200206635245892e-07, "loss": 1.5174, "step": 5166 }, { "epoch": 0.7792775808762537, "grad_norm": 0.254621670888393, "learning_rate": 1.0193391675609786e-07, "loss": 1.6554, "step": 5167 }, { "epoch": 0.7794283990649272, "grad_norm": 0.2517442422822756, "learning_rate": 1.0186580601811265e-07, "loss": 1.5588, "step": 5168 }, { "epoch": 0.7795792172536008, "grad_norm": 0.2524935229273349, "learning_rate": 1.017977341537963e-07, "loss": 1.5498, "step": 5169 }, { "epoch": 0.7797300354422744, "grad_norm": 0.2637390863959784, "learning_rate": 1.0172970117843282e-07, "loss": 1.5733, "step": 5170 }, { "epoch": 0.779880853630948, "grad_norm": 0.28437078199905974, "learning_rate": 1.0166170710729755e-07, "loss": 1.5658, "step": 5171 }, { "epoch": 0.7800316718196214, "grad_norm": 0.2673622905711399, "learning_rate": 1.0159375195565729e-07, "loss": 1.5313, "step": 5172 }, { "epoch": 0.780182490008295, "grad_norm": 0.24615532045152091, "learning_rate": 1.0152583573876983e-07, "loss": 1.5598, "step": 5173 }, { "epoch": 0.7803333081969686, "grad_norm": 0.26648977590140616, "learning_rate": 1.0145795847188433e-07, "loss": 1.4859, "step": 5174 }, { "epoch": 0.7804841263856421, "grad_norm": 0.25841066994939277, "learning_rate": 1.0139012017024135e-07, "loss": 1.5512, "step": 5175 }, { "epoch": 0.7806349445743157, "grad_norm": 0.24448615213183508, "learning_rate": 1.0132232084907243e-07, "loss": 1.5373, "step": 5176 }, { "epoch": 0.7807857627629892, "grad_norm": 0.24528000389884158, "learning_rate": 1.0125456052360045e-07, "loss": 1.5653, "step": 5177 }, { "epoch": 0.7809365809516627, "grad_norm": 0.2474044667383555, "learning_rate": 1.0118683920903975e-07, "loss": 1.5745, "step": 5178 }, { "epoch": 0.7810873991403363, "grad_norm": 0.2761230065419814, "learning_rate": 1.0111915692059566e-07, "loss": 1.6278, "step": 5179 }, { "epoch": 0.7812382173290099, "grad_norm": 0.43992774047553906, "learning_rate": 1.010515136734648e-07, "loss": 1.5916, "step": 5180 }, { "epoch": 0.7813890355176835, "grad_norm": 0.24466744172378638, "learning_rate": 1.0098390948283499e-07, "loss": 1.5438, "step": 5181 }, { "epoch": 0.781539853706357, "grad_norm": 0.24309196970415786, "learning_rate": 1.0091634436388549e-07, "loss": 1.5997, "step": 5182 }, { "epoch": 0.7816906718950305, "grad_norm": 0.24513875337909635, "learning_rate": 1.0084881833178652e-07, "loss": 1.5485, "step": 5183 }, { "epoch": 0.7818414900837041, "grad_norm": 0.6544087327244111, "learning_rate": 1.0078133140169965e-07, "loss": 1.5465, "step": 5184 }, { "epoch": 0.7819923082723776, "grad_norm": 0.25270703405805567, "learning_rate": 1.0071388358877775e-07, "loss": 1.5103, "step": 5185 }, { "epoch": 0.7821431264610512, "grad_norm": 0.4938550184353543, "learning_rate": 1.0064647490816472e-07, "loss": 1.5426, "step": 5186 }, { "epoch": 0.7822939446497248, "grad_norm": 0.26403755528150696, "learning_rate": 1.0057910537499578e-07, "loss": 1.611, "step": 5187 }, { "epoch": 0.7824447628383983, "grad_norm": 0.24171431971485452, "learning_rate": 1.0051177500439742e-07, "loss": 1.588, "step": 5188 }, { "epoch": 0.7825955810270718, "grad_norm": 0.23879309098552787, "learning_rate": 1.0044448381148726e-07, "loss": 1.6279, "step": 5189 }, { "epoch": 0.7827463992157454, "grad_norm": 0.24518655059112635, "learning_rate": 1.0037723181137403e-07, "loss": 1.5294, "step": 5190 }, { "epoch": 0.782897217404419, "grad_norm": 0.48801982637795555, "learning_rate": 1.0031001901915789e-07, "loss": 1.5268, "step": 5191 }, { "epoch": 0.7830480355930926, "grad_norm": 0.246061195139339, "learning_rate": 1.0024284544993003e-07, "loss": 1.6452, "step": 5192 }, { "epoch": 0.7831988537817661, "grad_norm": 0.25484960572036536, "learning_rate": 1.0017571111877283e-07, "loss": 1.4964, "step": 5193 }, { "epoch": 0.7833496719704396, "grad_norm": 0.2641759867426765, "learning_rate": 1.0010861604075987e-07, "loss": 1.5249, "step": 5194 }, { "epoch": 0.7835004901591132, "grad_norm": 0.2749696841708133, "learning_rate": 1.0004156023095611e-07, "loss": 1.5734, "step": 5195 }, { "epoch": 0.7836513083477867, "grad_norm": 0.2400846391621469, "learning_rate": 9.99745437044174e-08, "loss": 1.5363, "step": 5196 }, { "epoch": 0.7838021265364603, "grad_norm": 0.2503792022857372, "learning_rate": 9.99075664761909e-08, "loss": 1.5607, "step": 5197 }, { "epoch": 0.7839529447251339, "grad_norm": 0.25126265300389844, "learning_rate": 9.984062856131502e-08, "loss": 1.5441, "step": 5198 }, { "epoch": 0.7841037629138075, "grad_norm": 0.2677246673958432, "learning_rate": 9.977372997481922e-08, "loss": 1.5171, "step": 5199 }, { "epoch": 0.7842545811024809, "grad_norm": 0.27186577521057703, "learning_rate": 9.970687073172415e-08, "loss": 1.5189, "step": 5200 }, { "epoch": 0.7844053992911545, "grad_norm": 0.24286649058717943, "learning_rate": 9.964005084704177e-08, "loss": 1.5378, "step": 5201 }, { "epoch": 0.7845562174798281, "grad_norm": 0.25296634340355056, "learning_rate": 9.957327033577498e-08, "loss": 1.5879, "step": 5202 }, { "epoch": 0.7847070356685016, "grad_norm": 0.24729615374562958, "learning_rate": 9.950652921291796e-08, "loss": 1.5808, "step": 5203 }, { "epoch": 0.7848578538571752, "grad_norm": 0.2759538028414474, "learning_rate": 9.943982749345615e-08, "loss": 1.5551, "step": 5204 }, { "epoch": 0.7850086720458487, "grad_norm": 0.24625468461101757, "learning_rate": 9.937316519236592e-08, "loss": 1.5504, "step": 5205 }, { "epoch": 0.7851594902345223, "grad_norm": 0.2551362944408527, "learning_rate": 9.93065423246149e-08, "loss": 1.6395, "step": 5206 }, { "epoch": 0.7853103084231958, "grad_norm": 0.24286043771088153, "learning_rate": 9.923995890516184e-08, "loss": 1.545, "step": 5207 }, { "epoch": 0.7854611266118694, "grad_norm": 0.24377166566239084, "learning_rate": 9.917341494895678e-08, "loss": 1.6084, "step": 5208 }, { "epoch": 0.785611944800543, "grad_norm": 0.24603192824167494, "learning_rate": 9.910691047094072e-08, "loss": 1.564, "step": 5209 }, { "epoch": 0.7857627629892165, "grad_norm": 0.2426345984172691, "learning_rate": 9.904044548604573e-08, "loss": 1.4926, "step": 5210 }, { "epoch": 0.78591358117789, "grad_norm": 0.25474503066062437, "learning_rate": 9.897402000919533e-08, "loss": 1.5594, "step": 5211 }, { "epoch": 0.7860643993665636, "grad_norm": 0.24394729119339043, "learning_rate": 9.890763405530386e-08, "loss": 1.5512, "step": 5212 }, { "epoch": 0.7862152175552372, "grad_norm": 0.2517485169788096, "learning_rate": 9.884128763927688e-08, "loss": 1.5152, "step": 5213 }, { "epoch": 0.7863660357439107, "grad_norm": 0.24075771254911466, "learning_rate": 9.877498077601117e-08, "loss": 1.5828, "step": 5214 }, { "epoch": 0.7865168539325843, "grad_norm": 0.2576568077914992, "learning_rate": 9.870871348039452e-08, "loss": 1.5435, "step": 5215 }, { "epoch": 0.7866676721212579, "grad_norm": 0.2488922033748577, "learning_rate": 9.864248576730583e-08, "loss": 1.5554, "step": 5216 }, { "epoch": 0.7868184903099313, "grad_norm": 0.24442321649992887, "learning_rate": 9.857629765161521e-08, "loss": 1.5341, "step": 5217 }, { "epoch": 0.7869693084986049, "grad_norm": 0.268373426064988, "learning_rate": 9.851014914818382e-08, "loss": 1.5983, "step": 5218 }, { "epoch": 0.7871201266872785, "grad_norm": 0.27454531743987626, "learning_rate": 9.844404027186387e-08, "loss": 1.616, "step": 5219 }, { "epoch": 0.7872709448759521, "grad_norm": 0.24735204104979536, "learning_rate": 9.837797103749871e-08, "loss": 1.6034, "step": 5220 }, { "epoch": 0.7874217630646256, "grad_norm": 0.24328973411274415, "learning_rate": 9.831194145992291e-08, "loss": 1.5682, "step": 5221 }, { "epoch": 0.7875725812532991, "grad_norm": 0.2849190394836823, "learning_rate": 9.824595155396198e-08, "loss": 1.6179, "step": 5222 }, { "epoch": 0.7877233994419727, "grad_norm": 0.2868891873378707, "learning_rate": 9.818000133443252e-08, "loss": 1.514, "step": 5223 }, { "epoch": 0.7878742176306462, "grad_norm": 0.29915894733704507, "learning_rate": 9.811409081614238e-08, "loss": 1.5292, "step": 5224 }, { "epoch": 0.7880250358193198, "grad_norm": 0.25681452067867805, "learning_rate": 9.804822001389034e-08, "loss": 1.5641, "step": 5225 }, { "epoch": 0.7881758540079934, "grad_norm": 0.5143721419834408, "learning_rate": 9.798238894246627e-08, "loss": 1.5118, "step": 5226 }, { "epoch": 0.788326672196667, "grad_norm": 0.24450340290190584, "learning_rate": 9.791659761665125e-08, "loss": 1.5542, "step": 5227 }, { "epoch": 0.7884774903853404, "grad_norm": 0.25923111975771784, "learning_rate": 9.785084605121731e-08, "loss": 1.553, "step": 5228 }, { "epoch": 0.788628308574014, "grad_norm": 0.576592704674624, "learning_rate": 9.778513426092749e-08, "loss": 1.5407, "step": 5229 }, { "epoch": 0.7887791267626876, "grad_norm": 0.25600492943228165, "learning_rate": 9.77194622605362e-08, "loss": 1.5689, "step": 5230 }, { "epoch": 0.7889299449513612, "grad_norm": 0.24986631977565338, "learning_rate": 9.765383006478861e-08, "loss": 1.5635, "step": 5231 }, { "epoch": 0.7890807631400347, "grad_norm": 0.2553981532136114, "learning_rate": 9.758823768842106e-08, "loss": 1.5453, "step": 5232 }, { "epoch": 0.7892315813287082, "grad_norm": 0.2693895420664751, "learning_rate": 9.752268514616086e-08, "loss": 1.5, "step": 5233 }, { "epoch": 0.7893823995173818, "grad_norm": 0.24512872950012005, "learning_rate": 9.745717245272664e-08, "loss": 1.5462, "step": 5234 }, { "epoch": 0.7895332177060553, "grad_norm": 0.29255385539188583, "learning_rate": 9.739169962282781e-08, "loss": 1.6196, "step": 5235 }, { "epoch": 0.7896840358947289, "grad_norm": 0.24636829891140796, "learning_rate": 9.732626667116489e-08, "loss": 1.5105, "step": 5236 }, { "epoch": 0.7898348540834025, "grad_norm": 0.24649437419168208, "learning_rate": 9.72608736124296e-08, "loss": 1.523, "step": 5237 }, { "epoch": 0.7899856722720761, "grad_norm": 0.24873170942575037, "learning_rate": 9.719552046130452e-08, "loss": 1.6408, "step": 5238 }, { "epoch": 0.7901364904607495, "grad_norm": 0.2548158490497884, "learning_rate": 9.713020723246324e-08, "loss": 1.567, "step": 5239 }, { "epoch": 0.7902873086494231, "grad_norm": 0.42845738937801925, "learning_rate": 9.706493394057067e-08, "loss": 1.6048, "step": 5240 }, { "epoch": 0.7904381268380967, "grad_norm": 0.2844146144024818, "learning_rate": 9.699970060028247e-08, "loss": 1.6515, "step": 5241 }, { "epoch": 0.7905889450267702, "grad_norm": 0.24872106490086418, "learning_rate": 9.693450722624542e-08, "loss": 1.6053, "step": 5242 }, { "epoch": 0.7907397632154438, "grad_norm": 0.32871423293171437, "learning_rate": 9.68693538330973e-08, "loss": 1.5577, "step": 5243 }, { "epoch": 0.7908905814041174, "grad_norm": 0.2728321330895966, "learning_rate": 9.6804240435467e-08, "loss": 1.5897, "step": 5244 }, { "epoch": 0.7910413995927909, "grad_norm": 0.24331676661144996, "learning_rate": 9.673916704797438e-08, "loss": 1.5672, "step": 5245 }, { "epoch": 0.7911922177814644, "grad_norm": 0.26678977606867443, "learning_rate": 9.667413368523023e-08, "loss": 1.596, "step": 5246 }, { "epoch": 0.791343035970138, "grad_norm": 0.31409689319400996, "learning_rate": 9.660914036183653e-08, "loss": 1.5556, "step": 5247 }, { "epoch": 0.7914938541588116, "grad_norm": 0.25646123946223787, "learning_rate": 9.654418709238619e-08, "loss": 1.4908, "step": 5248 }, { "epoch": 0.7916446723474851, "grad_norm": 0.2567826501041337, "learning_rate": 9.647927389146295e-08, "loss": 1.6256, "step": 5249 }, { "epoch": 0.7917954905361586, "grad_norm": 0.34114059546512304, "learning_rate": 9.641440077364188e-08, "loss": 1.51, "step": 5250 }, { "epoch": 0.7919463087248322, "grad_norm": 0.2836721666035514, "learning_rate": 9.634956775348883e-08, "loss": 1.5525, "step": 5251 }, { "epoch": 0.7920971269135058, "grad_norm": 0.23733357097386582, "learning_rate": 9.628477484556065e-08, "loss": 1.484, "step": 5252 }, { "epoch": 0.7922479451021793, "grad_norm": 0.25173384900472523, "learning_rate": 9.622002206440533e-08, "loss": 1.5501, "step": 5253 }, { "epoch": 0.7923987632908529, "grad_norm": 0.3801763344859971, "learning_rate": 9.615530942456168e-08, "loss": 1.517, "step": 5254 }, { "epoch": 0.7925495814795265, "grad_norm": 0.24662474824068642, "learning_rate": 9.609063694055964e-08, "loss": 1.5852, "step": 5255 }, { "epoch": 0.7927003996681999, "grad_norm": 0.26165667966294054, "learning_rate": 9.602600462691991e-08, "loss": 1.5105, "step": 5256 }, { "epoch": 0.7928512178568735, "grad_norm": 0.2962893428088458, "learning_rate": 9.596141249815455e-08, "loss": 1.528, "step": 5257 }, { "epoch": 0.7930020360455471, "grad_norm": 0.2650559965583865, "learning_rate": 9.589686056876624e-08, "loss": 1.5849, "step": 5258 }, { "epoch": 0.7931528542342207, "grad_norm": 0.2716594700912555, "learning_rate": 9.583234885324873e-08, "loss": 1.5872, "step": 5259 }, { "epoch": 0.7933036724228942, "grad_norm": 0.2619807567043884, "learning_rate": 9.576787736608693e-08, "loss": 1.5347, "step": 5260 }, { "epoch": 0.7934544906115678, "grad_norm": 0.2643445440132794, "learning_rate": 9.570344612175643e-08, "loss": 1.5998, "step": 5261 }, { "epoch": 0.7936053088002413, "grad_norm": 0.25171253202602445, "learning_rate": 9.563905513472395e-08, "loss": 1.5692, "step": 5262 }, { "epoch": 0.7937561269889148, "grad_norm": 0.27166046322517085, "learning_rate": 9.55747044194472e-08, "loss": 1.5505, "step": 5263 }, { "epoch": 0.7939069451775884, "grad_norm": 0.2535333741114826, "learning_rate": 9.551039399037475e-08, "loss": 1.5318, "step": 5264 }, { "epoch": 0.794057763366262, "grad_norm": 0.2507836488559641, "learning_rate": 9.544612386194612e-08, "loss": 1.5812, "step": 5265 }, { "epoch": 0.7942085815549356, "grad_norm": 0.2539476403745853, "learning_rate": 9.538189404859192e-08, "loss": 1.4597, "step": 5266 }, { "epoch": 0.794359399743609, "grad_norm": 0.2592222527971778, "learning_rate": 9.531770456473355e-08, "loss": 1.5503, "step": 5267 }, { "epoch": 0.7945102179322826, "grad_norm": 0.2598003965601039, "learning_rate": 9.525355542478348e-08, "loss": 1.5817, "step": 5268 }, { "epoch": 0.7946610361209562, "grad_norm": 0.24709229152520368, "learning_rate": 9.51894466431449e-08, "loss": 1.5441, "step": 5269 }, { "epoch": 0.7948118543096298, "grad_norm": 0.26921306649993715, "learning_rate": 9.51253782342123e-08, "loss": 1.5136, "step": 5270 }, { "epoch": 0.7949626724983033, "grad_norm": 0.26097673845803504, "learning_rate": 9.50613502123708e-08, "loss": 1.5609, "step": 5271 }, { "epoch": 0.7951134906869769, "grad_norm": 0.26364841943998557, "learning_rate": 9.49973625919965e-08, "loss": 1.5982, "step": 5272 }, { "epoch": 0.7952643088756504, "grad_norm": 0.28794281341300015, "learning_rate": 9.493341538745664e-08, "loss": 1.573, "step": 5273 }, { "epoch": 0.7954151270643239, "grad_norm": 0.33487655215897294, "learning_rate": 9.48695086131091e-08, "loss": 1.5709, "step": 5274 }, { "epoch": 0.7955659452529975, "grad_norm": 0.24130890877284242, "learning_rate": 9.480564228330283e-08, "loss": 1.5723, "step": 5275 }, { "epoch": 0.7957167634416711, "grad_norm": 0.2625366907215809, "learning_rate": 9.474181641237778e-08, "loss": 1.5325, "step": 5276 }, { "epoch": 0.7958675816303447, "grad_norm": 0.26232962210393057, "learning_rate": 9.467803101466462e-08, "loss": 1.5742, "step": 5277 }, { "epoch": 0.7960183998190181, "grad_norm": 0.23504685894343988, "learning_rate": 9.461428610448502e-08, "loss": 1.5213, "step": 5278 }, { "epoch": 0.7961692180076917, "grad_norm": 0.26532676636442065, "learning_rate": 9.455058169615163e-08, "loss": 1.5344, "step": 5279 }, { "epoch": 0.7963200361963653, "grad_norm": 0.24510599024024202, "learning_rate": 9.448691780396792e-08, "loss": 1.5386, "step": 5280 }, { "epoch": 0.7964708543850388, "grad_norm": 1.031700399816234, "learning_rate": 9.44232944422283e-08, "loss": 1.6055, "step": 5281 }, { "epoch": 0.7966216725737124, "grad_norm": 0.26191947447633585, "learning_rate": 9.4359711625218e-08, "loss": 1.5396, "step": 5282 }, { "epoch": 0.796772490762386, "grad_norm": 0.25671463940734085, "learning_rate": 9.42961693672133e-08, "loss": 1.5745, "step": 5283 }, { "epoch": 0.7969233089510595, "grad_norm": 0.24064938910739106, "learning_rate": 9.42326676824812e-08, "loss": 1.5457, "step": 5284 }, { "epoch": 0.797074127139733, "grad_norm": 0.3121141778522017, "learning_rate": 9.416920658527983e-08, "loss": 1.5594, "step": 5285 }, { "epoch": 0.7972249453284066, "grad_norm": 0.24925526232670184, "learning_rate": 9.410578608985786e-08, "loss": 1.5715, "step": 5286 }, { "epoch": 0.7973757635170802, "grad_norm": 0.2635196049687862, "learning_rate": 9.40424062104552e-08, "loss": 1.5411, "step": 5287 }, { "epoch": 0.7975265817057537, "grad_norm": 0.27385968980861314, "learning_rate": 9.397906696130243e-08, "loss": 1.5748, "step": 5288 }, { "epoch": 0.7976773998944273, "grad_norm": 0.24713953186058749, "learning_rate": 9.391576835662096e-08, "loss": 1.5194, "step": 5289 }, { "epoch": 0.7978282180831008, "grad_norm": 0.30874219514176704, "learning_rate": 9.38525104106233e-08, "loss": 1.5026, "step": 5290 }, { "epoch": 0.7979790362717744, "grad_norm": 0.25061358837709624, "learning_rate": 9.378929313751266e-08, "loss": 1.5311, "step": 5291 }, { "epoch": 0.7981298544604479, "grad_norm": 0.24458375046614347, "learning_rate": 9.37261165514831e-08, "loss": 1.5807, "step": 5292 }, { "epoch": 0.7982806726491215, "grad_norm": 0.23696780363506526, "learning_rate": 9.366298066671973e-08, "loss": 1.6163, "step": 5293 }, { "epoch": 0.7984314908377951, "grad_norm": 0.26478478952353585, "learning_rate": 9.359988549739828e-08, "loss": 1.5109, "step": 5294 }, { "epoch": 0.7985823090264685, "grad_norm": 0.2501202826056413, "learning_rate": 9.353683105768544e-08, "loss": 1.5546, "step": 5295 }, { "epoch": 0.7987331272151421, "grad_norm": 0.2445865178732147, "learning_rate": 9.34738173617389e-08, "loss": 1.5804, "step": 5296 }, { "epoch": 0.7988839454038157, "grad_norm": 0.2883543741249625, "learning_rate": 9.341084442370697e-08, "loss": 1.5302, "step": 5297 }, { "epoch": 0.7990347635924893, "grad_norm": 0.25099696195827753, "learning_rate": 9.334791225772893e-08, "loss": 1.556, "step": 5298 }, { "epoch": 0.7991855817811628, "grad_norm": 0.23455636692257625, "learning_rate": 9.328502087793482e-08, "loss": 1.5932, "step": 5299 }, { "epoch": 0.7993363999698364, "grad_norm": 0.2728960714104501, "learning_rate": 9.322217029844573e-08, "loss": 1.577, "step": 5300 }, { "epoch": 0.7994872181585099, "grad_norm": 0.27709587109866557, "learning_rate": 9.315936053337333e-08, "loss": 1.5744, "step": 5301 }, { "epoch": 0.7996380363471834, "grad_norm": 0.25922070011737874, "learning_rate": 9.309659159682024e-08, "loss": 1.5334, "step": 5302 }, { "epoch": 0.799788854535857, "grad_norm": 0.2508166462275185, "learning_rate": 9.303386350288e-08, "loss": 1.5497, "step": 5303 }, { "epoch": 0.7999396727245306, "grad_norm": 0.25757627776221476, "learning_rate": 9.297117626563686e-08, "loss": 1.56, "step": 5304 }, { "epoch": 0.8000904909132042, "grad_norm": 0.25218239326517317, "learning_rate": 9.290852989916581e-08, "loss": 1.5865, "step": 5305 }, { "epoch": 0.8002413091018777, "grad_norm": 0.25401844710881205, "learning_rate": 9.284592441753298e-08, "loss": 1.5299, "step": 5306 }, { "epoch": 0.8003921272905512, "grad_norm": 0.2811756303548548, "learning_rate": 9.278335983479504e-08, "loss": 1.5726, "step": 5307 }, { "epoch": 0.8005429454792248, "grad_norm": 0.24685673296726654, "learning_rate": 9.272083616499948e-08, "loss": 1.5447, "step": 5308 }, { "epoch": 0.8006937636678984, "grad_norm": 0.2541167362442535, "learning_rate": 9.265835342218481e-08, "loss": 1.6192, "step": 5309 }, { "epoch": 0.8008445818565719, "grad_norm": 0.2541763550342351, "learning_rate": 9.259591162038019e-08, "loss": 1.5799, "step": 5310 }, { "epoch": 0.8009954000452455, "grad_norm": 0.24062327291007268, "learning_rate": 9.253351077360558e-08, "loss": 1.5336, "step": 5311 }, { "epoch": 0.801146218233919, "grad_norm": 0.24377484811412073, "learning_rate": 9.247115089587176e-08, "loss": 1.6561, "step": 5312 }, { "epoch": 0.8012970364225925, "grad_norm": 0.2619299920309576, "learning_rate": 9.240883200118046e-08, "loss": 1.5988, "step": 5313 }, { "epoch": 0.8014478546112661, "grad_norm": 0.25311637076284305, "learning_rate": 9.234655410352403e-08, "loss": 1.5791, "step": 5314 }, { "epoch": 0.8015986727999397, "grad_norm": 0.24374721927243836, "learning_rate": 9.228431721688557e-08, "loss": 1.5806, "step": 5315 }, { "epoch": 0.8017494909886133, "grad_norm": 0.2423027621176886, "learning_rate": 9.222212135523924e-08, "loss": 1.5073, "step": 5316 }, { "epoch": 0.8019003091772868, "grad_norm": 0.24484754116073318, "learning_rate": 9.215996653254977e-08, "loss": 1.5388, "step": 5317 }, { "epoch": 0.8020511273659603, "grad_norm": 0.2615928740002736, "learning_rate": 9.20978527627726e-08, "loss": 1.5687, "step": 5318 }, { "epoch": 0.8022019455546339, "grad_norm": 0.2516112458899224, "learning_rate": 9.203578005985427e-08, "loss": 1.5286, "step": 5319 }, { "epoch": 0.8023527637433074, "grad_norm": 0.25795245742735634, "learning_rate": 9.197374843773182e-08, "loss": 1.5351, "step": 5320 }, { "epoch": 0.802503581931981, "grad_norm": 0.2481199157870698, "learning_rate": 9.191175791033308e-08, "loss": 1.5578, "step": 5321 }, { "epoch": 0.8026544001206546, "grad_norm": 0.26970641849821136, "learning_rate": 9.184980849157689e-08, "loss": 1.6133, "step": 5322 }, { "epoch": 0.802805218309328, "grad_norm": 0.40884427230147374, "learning_rate": 9.17879001953726e-08, "loss": 1.5045, "step": 5323 }, { "epoch": 0.8029560364980016, "grad_norm": 0.26584043367342286, "learning_rate": 9.172603303562046e-08, "loss": 1.596, "step": 5324 }, { "epoch": 0.8031068546866752, "grad_norm": 0.24749377355422092, "learning_rate": 9.166420702621136e-08, "loss": 1.6246, "step": 5325 }, { "epoch": 0.8032576728753488, "grad_norm": 0.26131795595872453, "learning_rate": 9.160242218102718e-08, "loss": 1.5267, "step": 5326 }, { "epoch": 0.8034084910640223, "grad_norm": 0.24781848229229342, "learning_rate": 9.154067851394035e-08, "loss": 1.5093, "step": 5327 }, { "epoch": 0.8035593092526959, "grad_norm": 0.25944684942346824, "learning_rate": 9.147897603881405e-08, "loss": 1.566, "step": 5328 }, { "epoch": 0.8037101274413694, "grad_norm": 0.2621925392436018, "learning_rate": 9.14173147695024e-08, "loss": 1.6605, "step": 5329 }, { "epoch": 0.803860945630043, "grad_norm": 0.301692620618676, "learning_rate": 9.135569471985013e-08, "loss": 1.593, "step": 5330 }, { "epoch": 0.8040117638187165, "grad_norm": 0.25014830888623185, "learning_rate": 9.129411590369263e-08, "loss": 1.5825, "step": 5331 }, { "epoch": 0.8041625820073901, "grad_norm": 0.25920278714820694, "learning_rate": 9.123257833485629e-08, "loss": 1.5638, "step": 5332 }, { "epoch": 0.8043134001960637, "grad_norm": 0.25065550082065197, "learning_rate": 9.117108202715803e-08, "loss": 1.5359, "step": 5333 }, { "epoch": 0.8044642183847373, "grad_norm": 0.24765080141677795, "learning_rate": 9.110962699440544e-08, "loss": 1.4746, "step": 5334 }, { "epoch": 0.8046150365734107, "grad_norm": 0.38934375374357866, "learning_rate": 9.104821325039715e-08, "loss": 1.5905, "step": 5335 }, { "epoch": 0.8047658547620843, "grad_norm": 0.24887350237507042, "learning_rate": 9.098684080892226e-08, "loss": 1.5848, "step": 5336 }, { "epoch": 0.8049166729507579, "grad_norm": 0.250743742241653, "learning_rate": 9.092550968376067e-08, "loss": 1.5651, "step": 5337 }, { "epoch": 0.8050674911394314, "grad_norm": 0.24632833456968029, "learning_rate": 9.08642198886829e-08, "loss": 1.5344, "step": 5338 }, { "epoch": 0.805218309328105, "grad_norm": 0.25992383642761396, "learning_rate": 9.080297143745049e-08, "loss": 1.5933, "step": 5339 }, { "epoch": 0.8053691275167785, "grad_norm": 0.259916730791826, "learning_rate": 9.07417643438154e-08, "loss": 1.5681, "step": 5340 }, { "epoch": 0.805519945705452, "grad_norm": 0.23859363293643113, "learning_rate": 9.068059862152031e-08, "loss": 1.5128, "step": 5341 }, { "epoch": 0.8056707638941256, "grad_norm": 0.24195583611150917, "learning_rate": 9.061947428429889e-08, "loss": 1.5135, "step": 5342 }, { "epoch": 0.8058215820827992, "grad_norm": 0.2431272382246692, "learning_rate": 9.055839134587527e-08, "loss": 1.6044, "step": 5343 }, { "epoch": 0.8059724002714728, "grad_norm": 0.25546031392204177, "learning_rate": 9.049734981996424e-08, "loss": 1.5352, "step": 5344 }, { "epoch": 0.8061232184601463, "grad_norm": 0.2712017381244022, "learning_rate": 9.043634972027155e-08, "loss": 1.5402, "step": 5345 }, { "epoch": 0.8062740366488198, "grad_norm": 0.274957572626681, "learning_rate": 9.037539106049345e-08, "loss": 1.5007, "step": 5346 }, { "epoch": 0.8064248548374934, "grad_norm": 0.25596132221287465, "learning_rate": 9.031447385431684e-08, "loss": 1.5955, "step": 5347 }, { "epoch": 0.806575673026167, "grad_norm": 0.24419676172022808, "learning_rate": 9.025359811541955e-08, "loss": 1.5236, "step": 5348 }, { "epoch": 0.8067264912148405, "grad_norm": 0.24905555635542445, "learning_rate": 9.01927638574699e-08, "loss": 1.5338, "step": 5349 }, { "epoch": 0.8068773094035141, "grad_norm": 0.2656314969267587, "learning_rate": 9.013197109412696e-08, "loss": 1.5723, "step": 5350 }, { "epoch": 0.8070281275921877, "grad_norm": 0.2623964491270413, "learning_rate": 9.007121983904037e-08, "loss": 1.5422, "step": 5351 }, { "epoch": 0.8071789457808611, "grad_norm": 0.25206837273975874, "learning_rate": 9.00105101058507e-08, "loss": 1.5458, "step": 5352 }, { "epoch": 0.8073297639695347, "grad_norm": 0.2492787955366975, "learning_rate": 8.994984190818903e-08, "loss": 1.5401, "step": 5353 }, { "epoch": 0.8074805821582083, "grad_norm": 0.28513932967276034, "learning_rate": 8.988921525967705e-08, "loss": 1.5206, "step": 5354 }, { "epoch": 0.8076314003468819, "grad_norm": 0.2584154267957481, "learning_rate": 8.982863017392732e-08, "loss": 1.5066, "step": 5355 }, { "epoch": 0.8077822185355554, "grad_norm": 0.254232521760263, "learning_rate": 8.976808666454291e-08, "loss": 1.6354, "step": 5356 }, { "epoch": 0.8079330367242289, "grad_norm": 0.2475693806936419, "learning_rate": 8.970758474511752e-08, "loss": 1.571, "step": 5357 }, { "epoch": 0.8080838549129025, "grad_norm": 0.25953367558525964, "learning_rate": 8.96471244292357e-08, "loss": 1.5508, "step": 5358 }, { "epoch": 0.808234673101576, "grad_norm": 0.5908046759790779, "learning_rate": 8.958670573047255e-08, "loss": 1.5292, "step": 5359 }, { "epoch": 0.8083854912902496, "grad_norm": 0.2551127997023796, "learning_rate": 8.952632866239374e-08, "loss": 1.5858, "step": 5360 }, { "epoch": 0.8085363094789232, "grad_norm": 0.252857184965345, "learning_rate": 8.946599323855578e-08, "loss": 1.6471, "step": 5361 }, { "epoch": 0.8086871276675968, "grad_norm": 0.2781140975758444, "learning_rate": 8.94056994725057e-08, "loss": 1.5221, "step": 5362 }, { "epoch": 0.8088379458562702, "grad_norm": 0.2465808512536548, "learning_rate": 8.934544737778118e-08, "loss": 1.6323, "step": 5363 }, { "epoch": 0.8089887640449438, "grad_norm": 0.2573413992597441, "learning_rate": 8.928523696791055e-08, "loss": 1.5326, "step": 5364 }, { "epoch": 0.8091395822336174, "grad_norm": 0.2462046847547708, "learning_rate": 8.922506825641285e-08, "loss": 1.6586, "step": 5365 }, { "epoch": 0.809290400422291, "grad_norm": 0.25037190609185495, "learning_rate": 8.916494125679772e-08, "loss": 1.5154, "step": 5366 }, { "epoch": 0.8094412186109645, "grad_norm": 0.25180805274015566, "learning_rate": 8.910485598256532e-08, "loss": 1.5549, "step": 5367 }, { "epoch": 0.809592036799638, "grad_norm": 0.25981895801806054, "learning_rate": 8.904481244720668e-08, "loss": 1.5517, "step": 5368 }, { "epoch": 0.8097428549883116, "grad_norm": 0.2631646512048799, "learning_rate": 8.898481066420324e-08, "loss": 1.6065, "step": 5369 }, { "epoch": 0.8098936731769851, "grad_norm": 0.24932562170508332, "learning_rate": 8.892485064702712e-08, "loss": 1.5323, "step": 5370 }, { "epoch": 0.8100444913656587, "grad_norm": 0.24436695030970323, "learning_rate": 8.88649324091412e-08, "loss": 1.5663, "step": 5371 }, { "epoch": 0.8101953095543323, "grad_norm": 0.3269133308214128, "learning_rate": 8.880505596399882e-08, "loss": 1.508, "step": 5372 }, { "epoch": 0.8103461277430059, "grad_norm": 0.24294520604001518, "learning_rate": 8.874522132504394e-08, "loss": 1.5495, "step": 5373 }, { "epoch": 0.8104969459316793, "grad_norm": 0.24278171864311007, "learning_rate": 8.868542850571121e-08, "loss": 1.5866, "step": 5374 }, { "epoch": 0.8106477641203529, "grad_norm": 0.4172245566715417, "learning_rate": 8.862567751942585e-08, "loss": 1.6476, "step": 5375 }, { "epoch": 0.8107985823090265, "grad_norm": 0.24977377337571738, "learning_rate": 8.856596837960376e-08, "loss": 1.633, "step": 5376 }, { "epoch": 0.8109494004977, "grad_norm": 0.29329456679490906, "learning_rate": 8.850630109965125e-08, "loss": 1.4838, "step": 5377 }, { "epoch": 0.8111002186863736, "grad_norm": 0.25501453774994753, "learning_rate": 8.844667569296549e-08, "loss": 1.6073, "step": 5378 }, { "epoch": 0.8112510368750472, "grad_norm": 0.24424479931122867, "learning_rate": 8.838709217293407e-08, "loss": 1.4627, "step": 5379 }, { "epoch": 0.8114018550637206, "grad_norm": 0.260366118063312, "learning_rate": 8.832755055293517e-08, "loss": 1.5528, "step": 5380 }, { "epoch": 0.8115526732523942, "grad_norm": 0.4415329037429282, "learning_rate": 8.826805084633773e-08, "loss": 1.5864, "step": 5381 }, { "epoch": 0.8117034914410678, "grad_norm": 0.2507284828558614, "learning_rate": 8.820859306650114e-08, "loss": 1.5795, "step": 5382 }, { "epoch": 0.8118543096297414, "grad_norm": 0.26354637795152236, "learning_rate": 8.814917722677528e-08, "loss": 1.5162, "step": 5383 }, { "epoch": 0.8120051278184149, "grad_norm": 0.25882985694211996, "learning_rate": 8.80898033405009e-08, "loss": 1.501, "step": 5384 }, { "epoch": 0.8121559460070884, "grad_norm": 0.24968693494284327, "learning_rate": 8.80304714210091e-08, "loss": 1.5276, "step": 5385 }, { "epoch": 0.812306764195762, "grad_norm": 0.255317334408465, "learning_rate": 8.79711814816216e-08, "loss": 1.5747, "step": 5386 }, { "epoch": 0.8124575823844356, "grad_norm": 0.24498829617791465, "learning_rate": 8.791193353565071e-08, "loss": 1.5387, "step": 5387 }, { "epoch": 0.8126084005731091, "grad_norm": 0.2503163183131022, "learning_rate": 8.785272759639941e-08, "loss": 1.481, "step": 5388 }, { "epoch": 0.8127592187617827, "grad_norm": 0.25392590128550246, "learning_rate": 8.779356367716106e-08, "loss": 1.576, "step": 5389 }, { "epoch": 0.8129100369504563, "grad_norm": 0.2693098078403543, "learning_rate": 8.773444179121969e-08, "loss": 1.5928, "step": 5390 }, { "epoch": 0.8130608551391297, "grad_norm": 0.2658132062565851, "learning_rate": 8.767536195184996e-08, "loss": 1.6161, "step": 5391 }, { "epoch": 0.8132116733278033, "grad_norm": 0.25606448372362856, "learning_rate": 8.761632417231696e-08, "loss": 1.5879, "step": 5392 }, { "epoch": 0.8133624915164769, "grad_norm": 0.2674645289633079, "learning_rate": 8.755732846587636e-08, "loss": 1.5681, "step": 5393 }, { "epoch": 0.8135133097051505, "grad_norm": 0.2383322629257858, "learning_rate": 8.74983748457745e-08, "loss": 1.5701, "step": 5394 }, { "epoch": 0.813664127893824, "grad_norm": 0.24718613518887608, "learning_rate": 8.743946332524812e-08, "loss": 1.6227, "step": 5395 }, { "epoch": 0.8138149460824976, "grad_norm": 0.2548462219820224, "learning_rate": 8.738059391752456e-08, "loss": 1.5131, "step": 5396 }, { "epoch": 0.8139657642711711, "grad_norm": 0.2727091785242275, "learning_rate": 8.732176663582182e-08, "loss": 1.6031, "step": 5397 }, { "epoch": 0.8141165824598446, "grad_norm": 0.24118012755987908, "learning_rate": 8.726298149334824e-08, "loss": 1.5522, "step": 5398 }, { "epoch": 0.8142674006485182, "grad_norm": 0.24933804716672786, "learning_rate": 8.72042385033028e-08, "loss": 1.5314, "step": 5399 }, { "epoch": 0.8144182188371918, "grad_norm": 0.25911084966975867, "learning_rate": 8.7145537678875e-08, "loss": 1.5313, "step": 5400 }, { "epoch": 0.8145690370258654, "grad_norm": 0.3206244188727088, "learning_rate": 8.708687903324499e-08, "loss": 1.5206, "step": 5401 }, { "epoch": 0.8147198552145388, "grad_norm": 0.272195332433252, "learning_rate": 8.702826257958325e-08, "loss": 1.567, "step": 5402 }, { "epoch": 0.8148706734032124, "grad_norm": 0.2790391207210687, "learning_rate": 8.696968833105083e-08, "loss": 1.46, "step": 5403 }, { "epoch": 0.815021491591886, "grad_norm": 0.2394075464882588, "learning_rate": 8.691115630079953e-08, "loss": 1.5442, "step": 5404 }, { "epoch": 0.8151723097805595, "grad_norm": 0.2676690166921416, "learning_rate": 8.685266650197136e-08, "loss": 1.5634, "step": 5405 }, { "epoch": 0.8153231279692331, "grad_norm": 0.25896659649800485, "learning_rate": 8.679421894769897e-08, "loss": 1.597, "step": 5406 }, { "epoch": 0.8154739461579067, "grad_norm": 0.38741568974714197, "learning_rate": 8.673581365110564e-08, "loss": 1.5574, "step": 5407 }, { "epoch": 0.8156247643465802, "grad_norm": 0.2508110110315982, "learning_rate": 8.667745062530502e-08, "loss": 1.5147, "step": 5408 }, { "epoch": 0.8157755825352537, "grad_norm": 0.261320500360099, "learning_rate": 8.66191298834012e-08, "loss": 1.5707, "step": 5409 }, { "epoch": 0.8159264007239273, "grad_norm": 0.32222062350335506, "learning_rate": 8.656085143848907e-08, "loss": 1.5306, "step": 5410 }, { "epoch": 0.8160772189126009, "grad_norm": 0.2517801262747225, "learning_rate": 8.650261530365372e-08, "loss": 1.5784, "step": 5411 }, { "epoch": 0.8162280371012745, "grad_norm": 0.24409824743779973, "learning_rate": 8.644442149197089e-08, "loss": 1.5261, "step": 5412 }, { "epoch": 0.8163788552899479, "grad_norm": 0.24812057004238666, "learning_rate": 8.638627001650676e-08, "loss": 1.5896, "step": 5413 }, { "epoch": 0.8165296734786215, "grad_norm": 0.2474975475339188, "learning_rate": 8.632816089031808e-08, "loss": 1.5342, "step": 5414 }, { "epoch": 0.8166804916672951, "grad_norm": 0.27549008020970944, "learning_rate": 8.627009412645197e-08, "loss": 1.5721, "step": 5415 }, { "epoch": 0.8168313098559686, "grad_norm": 0.256658953687918, "learning_rate": 8.621206973794617e-08, "loss": 1.5522, "step": 5416 }, { "epoch": 0.8169821280446422, "grad_norm": 0.2613980832353188, "learning_rate": 8.615408773782884e-08, "loss": 1.5271, "step": 5417 }, { "epoch": 0.8171329462333158, "grad_norm": 0.24113470747515664, "learning_rate": 8.609614813911861e-08, "loss": 1.5702, "step": 5418 }, { "epoch": 0.8172837644219892, "grad_norm": 0.25148116486031524, "learning_rate": 8.603825095482459e-08, "loss": 1.6049, "step": 5419 }, { "epoch": 0.8174345826106628, "grad_norm": 0.33292332289003057, "learning_rate": 8.59803961979464e-08, "loss": 1.6006, "step": 5420 }, { "epoch": 0.8175854007993364, "grad_norm": 0.2546474181958, "learning_rate": 8.592258388147416e-08, "loss": 1.565, "step": 5421 }, { "epoch": 0.81773621898801, "grad_norm": 0.2444825543043335, "learning_rate": 8.586481401838833e-08, "loss": 1.5486, "step": 5422 }, { "epoch": 0.8178870371766835, "grad_norm": 0.2912897847245712, "learning_rate": 8.580708662166004e-08, "loss": 1.5549, "step": 5423 }, { "epoch": 0.8180378553653571, "grad_norm": 0.2460550010359619, "learning_rate": 8.57494017042507e-08, "loss": 1.5158, "step": 5424 }, { "epoch": 0.8181886735540306, "grad_norm": 0.24223485160905156, "learning_rate": 8.569175927911227e-08, "loss": 1.5291, "step": 5425 }, { "epoch": 0.8183394917427042, "grad_norm": 0.2703071094741545, "learning_rate": 8.563415935918708e-08, "loss": 1.5467, "step": 5426 }, { "epoch": 0.8184903099313777, "grad_norm": 0.2583706245350999, "learning_rate": 8.55766019574081e-08, "loss": 1.5992, "step": 5427 }, { "epoch": 0.8186411281200513, "grad_norm": 0.33994776617608374, "learning_rate": 8.551908708669864e-08, "loss": 1.5422, "step": 5428 }, { "epoch": 0.8187919463087249, "grad_norm": 0.26498457659972996, "learning_rate": 8.546161475997232e-08, "loss": 1.5438, "step": 5429 }, { "epoch": 0.8189427644973983, "grad_norm": 0.24547134501992465, "learning_rate": 8.540418499013353e-08, "loss": 1.4488, "step": 5430 }, { "epoch": 0.8190935826860719, "grad_norm": 0.24827873309112944, "learning_rate": 8.534679779007686e-08, "loss": 1.5305, "step": 5431 }, { "epoch": 0.8192444008747455, "grad_norm": 0.24132056654808884, "learning_rate": 8.528945317268735e-08, "loss": 1.5605, "step": 5432 }, { "epoch": 0.8193952190634191, "grad_norm": 0.2485427899850471, "learning_rate": 8.523215115084062e-08, "loss": 1.6074, "step": 5433 }, { "epoch": 0.8195460372520926, "grad_norm": 0.2631974283968051, "learning_rate": 8.51748917374026e-08, "loss": 1.6224, "step": 5434 }, { "epoch": 0.8196968554407662, "grad_norm": 0.24912974465370194, "learning_rate": 8.511767494522968e-08, "loss": 1.5186, "step": 5435 }, { "epoch": 0.8198476736294397, "grad_norm": 0.2737340974688945, "learning_rate": 8.506050078716876e-08, "loss": 1.6187, "step": 5436 }, { "epoch": 0.8199984918181132, "grad_norm": 0.25737401441478086, "learning_rate": 8.500336927605706e-08, "loss": 1.5347, "step": 5437 }, { "epoch": 0.8201493100067868, "grad_norm": 0.2560828165673062, "learning_rate": 8.494628042472223e-08, "loss": 1.5867, "step": 5438 }, { "epoch": 0.8203001281954604, "grad_norm": 0.25727026274899956, "learning_rate": 8.488923424598243e-08, "loss": 1.5816, "step": 5439 }, { "epoch": 0.820450946384134, "grad_norm": 0.2565306185789074, "learning_rate": 8.483223075264622e-08, "loss": 1.5893, "step": 5440 }, { "epoch": 0.8206017645728075, "grad_norm": 0.2537942028175509, "learning_rate": 8.477526995751247e-08, "loss": 1.5945, "step": 5441 }, { "epoch": 0.820752582761481, "grad_norm": 0.2733803871603261, "learning_rate": 8.471835187337053e-08, "loss": 1.5049, "step": 5442 }, { "epoch": 0.8209034009501546, "grad_norm": 0.2534012718871278, "learning_rate": 8.466147651300029e-08, "loss": 1.5302, "step": 5443 }, { "epoch": 0.8210542191388281, "grad_norm": 0.2573195107412062, "learning_rate": 8.460464388917184e-08, "loss": 1.5285, "step": 5444 }, { "epoch": 0.8212050373275017, "grad_norm": 0.2494917233132808, "learning_rate": 8.454785401464567e-08, "loss": 1.5708, "step": 5445 }, { "epoch": 0.8213558555161753, "grad_norm": 0.2738697235871858, "learning_rate": 8.449110690217295e-08, "loss": 1.5621, "step": 5446 }, { "epoch": 0.8215066737048488, "grad_norm": 0.2437305077525022, "learning_rate": 8.443440256449493e-08, "loss": 1.556, "step": 5447 }, { "epoch": 0.8216574918935223, "grad_norm": 0.29219678686720424, "learning_rate": 8.437774101434338e-08, "loss": 1.5577, "step": 5448 }, { "epoch": 0.8218083100821959, "grad_norm": 0.25668764384134174, "learning_rate": 8.432112226444057e-08, "loss": 1.5769, "step": 5449 }, { "epoch": 0.8219591282708695, "grad_norm": 0.2556210092097604, "learning_rate": 8.426454632749896e-08, "loss": 1.5504, "step": 5450 }, { "epoch": 0.822109946459543, "grad_norm": 0.24933253322043045, "learning_rate": 8.420801321622157e-08, "loss": 1.506, "step": 5451 }, { "epoch": 0.8222607646482166, "grad_norm": 0.2635338886111469, "learning_rate": 8.415152294330161e-08, "loss": 1.5611, "step": 5452 }, { "epoch": 0.8224115828368901, "grad_norm": 0.25933779608687785, "learning_rate": 8.409507552142293e-08, "loss": 1.5929, "step": 5453 }, { "epoch": 0.8225624010255637, "grad_norm": 0.2639221458810809, "learning_rate": 8.403867096325956e-08, "loss": 1.5985, "step": 5454 }, { "epoch": 0.8227132192142372, "grad_norm": 0.2561121934952759, "learning_rate": 8.398230928147593e-08, "loss": 1.5388, "step": 5455 }, { "epoch": 0.8228640374029108, "grad_norm": 0.24240913723860447, "learning_rate": 8.392599048872697e-08, "loss": 1.5786, "step": 5456 }, { "epoch": 0.8230148555915844, "grad_norm": 0.2876358324857365, "learning_rate": 8.386971459765781e-08, "loss": 1.5611, "step": 5457 }, { "epoch": 0.8231656737802578, "grad_norm": 0.24344122254880732, "learning_rate": 8.3813481620904e-08, "loss": 1.5714, "step": 5458 }, { "epoch": 0.8233164919689314, "grad_norm": 0.2714474288976156, "learning_rate": 8.375729157109159e-08, "loss": 1.5763, "step": 5459 }, { "epoch": 0.823467310157605, "grad_norm": 0.25141151167514675, "learning_rate": 8.370114446083685e-08, "loss": 1.5913, "step": 5460 }, { "epoch": 0.8236181283462786, "grad_norm": 0.24774573760534288, "learning_rate": 8.364504030274636e-08, "loss": 1.598, "step": 5461 }, { "epoch": 0.8237689465349521, "grad_norm": 0.24693739103143914, "learning_rate": 8.358897910941723e-08, "loss": 1.5434, "step": 5462 }, { "epoch": 0.8239197647236257, "grad_norm": 0.24586716731845726, "learning_rate": 8.35329608934368e-08, "loss": 1.5677, "step": 5463 }, { "epoch": 0.8240705829122992, "grad_norm": 0.24333422423598058, "learning_rate": 8.347698566738278e-08, "loss": 1.5386, "step": 5464 }, { "epoch": 0.8242214011009728, "grad_norm": 0.2705858912166686, "learning_rate": 8.342105344382319e-08, "loss": 1.5513, "step": 5465 }, { "epoch": 0.8243722192896463, "grad_norm": 0.2513852238797157, "learning_rate": 8.336516423531658e-08, "loss": 1.5458, "step": 5466 }, { "epoch": 0.8245230374783199, "grad_norm": 0.2677073243030713, "learning_rate": 8.330931805441161e-08, "loss": 1.484, "step": 5467 }, { "epoch": 0.8246738556669935, "grad_norm": 0.2417387343568687, "learning_rate": 8.32535149136473e-08, "loss": 1.5808, "step": 5468 }, { "epoch": 0.824824673855667, "grad_norm": 0.2564645935056465, "learning_rate": 8.319775482555325e-08, "loss": 1.6054, "step": 5469 }, { "epoch": 0.8249754920443405, "grad_norm": 0.26104144470292767, "learning_rate": 8.314203780264914e-08, "loss": 1.5334, "step": 5470 }, { "epoch": 0.8251263102330141, "grad_norm": 0.23933997322284464, "learning_rate": 8.3086363857445e-08, "loss": 1.5773, "step": 5471 }, { "epoch": 0.8252771284216877, "grad_norm": 0.24138886614256383, "learning_rate": 8.303073300244137e-08, "loss": 1.535, "step": 5472 }, { "epoch": 0.8254279466103612, "grad_norm": 0.24743779693958193, "learning_rate": 8.297514525012895e-08, "loss": 1.5456, "step": 5473 }, { "epoch": 0.8255787647990348, "grad_norm": 0.24615796285010186, "learning_rate": 8.29196006129888e-08, "loss": 1.5427, "step": 5474 }, { "epoch": 0.8257295829877083, "grad_norm": 0.2704201999162556, "learning_rate": 8.286409910349227e-08, "loss": 1.4741, "step": 5475 }, { "epoch": 0.8258804011763818, "grad_norm": 0.24226687705850347, "learning_rate": 8.280864073410116e-08, "loss": 1.5244, "step": 5476 }, { "epoch": 0.8260312193650554, "grad_norm": 0.2678719068712771, "learning_rate": 8.27532255172674e-08, "loss": 1.5282, "step": 5477 }, { "epoch": 0.826182037553729, "grad_norm": 0.2636572663143235, "learning_rate": 8.269785346543334e-08, "loss": 1.5513, "step": 5478 }, { "epoch": 0.8263328557424026, "grad_norm": 0.2798055508355367, "learning_rate": 8.264252459103169e-08, "loss": 1.5618, "step": 5479 }, { "epoch": 0.8264836739310761, "grad_norm": 0.6178532566448571, "learning_rate": 8.258723890648533e-08, "loss": 1.6157, "step": 5480 }, { "epoch": 0.8266344921197496, "grad_norm": 0.29986391638280213, "learning_rate": 8.253199642420747e-08, "loss": 1.5464, "step": 5481 }, { "epoch": 0.8267853103084232, "grad_norm": 0.2520571795820321, "learning_rate": 8.247679715660174e-08, "loss": 1.6092, "step": 5482 }, { "epoch": 0.8269361284970967, "grad_norm": 0.2367118617638586, "learning_rate": 8.242164111606193e-08, "loss": 1.5416, "step": 5483 }, { "epoch": 0.8270869466857703, "grad_norm": 0.2719291940920188, "learning_rate": 8.236652831497215e-08, "loss": 1.5367, "step": 5484 }, { "epoch": 0.8272377648744439, "grad_norm": 0.2502373723967542, "learning_rate": 8.231145876570691e-08, "loss": 1.5677, "step": 5485 }, { "epoch": 0.8273885830631175, "grad_norm": 0.2789187183183438, "learning_rate": 8.225643248063091e-08, "loss": 1.5479, "step": 5486 }, { "epoch": 0.8275394012517909, "grad_norm": 0.27432617296751016, "learning_rate": 8.220144947209914e-08, "loss": 1.5756, "step": 5487 }, { "epoch": 0.8276902194404645, "grad_norm": 0.5782504197737236, "learning_rate": 8.21465097524568e-08, "loss": 1.5519, "step": 5488 }, { "epoch": 0.8278410376291381, "grad_norm": 0.2427849386158717, "learning_rate": 8.209161333403961e-08, "loss": 1.5212, "step": 5489 }, { "epoch": 0.8279918558178117, "grad_norm": 0.25720121590749473, "learning_rate": 8.203676022917333e-08, "loss": 1.5388, "step": 5490 }, { "epoch": 0.8281426740064852, "grad_norm": 0.2641600883680885, "learning_rate": 8.198195045017403e-08, "loss": 1.5655, "step": 5491 }, { "epoch": 0.8282934921951587, "grad_norm": 0.27795188615700056, "learning_rate": 8.192718400934817e-08, "loss": 1.59, "step": 5492 }, { "epoch": 0.8284443103838323, "grad_norm": 0.24300716325259156, "learning_rate": 8.187246091899244e-08, "loss": 1.5495, "step": 5493 }, { "epoch": 0.8285951285725058, "grad_norm": 0.25179431711121714, "learning_rate": 8.181778119139376e-08, "loss": 1.6051, "step": 5494 }, { "epoch": 0.8287459467611794, "grad_norm": 0.24904010718627334, "learning_rate": 8.17631448388292e-08, "loss": 1.4861, "step": 5495 }, { "epoch": 0.828896764949853, "grad_norm": 0.2540518015165979, "learning_rate": 8.170855187356637e-08, "loss": 1.5089, "step": 5496 }, { "epoch": 0.8290475831385266, "grad_norm": 0.2665402508650307, "learning_rate": 8.165400230786292e-08, "loss": 1.5303, "step": 5497 }, { "epoch": 0.8291984013272, "grad_norm": 0.2436131353265455, "learning_rate": 8.159949615396677e-08, "loss": 1.5759, "step": 5498 }, { "epoch": 0.8293492195158736, "grad_norm": 0.24836390355686813, "learning_rate": 8.154503342411619e-08, "loss": 1.5564, "step": 5499 }, { "epoch": 0.8295000377045472, "grad_norm": 0.25474851828650724, "learning_rate": 8.149061413053966e-08, "loss": 1.5884, "step": 5500 }, { "epoch": 0.8296508558932207, "grad_norm": 0.2624267415973674, "learning_rate": 8.143623828545581e-08, "loss": 1.5885, "step": 5501 }, { "epoch": 0.8298016740818943, "grad_norm": 0.28913872445950517, "learning_rate": 8.138190590107375e-08, "loss": 1.5947, "step": 5502 }, { "epoch": 0.8299524922705678, "grad_norm": 0.26690457060983375, "learning_rate": 8.132761698959258e-08, "loss": 1.5308, "step": 5503 }, { "epoch": 0.8301033104592414, "grad_norm": 0.2511696551494377, "learning_rate": 8.127337156320177e-08, "loss": 1.514, "step": 5504 }, { "epoch": 0.8302541286479149, "grad_norm": 0.2466855227551212, "learning_rate": 8.121916963408092e-08, "loss": 1.5415, "step": 5505 }, { "epoch": 0.8304049468365885, "grad_norm": 0.24274934564025172, "learning_rate": 8.116501121440005e-08, "loss": 1.6399, "step": 5506 }, { "epoch": 0.8305557650252621, "grad_norm": 0.2729618232546073, "learning_rate": 8.111089631631929e-08, "loss": 1.6211, "step": 5507 }, { "epoch": 0.8307065832139356, "grad_norm": 0.24271182317106732, "learning_rate": 8.105682495198891e-08, "loss": 1.5797, "step": 5508 }, { "epoch": 0.8308574014026091, "grad_norm": 0.2507970848649864, "learning_rate": 8.100279713354963e-08, "loss": 1.4944, "step": 5509 }, { "epoch": 0.8310082195912827, "grad_norm": 0.2404971843812375, "learning_rate": 8.094881287313221e-08, "loss": 1.5518, "step": 5510 }, { "epoch": 0.8311590377799563, "grad_norm": 0.3189830776593624, "learning_rate": 8.089487218285767e-08, "loss": 1.5306, "step": 5511 }, { "epoch": 0.8313098559686298, "grad_norm": 0.2390377412502148, "learning_rate": 8.084097507483732e-08, "loss": 1.5635, "step": 5512 }, { "epoch": 0.8314606741573034, "grad_norm": 0.24737509670879096, "learning_rate": 8.078712156117261e-08, "loss": 1.5703, "step": 5513 }, { "epoch": 0.831611492345977, "grad_norm": 0.24042296074711592, "learning_rate": 8.073331165395516e-08, "loss": 1.5698, "step": 5514 }, { "epoch": 0.8317623105346504, "grad_norm": 0.2469180979004506, "learning_rate": 8.067954536526697e-08, "loss": 1.5306, "step": 5515 }, { "epoch": 0.831913128723324, "grad_norm": 0.25633531814682303, "learning_rate": 8.062582270718008e-08, "loss": 1.5689, "step": 5516 }, { "epoch": 0.8320639469119976, "grad_norm": 0.381908193777433, "learning_rate": 8.057214369175679e-08, "loss": 1.5383, "step": 5517 }, { "epoch": 0.8322147651006712, "grad_norm": 0.33798264426474456, "learning_rate": 8.051850833104955e-08, "loss": 1.6076, "step": 5518 }, { "epoch": 0.8323655832893447, "grad_norm": 0.25391239904314006, "learning_rate": 8.046491663710119e-08, "loss": 1.5714, "step": 5519 }, { "epoch": 0.8325164014780182, "grad_norm": 0.28153495556832586, "learning_rate": 8.04113686219445e-08, "loss": 1.5612, "step": 5520 }, { "epoch": 0.8326672196666918, "grad_norm": 0.256479463704896, "learning_rate": 8.035786429760258e-08, "loss": 1.5384, "step": 5521 }, { "epoch": 0.8328180378553653, "grad_norm": 0.26785797989635485, "learning_rate": 8.030440367608876e-08, "loss": 1.541, "step": 5522 }, { "epoch": 0.8329688560440389, "grad_norm": 1.2850846020371895, "learning_rate": 8.025098676940649e-08, "loss": 1.5482, "step": 5523 }, { "epoch": 0.8331196742327125, "grad_norm": 0.28770264232268766, "learning_rate": 8.019761358954936e-08, "loss": 1.5556, "step": 5524 }, { "epoch": 0.8332704924213861, "grad_norm": 0.316290337019335, "learning_rate": 8.014428414850129e-08, "loss": 1.5643, "step": 5525 }, { "epoch": 0.8334213106100595, "grad_norm": 0.2534859342404115, "learning_rate": 8.009099845823628e-08, "loss": 1.5442, "step": 5526 }, { "epoch": 0.8335721287987331, "grad_norm": 0.24181255104096053, "learning_rate": 8.003775653071841e-08, "loss": 1.5488, "step": 5527 }, { "epoch": 0.8337229469874067, "grad_norm": 0.25788127231317026, "learning_rate": 7.99845583779022e-08, "loss": 1.6367, "step": 5528 }, { "epoch": 0.8338737651760803, "grad_norm": 0.2612586270195239, "learning_rate": 7.993140401173211e-08, "loss": 1.5347, "step": 5529 }, { "epoch": 0.8340245833647538, "grad_norm": 0.3201414132852245, "learning_rate": 7.987829344414284e-08, "loss": 1.5919, "step": 5530 }, { "epoch": 0.8341754015534274, "grad_norm": 0.24560379475832575, "learning_rate": 7.982522668705922e-08, "loss": 1.5847, "step": 5531 }, { "epoch": 0.8343262197421009, "grad_norm": 0.2479258474056162, "learning_rate": 7.977220375239635e-08, "loss": 1.5666, "step": 5532 }, { "epoch": 0.8344770379307744, "grad_norm": 0.3420062562165439, "learning_rate": 7.971922465205944e-08, "loss": 1.5402, "step": 5533 }, { "epoch": 0.834627856119448, "grad_norm": 0.32407071948208743, "learning_rate": 7.966628939794371e-08, "loss": 1.5554, "step": 5534 }, { "epoch": 0.8347786743081216, "grad_norm": 0.25065365752743557, "learning_rate": 7.961339800193484e-08, "loss": 1.6027, "step": 5535 }, { "epoch": 0.8349294924967952, "grad_norm": 0.28240372022150034, "learning_rate": 7.956055047590839e-08, "loss": 1.5693, "step": 5536 }, { "epoch": 0.8350803106854686, "grad_norm": 0.25248800480380923, "learning_rate": 7.950774683173011e-08, "loss": 1.5187, "step": 5537 }, { "epoch": 0.8352311288741422, "grad_norm": 0.2730600523929056, "learning_rate": 7.945498708125612e-08, "loss": 1.5953, "step": 5538 }, { "epoch": 0.8353819470628158, "grad_norm": 0.2821780763486024, "learning_rate": 7.940227123633243e-08, "loss": 1.6235, "step": 5539 }, { "epoch": 0.8355327652514893, "grad_norm": 0.24347407182184386, "learning_rate": 7.934959930879521e-08, "loss": 1.5455, "step": 5540 }, { "epoch": 0.8356835834401629, "grad_norm": 0.25890397409168897, "learning_rate": 7.929697131047099e-08, "loss": 1.5123, "step": 5541 }, { "epoch": 0.8358344016288365, "grad_norm": 0.24575142926152252, "learning_rate": 7.924438725317618e-08, "loss": 1.5395, "step": 5542 }, { "epoch": 0.83598521981751, "grad_norm": 0.2890724391677038, "learning_rate": 7.919184714871749e-08, "loss": 1.5507, "step": 5543 }, { "epoch": 0.8361360380061835, "grad_norm": 0.2556031827743178, "learning_rate": 7.91393510088916e-08, "loss": 1.6032, "step": 5544 }, { "epoch": 0.8362868561948571, "grad_norm": 0.2393186238286218, "learning_rate": 7.90868988454856e-08, "loss": 1.5682, "step": 5545 }, { "epoch": 0.8364376743835307, "grad_norm": 0.2543272307152719, "learning_rate": 7.90344906702764e-08, "loss": 1.5486, "step": 5546 }, { "epoch": 0.8365884925722042, "grad_norm": 0.2593772272704211, "learning_rate": 7.898212649503113e-08, "loss": 1.5835, "step": 5547 }, { "epoch": 0.8367393107608777, "grad_norm": 1.1666915568949274, "learning_rate": 7.892980633150718e-08, "loss": 1.6054, "step": 5548 }, { "epoch": 0.8368901289495513, "grad_norm": 0.2557101868732255, "learning_rate": 7.887753019145192e-08, "loss": 1.5073, "step": 5549 }, { "epoch": 0.8370409471382249, "grad_norm": 0.27047911109897305, "learning_rate": 7.882529808660282e-08, "loss": 1.5251, "step": 5550 }, { "epoch": 0.8371917653268984, "grad_norm": 0.2590908915314607, "learning_rate": 7.877311002868755e-08, "loss": 1.608, "step": 5551 }, { "epoch": 0.837342583515572, "grad_norm": 0.23856843868428404, "learning_rate": 7.872096602942385e-08, "loss": 1.5582, "step": 5552 }, { "epoch": 0.8374934017042456, "grad_norm": 0.24239285227628923, "learning_rate": 7.866886610051951e-08, "loss": 1.5828, "step": 5553 }, { "epoch": 0.837644219892919, "grad_norm": 0.2559961614417599, "learning_rate": 7.861681025367257e-08, "loss": 1.5976, "step": 5554 }, { "epoch": 0.8377950380815926, "grad_norm": 0.24511248185342885, "learning_rate": 7.856479850057105e-08, "loss": 1.6036, "step": 5555 }, { "epoch": 0.8379458562702662, "grad_norm": 0.25345278706853985, "learning_rate": 7.851283085289308e-08, "loss": 1.5205, "step": 5556 }, { "epoch": 0.8380966744589398, "grad_norm": 0.2674505680921593, "learning_rate": 7.846090732230689e-08, "loss": 1.5836, "step": 5557 }, { "epoch": 0.8382474926476133, "grad_norm": 0.2675322282599704, "learning_rate": 7.840902792047087e-08, "loss": 1.5791, "step": 5558 }, { "epoch": 0.8383983108362869, "grad_norm": 0.6383639200515794, "learning_rate": 7.835719265903347e-08, "loss": 1.5299, "step": 5559 }, { "epoch": 0.8385491290249604, "grad_norm": 0.3140365428237704, "learning_rate": 7.830540154963315e-08, "loss": 1.6029, "step": 5560 }, { "epoch": 0.838699947213634, "grad_norm": 0.2496152360860865, "learning_rate": 7.82536546038986e-08, "loss": 1.5501, "step": 5561 }, { "epoch": 0.8388507654023075, "grad_norm": 0.25380911714867804, "learning_rate": 7.820195183344848e-08, "loss": 1.5474, "step": 5562 }, { "epoch": 0.8390015835909811, "grad_norm": 0.3370098993111597, "learning_rate": 7.81502932498915e-08, "loss": 1.5793, "step": 5563 }, { "epoch": 0.8391524017796547, "grad_norm": 0.24127129356228108, "learning_rate": 7.809867886482666e-08, "loss": 1.5889, "step": 5564 }, { "epoch": 0.8393032199683281, "grad_norm": 0.24667600687525584, "learning_rate": 7.804710868984279e-08, "loss": 1.5609, "step": 5565 }, { "epoch": 0.8394540381570017, "grad_norm": 0.2400473169198927, "learning_rate": 7.799558273651889e-08, "loss": 1.5117, "step": 5566 }, { "epoch": 0.8396048563456753, "grad_norm": 0.24668529167986164, "learning_rate": 7.794410101642411e-08, "loss": 1.5906, "step": 5567 }, { "epoch": 0.8397556745343489, "grad_norm": 0.25670417154480313, "learning_rate": 7.789266354111755e-08, "loss": 1.5443, "step": 5568 }, { "epoch": 0.8399064927230224, "grad_norm": 0.2523566172943502, "learning_rate": 7.784127032214843e-08, "loss": 1.5969, "step": 5569 }, { "epoch": 0.840057310911696, "grad_norm": 0.25452856223989523, "learning_rate": 7.778992137105597e-08, "loss": 1.5534, "step": 5570 }, { "epoch": 0.8402081291003695, "grad_norm": 0.2544835216453723, "learning_rate": 7.773861669936962e-08, "loss": 1.5626, "step": 5571 }, { "epoch": 0.840358947289043, "grad_norm": 0.2699831837110939, "learning_rate": 7.768735631860867e-08, "loss": 1.6507, "step": 5572 }, { "epoch": 0.8405097654777166, "grad_norm": 0.25028425880563293, "learning_rate": 7.763614024028259e-08, "loss": 1.5905, "step": 5573 }, { "epoch": 0.8406605836663902, "grad_norm": 0.2814255923009269, "learning_rate": 7.758496847589091e-08, "loss": 1.5698, "step": 5574 }, { "epoch": 0.8408114018550638, "grad_norm": 0.6791691978786829, "learning_rate": 7.75338410369232e-08, "loss": 1.516, "step": 5575 }, { "epoch": 0.8409622200437373, "grad_norm": 0.25940149426018794, "learning_rate": 7.748275793485896e-08, "loss": 1.5214, "step": 5576 }, { "epoch": 0.8411130382324108, "grad_norm": 0.24413826048920553, "learning_rate": 7.743171918116797e-08, "loss": 1.5391, "step": 5577 }, { "epoch": 0.8412638564210844, "grad_norm": 0.2461978788362587, "learning_rate": 7.738072478730983e-08, "loss": 1.5605, "step": 5578 }, { "epoch": 0.8414146746097579, "grad_norm": 0.26108702667629563, "learning_rate": 7.732977476473425e-08, "loss": 1.6231, "step": 5579 }, { "epoch": 0.8415654927984315, "grad_norm": 0.2595152265125195, "learning_rate": 7.727886912488106e-08, "loss": 1.5943, "step": 5580 }, { "epoch": 0.8417163109871051, "grad_norm": 0.2658759507588597, "learning_rate": 7.722800787918005e-08, "loss": 1.5507, "step": 5581 }, { "epoch": 0.8418671291757786, "grad_norm": 0.448379173388005, "learning_rate": 7.717719103905102e-08, "loss": 1.4971, "step": 5582 }, { "epoch": 0.8420179473644521, "grad_norm": 0.24230736636294858, "learning_rate": 7.71264186159038e-08, "loss": 1.5485, "step": 5583 }, { "epoch": 0.8421687655531257, "grad_norm": 0.2533778985670752, "learning_rate": 7.707569062113835e-08, "loss": 1.5598, "step": 5584 }, { "epoch": 0.8423195837417993, "grad_norm": 0.2798241000686758, "learning_rate": 7.702500706614454e-08, "loss": 1.6241, "step": 5585 }, { "epoch": 0.8424704019304728, "grad_norm": 0.25086697903795707, "learning_rate": 7.697436796230228e-08, "loss": 1.5514, "step": 5586 }, { "epoch": 0.8426212201191464, "grad_norm": 0.2887072752715407, "learning_rate": 7.69237733209816e-08, "loss": 1.522, "step": 5587 }, { "epoch": 0.8427720383078199, "grad_norm": 0.42417698732702486, "learning_rate": 7.687322315354242e-08, "loss": 1.6105, "step": 5588 }, { "epoch": 0.8429228564964935, "grad_norm": 0.25776113315138227, "learning_rate": 7.682271747133468e-08, "loss": 1.5826, "step": 5589 }, { "epoch": 0.843073674685167, "grad_norm": 0.244347068403562, "learning_rate": 7.67722562856985e-08, "loss": 1.6212, "step": 5590 }, { "epoch": 0.8432244928738406, "grad_norm": 0.28406537972836354, "learning_rate": 7.67218396079638e-08, "loss": 1.5369, "step": 5591 }, { "epoch": 0.8433753110625142, "grad_norm": 0.25018817775540264, "learning_rate": 7.66714674494506e-08, "loss": 1.6136, "step": 5592 }, { "epoch": 0.8435261292511876, "grad_norm": 0.2660496140642561, "learning_rate": 7.662113982146887e-08, "loss": 1.5692, "step": 5593 }, { "epoch": 0.8436769474398612, "grad_norm": 0.24446740140627457, "learning_rate": 7.657085673531874e-08, "loss": 1.5945, "step": 5594 }, { "epoch": 0.8438277656285348, "grad_norm": 0.2528406292945506, "learning_rate": 7.652061820229016e-08, "loss": 1.5499, "step": 5595 }, { "epoch": 0.8439785838172084, "grad_norm": 0.24120135201280207, "learning_rate": 7.647042423366312e-08, "loss": 1.502, "step": 5596 }, { "epoch": 0.8441294020058819, "grad_norm": 0.24289934292254134, "learning_rate": 7.642027484070772e-08, "loss": 1.5658, "step": 5597 }, { "epoch": 0.8442802201945555, "grad_norm": 0.24757642087809018, "learning_rate": 7.637017003468385e-08, "loss": 1.5622, "step": 5598 }, { "epoch": 0.844431038383229, "grad_norm": 0.2507387146409576, "learning_rate": 7.632010982684153e-08, "loss": 1.5469, "step": 5599 }, { "epoch": 0.8445818565719025, "grad_norm": 0.26293837564154415, "learning_rate": 7.627009422842081e-08, "loss": 1.5901, "step": 5600 }, { "epoch": 0.8447326747605761, "grad_norm": 0.24592612200606645, "learning_rate": 7.622012325065157e-08, "loss": 1.5197, "step": 5601 }, { "epoch": 0.8448834929492497, "grad_norm": 0.3179625061264734, "learning_rate": 7.617019690475374e-08, "loss": 1.5853, "step": 5602 }, { "epoch": 0.8450343111379233, "grad_norm": 0.31593090013279307, "learning_rate": 7.61203152019373e-08, "loss": 1.5618, "step": 5603 }, { "epoch": 0.8451851293265968, "grad_norm": 0.24646286521620397, "learning_rate": 7.607047815340212e-08, "loss": 1.511, "step": 5604 }, { "epoch": 0.8453359475152703, "grad_norm": 0.24985925653566338, "learning_rate": 7.602068577033807e-08, "loss": 1.5493, "step": 5605 }, { "epoch": 0.8454867657039439, "grad_norm": 0.23941690494100681, "learning_rate": 7.597093806392493e-08, "loss": 1.5622, "step": 5606 }, { "epoch": 0.8456375838926175, "grad_norm": 1.316138816902561, "learning_rate": 7.59212350453326e-08, "loss": 1.6131, "step": 5607 }, { "epoch": 0.845788402081291, "grad_norm": 0.239637249766247, "learning_rate": 7.587157672572083e-08, "loss": 1.5916, "step": 5608 }, { "epoch": 0.8459392202699646, "grad_norm": 0.3962825317708086, "learning_rate": 7.582196311623928e-08, "loss": 1.589, "step": 5609 }, { "epoch": 0.8460900384586381, "grad_norm": 0.2630091138207068, "learning_rate": 7.577239422802779e-08, "loss": 1.5847, "step": 5610 }, { "epoch": 0.8462408566473116, "grad_norm": 0.2599746401134038, "learning_rate": 7.572287007221593e-08, "loss": 1.5781, "step": 5611 }, { "epoch": 0.8463916748359852, "grad_norm": 0.25472182490482365, "learning_rate": 7.567339065992328e-08, "loss": 1.5655, "step": 5612 }, { "epoch": 0.8465424930246588, "grad_norm": 0.24206481005066374, "learning_rate": 7.56239560022595e-08, "loss": 1.4791, "step": 5613 }, { "epoch": 0.8466933112133324, "grad_norm": 0.24641177015967947, "learning_rate": 7.557456611032408e-08, "loss": 1.5529, "step": 5614 }, { "epoch": 0.8468441294020059, "grad_norm": 0.39188335002461877, "learning_rate": 7.552522099520645e-08, "loss": 1.5349, "step": 5615 }, { "epoch": 0.8469949475906794, "grad_norm": 0.28795551978446376, "learning_rate": 7.547592066798609e-08, "loss": 1.5694, "step": 5616 }, { "epoch": 0.847145765779353, "grad_norm": 0.2527501508656064, "learning_rate": 7.542666513973229e-08, "loss": 1.573, "step": 5617 }, { "epoch": 0.8472965839680265, "grad_norm": 0.32103011451645874, "learning_rate": 7.537745442150443e-08, "loss": 1.5757, "step": 5618 }, { "epoch": 0.8474474021567001, "grad_norm": 0.306449996138448, "learning_rate": 7.532828852435163e-08, "loss": 1.5639, "step": 5619 }, { "epoch": 0.8475982203453737, "grad_norm": 0.25749194558302557, "learning_rate": 7.52791674593132e-08, "loss": 1.5643, "step": 5620 }, { "epoch": 0.8477490385340473, "grad_norm": 0.2840668647636897, "learning_rate": 7.523009123741819e-08, "loss": 1.5831, "step": 5621 }, { "epoch": 0.8478998567227207, "grad_norm": 0.24338742656455553, "learning_rate": 7.518105986968558e-08, "loss": 1.5688, "step": 5622 }, { "epoch": 0.8480506749113943, "grad_norm": 0.24009537974004305, "learning_rate": 7.513207336712446e-08, "loss": 1.5483, "step": 5623 }, { "epoch": 0.8482014931000679, "grad_norm": 0.25237577432805486, "learning_rate": 7.508313174073364e-08, "loss": 1.5215, "step": 5624 }, { "epoch": 0.8483523112887414, "grad_norm": 0.24748066675250419, "learning_rate": 7.503423500150194e-08, "loss": 1.6166, "step": 5625 }, { "epoch": 0.848503129477415, "grad_norm": 0.25702548854248036, "learning_rate": 7.498538316040818e-08, "loss": 1.4807, "step": 5626 }, { "epoch": 0.8486539476660885, "grad_norm": 0.25364583712625355, "learning_rate": 7.493657622842098e-08, "loss": 1.5994, "step": 5627 }, { "epoch": 0.8488047658547621, "grad_norm": 0.2600433736421623, "learning_rate": 7.488781421649882e-08, "loss": 1.5958, "step": 5628 }, { "epoch": 0.8489555840434356, "grad_norm": 0.32046931742402496, "learning_rate": 7.483909713559035e-08, "loss": 1.6043, "step": 5629 }, { "epoch": 0.8491064022321092, "grad_norm": 0.4017573004659793, "learning_rate": 7.479042499663392e-08, "loss": 1.5679, "step": 5630 }, { "epoch": 0.8492572204207828, "grad_norm": 0.26866946450764245, "learning_rate": 7.474179781055782e-08, "loss": 1.592, "step": 5631 }, { "epoch": 0.8494080386094564, "grad_norm": 0.24874450046086075, "learning_rate": 7.46932155882802e-08, "loss": 1.5707, "step": 5632 }, { "epoch": 0.8495588567981298, "grad_norm": 0.660046811125414, "learning_rate": 7.464467834070934e-08, "loss": 1.5288, "step": 5633 }, { "epoch": 0.8497096749868034, "grad_norm": 0.252021037615029, "learning_rate": 7.459618607874316e-08, "loss": 1.5588, "step": 5634 }, { "epoch": 0.849860493175477, "grad_norm": 0.25898808254480193, "learning_rate": 7.454773881326957e-08, "loss": 1.5758, "step": 5635 }, { "epoch": 0.8500113113641505, "grad_norm": 0.24795417150648452, "learning_rate": 7.449933655516646e-08, "loss": 1.5693, "step": 5636 }, { "epoch": 0.8501621295528241, "grad_norm": 0.25017592633937047, "learning_rate": 7.445097931530156e-08, "loss": 1.5651, "step": 5637 }, { "epoch": 0.8503129477414976, "grad_norm": 0.24218888929716142, "learning_rate": 7.440266710453235e-08, "loss": 1.5262, "step": 5638 }, { "epoch": 0.8504637659301711, "grad_norm": 0.24610871936433715, "learning_rate": 7.435439993370649e-08, "loss": 1.5375, "step": 5639 }, { "epoch": 0.8506145841188447, "grad_norm": 0.25806357334048957, "learning_rate": 7.430617781366128e-08, "loss": 1.5717, "step": 5640 }, { "epoch": 0.8507654023075183, "grad_norm": 0.25833198862097434, "learning_rate": 7.425800075522395e-08, "loss": 1.6277, "step": 5641 }, { "epoch": 0.8509162204961919, "grad_norm": 0.39064568521698845, "learning_rate": 7.420986876921174e-08, "loss": 1.5529, "step": 5642 }, { "epoch": 0.8510670386848654, "grad_norm": 0.25120786856002797, "learning_rate": 7.416178186643168e-08, "loss": 1.5673, "step": 5643 }, { "epoch": 0.8512178568735389, "grad_norm": 0.25188296940139504, "learning_rate": 7.411374005768061e-08, "loss": 1.5236, "step": 5644 }, { "epoch": 0.8513686750622125, "grad_norm": 0.2389433206874455, "learning_rate": 7.406574335374534e-08, "loss": 1.5731, "step": 5645 }, { "epoch": 0.851519493250886, "grad_norm": 0.2497285089354711, "learning_rate": 7.401779176540256e-08, "loss": 1.5333, "step": 5646 }, { "epoch": 0.8516703114395596, "grad_norm": 0.25756152163144364, "learning_rate": 7.396988530341877e-08, "loss": 1.5509, "step": 5647 }, { "epoch": 0.8518211296282332, "grad_norm": 0.2539468520583549, "learning_rate": 7.392202397855036e-08, "loss": 1.5229, "step": 5648 }, { "epoch": 0.8519719478169068, "grad_norm": 0.26121415983020824, "learning_rate": 7.387420780154363e-08, "loss": 1.4851, "step": 5649 }, { "epoch": 0.8521227660055802, "grad_norm": 0.261710287768565, "learning_rate": 7.382643678313467e-08, "loss": 1.5977, "step": 5650 }, { "epoch": 0.8522735841942538, "grad_norm": 0.240550327088055, "learning_rate": 7.377871093404946e-08, "loss": 1.52, "step": 5651 }, { "epoch": 0.8524244023829274, "grad_norm": 0.2570836869612234, "learning_rate": 7.373103026500387e-08, "loss": 1.4896, "step": 5652 }, { "epoch": 0.852575220571601, "grad_norm": 0.24238387060556735, "learning_rate": 7.36833947867036e-08, "loss": 1.5692, "step": 5653 }, { "epoch": 0.8527260387602745, "grad_norm": 0.2377588804521395, "learning_rate": 7.363580450984417e-08, "loss": 1.4929, "step": 5654 }, { "epoch": 0.852876856948948, "grad_norm": 0.2501993070047394, "learning_rate": 7.3588259445111e-08, "loss": 1.6074, "step": 5655 }, { "epoch": 0.8530276751376216, "grad_norm": 0.2906505749795475, "learning_rate": 7.354075960317934e-08, "loss": 1.5748, "step": 5656 }, { "epoch": 0.8531784933262951, "grad_norm": 0.39481323868466706, "learning_rate": 7.349330499471432e-08, "loss": 1.5164, "step": 5657 }, { "epoch": 0.8533293115149687, "grad_norm": 0.2640431809310381, "learning_rate": 7.344589563037081e-08, "loss": 1.5883, "step": 5658 }, { "epoch": 0.8534801297036423, "grad_norm": 0.24985927957112009, "learning_rate": 7.339853152079366e-08, "loss": 1.5835, "step": 5659 }, { "epoch": 0.8536309478923159, "grad_norm": 0.24758996546327033, "learning_rate": 7.335121267661747e-08, "loss": 1.6124, "step": 5660 }, { "epoch": 0.8537817660809893, "grad_norm": 0.2858069415079648, "learning_rate": 7.330393910846664e-08, "loss": 1.5454, "step": 5661 }, { "epoch": 0.8539325842696629, "grad_norm": 0.2541569244416825, "learning_rate": 7.32567108269556e-08, "loss": 1.5531, "step": 5662 }, { "epoch": 0.8540834024583365, "grad_norm": 0.2504834782357204, "learning_rate": 7.320952784268838e-08, "loss": 1.5662, "step": 5663 }, { "epoch": 0.85423422064701, "grad_norm": 0.24326816683178065, "learning_rate": 7.316239016625888e-08, "loss": 1.549, "step": 5664 }, { "epoch": 0.8543850388356836, "grad_norm": 0.24333803414480876, "learning_rate": 7.311529780825101e-08, "loss": 1.6189, "step": 5665 }, { "epoch": 0.8545358570243572, "grad_norm": 0.24632431855036313, "learning_rate": 7.306825077923832e-08, "loss": 1.5989, "step": 5666 }, { "epoch": 0.8546866752130307, "grad_norm": 0.24275227716740327, "learning_rate": 7.302124908978419e-08, "loss": 1.539, "step": 5667 }, { "epoch": 0.8548374934017042, "grad_norm": 0.27317071756739514, "learning_rate": 7.297429275044193e-08, "loss": 1.5005, "step": 5668 }, { "epoch": 0.8549883115903778, "grad_norm": 0.5863882394070457, "learning_rate": 7.292738177175462e-08, "loss": 1.6435, "step": 5669 }, { "epoch": 0.8551391297790514, "grad_norm": 0.2546533554328017, "learning_rate": 7.28805161642551e-08, "loss": 1.5396, "step": 5670 }, { "epoch": 0.855289947967725, "grad_norm": 0.2611516498872777, "learning_rate": 7.283369593846602e-08, "loss": 1.5391, "step": 5671 }, { "epoch": 0.8554407661563984, "grad_norm": 0.25575431344121646, "learning_rate": 7.278692110489998e-08, "loss": 1.66, "step": 5672 }, { "epoch": 0.855591584345072, "grad_norm": 0.25337217431271386, "learning_rate": 7.274019167405922e-08, "loss": 1.5817, "step": 5673 }, { "epoch": 0.8557424025337456, "grad_norm": 0.25231846480237374, "learning_rate": 7.269350765643588e-08, "loss": 1.5301, "step": 5674 }, { "epoch": 0.8558932207224191, "grad_norm": 0.2430603528695757, "learning_rate": 7.264686906251191e-08, "loss": 1.5735, "step": 5675 }, { "epoch": 0.8560440389110927, "grad_norm": 0.25613443783027007, "learning_rate": 7.260027590275903e-08, "loss": 1.5524, "step": 5676 }, { "epoch": 0.8561948570997663, "grad_norm": 0.41771966232826235, "learning_rate": 7.255372818763868e-08, "loss": 1.5148, "step": 5677 }, { "epoch": 0.8563456752884397, "grad_norm": 0.26630543909180215, "learning_rate": 7.25072259276023e-08, "loss": 1.6217, "step": 5678 }, { "epoch": 0.8564964934771133, "grad_norm": 0.27526533874062464, "learning_rate": 7.246076913309094e-08, "loss": 1.5862, "step": 5679 }, { "epoch": 0.8566473116657869, "grad_norm": 0.2429329669365647, "learning_rate": 7.241435781453543e-08, "loss": 1.6406, "step": 5680 }, { "epoch": 0.8567981298544605, "grad_norm": 0.2668874326283706, "learning_rate": 7.23679919823566e-08, "loss": 1.5187, "step": 5681 }, { "epoch": 0.856948948043134, "grad_norm": 0.24466236372841946, "learning_rate": 7.232167164696487e-08, "loss": 1.5276, "step": 5682 }, { "epoch": 0.8570997662318075, "grad_norm": 0.30415978131778804, "learning_rate": 7.22753968187605e-08, "loss": 1.5922, "step": 5683 }, { "epoch": 0.8572505844204811, "grad_norm": 0.26139733105451657, "learning_rate": 7.222916750813349e-08, "loss": 1.5761, "step": 5684 }, { "epoch": 0.8574014026091547, "grad_norm": 0.6127118554212033, "learning_rate": 7.218298372546374e-08, "loss": 1.5888, "step": 5685 }, { "epoch": 0.8575522207978282, "grad_norm": 0.2401331880168586, "learning_rate": 7.213684548112085e-08, "loss": 1.5736, "step": 5686 }, { "epoch": 0.8577030389865018, "grad_norm": 0.2444064979383252, "learning_rate": 7.209075278546411e-08, "loss": 1.5905, "step": 5687 }, { "epoch": 0.8578538571751754, "grad_norm": 0.2532890179769075, "learning_rate": 7.204470564884281e-08, "loss": 1.4892, "step": 5688 }, { "epoch": 0.8580046753638488, "grad_norm": 0.23250439984319513, "learning_rate": 7.19987040815958e-08, "loss": 1.5769, "step": 5689 }, { "epoch": 0.8581554935525224, "grad_norm": 0.2862891322557891, "learning_rate": 7.195274809405173e-08, "loss": 1.5883, "step": 5690 }, { "epoch": 0.858306311741196, "grad_norm": 0.26399834309076403, "learning_rate": 7.190683769652914e-08, "loss": 1.5601, "step": 5691 }, { "epoch": 0.8584571299298696, "grad_norm": 0.24562698941058486, "learning_rate": 7.186097289933623e-08, "loss": 1.5354, "step": 5692 }, { "epoch": 0.8586079481185431, "grad_norm": 0.2715514934169429, "learning_rate": 7.181515371277094e-08, "loss": 1.592, "step": 5693 }, { "epoch": 0.8587587663072167, "grad_norm": 0.2523047760437185, "learning_rate": 7.176938014712101e-08, "loss": 1.5668, "step": 5694 }, { "epoch": 0.8589095844958902, "grad_norm": 2.259263223543891, "learning_rate": 7.172365221266402e-08, "loss": 1.6373, "step": 5695 }, { "epoch": 0.8590604026845637, "grad_norm": 0.26405129336613986, "learning_rate": 7.167796991966716e-08, "loss": 1.6148, "step": 5696 }, { "epoch": 0.8592112208732373, "grad_norm": 0.24976180957847277, "learning_rate": 7.163233327838739e-08, "loss": 1.58, "step": 5697 }, { "epoch": 0.8593620390619109, "grad_norm": 0.2584168725948234, "learning_rate": 7.158674229907154e-08, "loss": 1.5139, "step": 5698 }, { "epoch": 0.8595128572505845, "grad_norm": 0.2590229882263229, "learning_rate": 7.154119699195613e-08, "loss": 1.5317, "step": 5699 }, { "epoch": 0.8596636754392579, "grad_norm": 0.2861304517231533, "learning_rate": 7.149569736726736e-08, "loss": 1.6012, "step": 5700 }, { "epoch": 0.8598144936279315, "grad_norm": 0.2847592989614429, "learning_rate": 7.145024343522119e-08, "loss": 1.5457, "step": 5701 }, { "epoch": 0.8599653118166051, "grad_norm": 0.8991918096872035, "learning_rate": 7.140483520602345e-08, "loss": 1.6882, "step": 5702 }, { "epoch": 0.8601161300052786, "grad_norm": 0.2564977267425987, "learning_rate": 7.135947268986955e-08, "loss": 1.5235, "step": 5703 }, { "epoch": 0.8602669481939522, "grad_norm": 0.2441356832573264, "learning_rate": 7.131415589694465e-08, "loss": 1.5601, "step": 5704 }, { "epoch": 0.8604177663826258, "grad_norm": 0.24779244117565505, "learning_rate": 7.126888483742377e-08, "loss": 1.4907, "step": 5705 }, { "epoch": 0.8605685845712993, "grad_norm": 0.24055359786155137, "learning_rate": 7.122365952147156e-08, "loss": 1.6108, "step": 5706 }, { "epoch": 0.8607194027599728, "grad_norm": 0.2662572871437054, "learning_rate": 7.117847995924237e-08, "loss": 1.543, "step": 5707 }, { "epoch": 0.8608702209486464, "grad_norm": 0.2548125020106732, "learning_rate": 7.113334616088041e-08, "loss": 1.6025, "step": 5708 }, { "epoch": 0.86102103913732, "grad_norm": 0.25052042672988706, "learning_rate": 7.108825813651951e-08, "loss": 1.5637, "step": 5709 }, { "epoch": 0.8611718573259936, "grad_norm": 0.2727387078531334, "learning_rate": 7.104321589628318e-08, "loss": 1.6394, "step": 5710 }, { "epoch": 0.8613226755146671, "grad_norm": 0.27553186094645604, "learning_rate": 7.099821945028474e-08, "loss": 1.5856, "step": 5711 }, { "epoch": 0.8614734937033406, "grad_norm": 0.24357870548678773, "learning_rate": 7.095326880862729e-08, "loss": 1.555, "step": 5712 }, { "epoch": 0.8616243118920142, "grad_norm": 0.3300369116661402, "learning_rate": 7.090836398140346e-08, "loss": 1.5183, "step": 5713 }, { "epoch": 0.8617751300806877, "grad_norm": 0.2534416640536777, "learning_rate": 7.086350497869567e-08, "loss": 1.5871, "step": 5714 }, { "epoch": 0.8619259482693613, "grad_norm": 0.24730149269518625, "learning_rate": 7.081869181057618e-08, "loss": 1.5443, "step": 5715 }, { "epoch": 0.8620767664580349, "grad_norm": 0.2487861282647905, "learning_rate": 7.077392448710681e-08, "loss": 1.5806, "step": 5716 }, { "epoch": 0.8622275846467083, "grad_norm": 0.2554637618776619, "learning_rate": 7.072920301833905e-08, "loss": 1.5154, "step": 5717 }, { "epoch": 0.8623784028353819, "grad_norm": 0.2516345689248287, "learning_rate": 7.068452741431429e-08, "loss": 1.5671, "step": 5718 }, { "epoch": 0.8625292210240555, "grad_norm": 0.24574308695529595, "learning_rate": 7.063989768506345e-08, "loss": 1.5913, "step": 5719 }, { "epoch": 0.8626800392127291, "grad_norm": 0.3014547838909663, "learning_rate": 7.059531384060716e-08, "loss": 1.5404, "step": 5720 }, { "epoch": 0.8628308574014026, "grad_norm": 0.25323018494644195, "learning_rate": 7.055077589095587e-08, "loss": 1.5407, "step": 5721 }, { "epoch": 0.8629816755900762, "grad_norm": 0.31986724307662023, "learning_rate": 7.050628384610962e-08, "loss": 1.5921, "step": 5722 }, { "epoch": 0.8631324937787497, "grad_norm": 0.2924292113515159, "learning_rate": 7.046183771605815e-08, "loss": 1.4993, "step": 5723 }, { "epoch": 0.8632833119674233, "grad_norm": 0.2472371685103149, "learning_rate": 7.041743751078089e-08, "loss": 1.5444, "step": 5724 }, { "epoch": 0.8634341301560968, "grad_norm": 0.245464840150854, "learning_rate": 7.037308324024704e-08, "loss": 1.5841, "step": 5725 }, { "epoch": 0.8635849483447704, "grad_norm": 0.24565565993670377, "learning_rate": 7.03287749144154e-08, "loss": 1.5878, "step": 5726 }, { "epoch": 0.863735766533444, "grad_norm": 0.2387712430985956, "learning_rate": 7.028451254323447e-08, "loss": 1.5179, "step": 5727 }, { "epoch": 0.8638865847221174, "grad_norm": 0.24125454898533927, "learning_rate": 7.024029613664247e-08, "loss": 1.4979, "step": 5728 }, { "epoch": 0.864037402910791, "grad_norm": 0.2606603120641778, "learning_rate": 7.019612570456723e-08, "loss": 1.612, "step": 5729 }, { "epoch": 0.8641882210994646, "grad_norm": 0.2708461358028818, "learning_rate": 7.015200125692629e-08, "loss": 1.4908, "step": 5730 }, { "epoch": 0.8643390392881382, "grad_norm": 0.2495650801146675, "learning_rate": 7.010792280362695e-08, "loss": 1.6097, "step": 5731 }, { "epoch": 0.8644898574768117, "grad_norm": 0.2630385060355195, "learning_rate": 7.006389035456604e-08, "loss": 1.607, "step": 5732 }, { "epoch": 0.8646406756654853, "grad_norm": 0.2541416968748563, "learning_rate": 7.00199039196301e-08, "loss": 1.5801, "step": 5733 }, { "epoch": 0.8647914938541588, "grad_norm": 0.26597707933889025, "learning_rate": 6.997596350869547e-08, "loss": 1.5898, "step": 5734 }, { "epoch": 0.8649423120428323, "grad_norm": 0.2506369269093555, "learning_rate": 6.993206913162799e-08, "loss": 1.5741, "step": 5735 }, { "epoch": 0.8650931302315059, "grad_norm": 0.2634889182152881, "learning_rate": 6.988822079828322e-08, "loss": 1.5036, "step": 5736 }, { "epoch": 0.8652439484201795, "grad_norm": 0.4923504453984544, "learning_rate": 6.984441851850637e-08, "loss": 1.5609, "step": 5737 }, { "epoch": 0.8653947666088531, "grad_norm": 0.2521371255702623, "learning_rate": 6.980066230213236e-08, "loss": 1.5463, "step": 5738 }, { "epoch": 0.8655455847975266, "grad_norm": 0.24552411522231396, "learning_rate": 6.975695215898576e-08, "loss": 1.4921, "step": 5739 }, { "epoch": 0.8656964029862001, "grad_norm": 0.978859631129009, "learning_rate": 6.97132880988807e-08, "loss": 1.5929, "step": 5740 }, { "epoch": 0.8658472211748737, "grad_norm": 0.24414622961417418, "learning_rate": 6.966967013162108e-08, "loss": 1.5397, "step": 5741 }, { "epoch": 0.8659980393635472, "grad_norm": 0.24789664979346512, "learning_rate": 6.962609826700042e-08, "loss": 1.5825, "step": 5742 }, { "epoch": 0.8661488575522208, "grad_norm": 0.23748110644282722, "learning_rate": 6.95825725148018e-08, "loss": 1.5822, "step": 5743 }, { "epoch": 0.8662996757408944, "grad_norm": 0.3831547332485798, "learning_rate": 6.953909288479811e-08, "loss": 1.5613, "step": 5744 }, { "epoch": 0.8664504939295679, "grad_norm": 0.23485383567984625, "learning_rate": 6.949565938675177e-08, "loss": 1.5425, "step": 5745 }, { "epoch": 0.8666013121182414, "grad_norm": 0.2429333921619344, "learning_rate": 6.945227203041478e-08, "loss": 1.5293, "step": 5746 }, { "epoch": 0.866752130306915, "grad_norm": 0.2368445493413857, "learning_rate": 6.940893082552898e-08, "loss": 1.4339, "step": 5747 }, { "epoch": 0.8669029484955886, "grad_norm": 0.26343309796527276, "learning_rate": 6.936563578182568e-08, "loss": 1.585, "step": 5748 }, { "epoch": 0.8670537666842622, "grad_norm": 0.26592788291536285, "learning_rate": 6.932238690902587e-08, "loss": 1.5261, "step": 5749 }, { "epoch": 0.8672045848729357, "grad_norm": 0.2662401816901963, "learning_rate": 6.927918421684018e-08, "loss": 1.5693, "step": 5750 }, { "epoch": 0.8673554030616092, "grad_norm": 0.2549205869783248, "learning_rate": 6.923602771496892e-08, "loss": 1.5528, "step": 5751 }, { "epoch": 0.8675062212502828, "grad_norm": 0.2572567660331949, "learning_rate": 6.919291741310195e-08, "loss": 1.5549, "step": 5752 }, { "epoch": 0.8676570394389563, "grad_norm": 0.2505870450308744, "learning_rate": 6.914985332091872e-08, "loss": 1.5467, "step": 5753 }, { "epoch": 0.8678078576276299, "grad_norm": 0.26579179135339104, "learning_rate": 6.91068354480885e-08, "loss": 1.5146, "step": 5754 }, { "epoch": 0.8679586758163035, "grad_norm": 0.24709517636008974, "learning_rate": 6.906386380426997e-08, "loss": 1.5421, "step": 5755 }, { "epoch": 0.868109494004977, "grad_norm": 0.24904109958646975, "learning_rate": 6.902093839911153e-08, "loss": 1.549, "step": 5756 }, { "epoch": 0.8682603121936505, "grad_norm": 0.24555385869845897, "learning_rate": 6.897805924225121e-08, "loss": 1.5294, "step": 5757 }, { "epoch": 0.8684111303823241, "grad_norm": 0.2531488037293015, "learning_rate": 6.893522634331664e-08, "loss": 1.5193, "step": 5758 }, { "epoch": 0.8685619485709977, "grad_norm": 0.24920935663852561, "learning_rate": 6.889243971192496e-08, "loss": 1.569, "step": 5759 }, { "epoch": 0.8687127667596712, "grad_norm": 0.2518169601440843, "learning_rate": 6.884969935768312e-08, "loss": 1.5928, "step": 5760 }, { "epoch": 0.8688635849483448, "grad_norm": 0.2563759875025315, "learning_rate": 6.880700529018754e-08, "loss": 1.5736, "step": 5761 }, { "epoch": 0.8690144031370183, "grad_norm": 0.24551476144593656, "learning_rate": 6.876435751902429e-08, "loss": 1.5914, "step": 5762 }, { "epoch": 0.8691652213256919, "grad_norm": 0.2555129218654098, "learning_rate": 6.872175605376895e-08, "loss": 1.552, "step": 5763 }, { "epoch": 0.8693160395143654, "grad_norm": 0.24750220376331367, "learning_rate": 6.867920090398691e-08, "loss": 1.5317, "step": 5764 }, { "epoch": 0.869466857703039, "grad_norm": 0.25743591918926084, "learning_rate": 6.863669207923304e-08, "loss": 1.5233, "step": 5765 }, { "epoch": 0.8696176758917126, "grad_norm": 0.2628750595949352, "learning_rate": 6.859422958905168e-08, "loss": 1.5431, "step": 5766 }, { "epoch": 0.8697684940803861, "grad_norm": 0.2526559671701926, "learning_rate": 6.8551813442977e-08, "loss": 1.5568, "step": 5767 }, { "epoch": 0.8699193122690596, "grad_norm": 0.2608591221468531, "learning_rate": 6.850944365053267e-08, "loss": 1.577, "step": 5768 }, { "epoch": 0.8700701304577332, "grad_norm": 0.24115166603471838, "learning_rate": 6.846712022123188e-08, "loss": 1.5662, "step": 5769 }, { "epoch": 0.8702209486464068, "grad_norm": 0.25783684286768616, "learning_rate": 6.84248431645775e-08, "loss": 1.5063, "step": 5770 }, { "epoch": 0.8703717668350803, "grad_norm": 0.25905139910313585, "learning_rate": 6.838261249006201e-08, "loss": 1.5644, "step": 5771 }, { "epoch": 0.8705225850237539, "grad_norm": 0.3336424296277969, "learning_rate": 6.834042820716731e-08, "loss": 1.5495, "step": 5772 }, { "epoch": 0.8706734032124274, "grad_norm": 0.25415718407660476, "learning_rate": 6.829829032536512e-08, "loss": 1.5935, "step": 5773 }, { "epoch": 0.8708242214011009, "grad_norm": 0.2627446059365829, "learning_rate": 6.825619885411657e-08, "loss": 1.5677, "step": 5774 }, { "epoch": 0.8709750395897745, "grad_norm": 0.27321392533057337, "learning_rate": 6.82141538028724e-08, "loss": 1.603, "step": 5775 }, { "epoch": 0.8711258577784481, "grad_norm": 0.49729836523555004, "learning_rate": 6.817215518107296e-08, "loss": 1.5617, "step": 5776 }, { "epoch": 0.8712766759671217, "grad_norm": 0.25681974644368083, "learning_rate": 6.813020299814821e-08, "loss": 1.5916, "step": 5777 }, { "epoch": 0.8714274941557952, "grad_norm": 0.2599369277347524, "learning_rate": 6.80882972635176e-08, "loss": 1.5781, "step": 5778 }, { "epoch": 0.8715783123444687, "grad_norm": 0.7437037849338393, "learning_rate": 6.804643798659014e-08, "loss": 1.5506, "step": 5779 }, { "epoch": 0.8717291305331423, "grad_norm": 0.24341262204991174, "learning_rate": 6.800462517676456e-08, "loss": 1.5698, "step": 5780 }, { "epoch": 0.8718799487218158, "grad_norm": 0.28515803115037064, "learning_rate": 6.796285884342899e-08, "loss": 1.5737, "step": 5781 }, { "epoch": 0.8720307669104894, "grad_norm": 0.24612164199905281, "learning_rate": 6.792113899596117e-08, "loss": 1.5673, "step": 5782 }, { "epoch": 0.872181585099163, "grad_norm": 0.25811193708811464, "learning_rate": 6.787946564372847e-08, "loss": 1.5352, "step": 5783 }, { "epoch": 0.8723324032878366, "grad_norm": 0.24997846469445648, "learning_rate": 6.783783879608776e-08, "loss": 1.6021, "step": 5784 }, { "epoch": 0.87248322147651, "grad_norm": 0.25787697298968776, "learning_rate": 6.779625846238543e-08, "loss": 1.542, "step": 5785 }, { "epoch": 0.8726340396651836, "grad_norm": 0.24797558138876954, "learning_rate": 6.775472465195755e-08, "loss": 1.5959, "step": 5786 }, { "epoch": 0.8727848578538572, "grad_norm": 0.24589188754588415, "learning_rate": 6.771323737412964e-08, "loss": 1.6038, "step": 5787 }, { "epoch": 0.8729356760425308, "grad_norm": 0.24759771757639476, "learning_rate": 6.767179663821677e-08, "loss": 1.5319, "step": 5788 }, { "epoch": 0.8730864942312043, "grad_norm": 0.24471607955377447, "learning_rate": 6.763040245352363e-08, "loss": 1.5651, "step": 5789 }, { "epoch": 0.8732373124198778, "grad_norm": 0.27370792316214204, "learning_rate": 6.75890548293444e-08, "loss": 1.5699, "step": 5790 }, { "epoch": 0.8733881306085514, "grad_norm": 0.24812544496721092, "learning_rate": 6.754775377496287e-08, "loss": 1.5811, "step": 5791 }, { "epoch": 0.8735389487972249, "grad_norm": 0.24741161438852477, "learning_rate": 6.750649929965226e-08, "loss": 1.552, "step": 5792 }, { "epoch": 0.8736897669858985, "grad_norm": 0.24811467249527291, "learning_rate": 6.746529141267546e-08, "loss": 1.5063, "step": 5793 }, { "epoch": 0.8738405851745721, "grad_norm": 0.29930632459785994, "learning_rate": 6.742413012328479e-08, "loss": 1.5434, "step": 5794 }, { "epoch": 0.8739914033632457, "grad_norm": 0.276737500035982, "learning_rate": 6.738301544072216e-08, "loss": 1.5531, "step": 5795 }, { "epoch": 0.8741422215519191, "grad_norm": 0.2597364745992147, "learning_rate": 6.734194737421909e-08, "loss": 1.5429, "step": 5796 }, { "epoch": 0.8742930397405927, "grad_norm": 0.26515093591468036, "learning_rate": 6.730092593299652e-08, "loss": 1.5143, "step": 5797 }, { "epoch": 0.8744438579292663, "grad_norm": 0.3663922869437447, "learning_rate": 6.725995112626489e-08, "loss": 1.606, "step": 5798 }, { "epoch": 0.8745946761179398, "grad_norm": 0.3252535706789248, "learning_rate": 6.721902296322432e-08, "loss": 1.437, "step": 5799 }, { "epoch": 0.8747454943066134, "grad_norm": 0.2568741329223731, "learning_rate": 6.717814145306436e-08, "loss": 1.5478, "step": 5800 }, { "epoch": 0.8748963124952869, "grad_norm": 0.24996302895460534, "learning_rate": 6.713730660496409e-08, "loss": 1.5551, "step": 5801 }, { "epoch": 0.8750471306839605, "grad_norm": 0.2545392985313996, "learning_rate": 6.709651842809205e-08, "loss": 1.6085, "step": 5802 }, { "epoch": 0.875197948872634, "grad_norm": 0.25044393006807736, "learning_rate": 6.70557769316065e-08, "loss": 1.503, "step": 5803 }, { "epoch": 0.8753487670613076, "grad_norm": 0.24949983065925968, "learning_rate": 6.701508212465503e-08, "loss": 1.5185, "step": 5804 }, { "epoch": 0.8754995852499812, "grad_norm": 0.24634745287309373, "learning_rate": 6.697443401637479e-08, "loss": 1.53, "step": 5805 }, { "epoch": 0.8756504034386547, "grad_norm": 0.2546824800620017, "learning_rate": 6.69338326158925e-08, "loss": 1.5123, "step": 5806 }, { "epoch": 0.8758012216273282, "grad_norm": 0.36811860514044387, "learning_rate": 6.689327793232436e-08, "loss": 1.5241, "step": 5807 }, { "epoch": 0.8759520398160018, "grad_norm": 0.2606483365818859, "learning_rate": 6.685276997477603e-08, "loss": 1.5727, "step": 5808 }, { "epoch": 0.8761028580046754, "grad_norm": 0.2734831217011574, "learning_rate": 6.681230875234277e-08, "loss": 1.5894, "step": 5809 }, { "epoch": 0.8762536761933489, "grad_norm": 0.3024542050398058, "learning_rate": 6.677189427410932e-08, "loss": 1.6002, "step": 5810 }, { "epoch": 0.8764044943820225, "grad_norm": 0.2668761682946003, "learning_rate": 6.673152654914985e-08, "loss": 1.5581, "step": 5811 }, { "epoch": 0.8765553125706961, "grad_norm": 0.2625276781394117, "learning_rate": 6.669120558652811e-08, "loss": 1.6457, "step": 5812 }, { "epoch": 0.8767061307593695, "grad_norm": 0.24232071568159685, "learning_rate": 6.665093139529735e-08, "loss": 1.5521, "step": 5813 }, { "epoch": 0.8768569489480431, "grad_norm": 0.2624439848241586, "learning_rate": 6.661070398450031e-08, "loss": 1.5716, "step": 5814 }, { "epoch": 0.8770077671367167, "grad_norm": 0.25094127590812293, "learning_rate": 6.657052336316915e-08, "loss": 1.6076, "step": 5815 }, { "epoch": 0.8771585853253903, "grad_norm": 0.25558888200209967, "learning_rate": 6.653038954032567e-08, "loss": 1.5274, "step": 5816 }, { "epoch": 0.8773094035140638, "grad_norm": 0.28301639685082025, "learning_rate": 6.649030252498103e-08, "loss": 1.6492, "step": 5817 }, { "epoch": 0.8774602217027373, "grad_norm": 0.2522194226059869, "learning_rate": 6.645026232613591e-08, "loss": 1.49, "step": 5818 }, { "epoch": 0.8776110398914109, "grad_norm": 0.25572083691157027, "learning_rate": 6.64102689527806e-08, "loss": 1.5293, "step": 5819 }, { "epoch": 0.8777618580800844, "grad_norm": 0.2575052070493476, "learning_rate": 6.637032241389469e-08, "loss": 1.6022, "step": 5820 }, { "epoch": 0.877912676268758, "grad_norm": 0.2473719042944877, "learning_rate": 6.633042271844735e-08, "loss": 1.52, "step": 5821 }, { "epoch": 0.8780634944574316, "grad_norm": 0.26112198892787164, "learning_rate": 6.629056987539724e-08, "loss": 1.5811, "step": 5822 }, { "epoch": 0.8782143126461052, "grad_norm": 0.26605878544749945, "learning_rate": 6.625076389369253e-08, "loss": 1.532, "step": 5823 }, { "epoch": 0.8783651308347786, "grad_norm": 0.2705578409821992, "learning_rate": 6.621100478227074e-08, "loss": 1.5703, "step": 5824 }, { "epoch": 0.8785159490234522, "grad_norm": 0.3073932781527974, "learning_rate": 6.617129255005897e-08, "loss": 1.5783, "step": 5825 }, { "epoch": 0.8786667672121258, "grad_norm": 0.24982712134015259, "learning_rate": 6.613162720597379e-08, "loss": 1.539, "step": 5826 }, { "epoch": 0.8788175854007994, "grad_norm": 0.2527065045533689, "learning_rate": 6.609200875892125e-08, "loss": 1.5615, "step": 5827 }, { "epoch": 0.8789684035894729, "grad_norm": 0.2579373139291263, "learning_rate": 6.605243721779679e-08, "loss": 1.499, "step": 5828 }, { "epoch": 0.8791192217781465, "grad_norm": 0.2577578580334874, "learning_rate": 6.601291259148542e-08, "loss": 1.6307, "step": 5829 }, { "epoch": 0.87927003996682, "grad_norm": 0.4646593161063779, "learning_rate": 6.597343488886152e-08, "loss": 1.592, "step": 5830 }, { "epoch": 0.8794208581554935, "grad_norm": 0.24590121517129615, "learning_rate": 6.593400411878904e-08, "loss": 1.5284, "step": 5831 }, { "epoch": 0.8795716763441671, "grad_norm": 0.2546596404400722, "learning_rate": 6.58946202901213e-08, "loss": 1.5823, "step": 5832 }, { "epoch": 0.8797224945328407, "grad_norm": 0.24785867246068702, "learning_rate": 6.585528341170113e-08, "loss": 1.6144, "step": 5833 }, { "epoch": 0.8798733127215143, "grad_norm": 0.257903523205089, "learning_rate": 6.58159934923608e-08, "loss": 1.6212, "step": 5834 }, { "epoch": 0.8800241309101877, "grad_norm": 0.28376176827500943, "learning_rate": 6.577675054092204e-08, "loss": 1.5282, "step": 5835 }, { "epoch": 0.8801749490988613, "grad_norm": 0.2413382149580761, "learning_rate": 6.573755456619605e-08, "loss": 1.5698, "step": 5836 }, { "epoch": 0.8803257672875349, "grad_norm": 0.841519277052757, "learning_rate": 6.569840557698348e-08, "loss": 1.56, "step": 5837 }, { "epoch": 0.8804765854762084, "grad_norm": 0.24583507349649472, "learning_rate": 6.565930358207432e-08, "loss": 1.5974, "step": 5838 }, { "epoch": 0.880627403664882, "grad_norm": 0.25423698271210005, "learning_rate": 6.562024859024823e-08, "loss": 1.6486, "step": 5839 }, { "epoch": 0.8807782218535556, "grad_norm": 0.2737973890941905, "learning_rate": 6.558124061027416e-08, "loss": 1.5341, "step": 5840 }, { "epoch": 0.880929040042229, "grad_norm": 0.23529592907654184, "learning_rate": 6.554227965091046e-08, "loss": 1.5336, "step": 5841 }, { "epoch": 0.8810798582309026, "grad_norm": 0.2706876439468758, "learning_rate": 6.550336572090511e-08, "loss": 1.5282, "step": 5842 }, { "epoch": 0.8812306764195762, "grad_norm": 0.25116041596406796, "learning_rate": 6.546449882899538e-08, "loss": 1.5206, "step": 5843 }, { "epoch": 0.8813814946082498, "grad_norm": 0.2541844979530065, "learning_rate": 6.542567898390796e-08, "loss": 1.5461, "step": 5844 }, { "epoch": 0.8815323127969233, "grad_norm": 0.2602737220053533, "learning_rate": 6.538690619435913e-08, "loss": 1.5829, "step": 5845 }, { "epoch": 0.8816831309855968, "grad_norm": 0.5266348515071663, "learning_rate": 6.534818046905448e-08, "loss": 1.6131, "step": 5846 }, { "epoch": 0.8818339491742704, "grad_norm": 0.23945077495057282, "learning_rate": 6.5309501816689e-08, "loss": 1.5225, "step": 5847 }, { "epoch": 0.881984767362944, "grad_norm": 0.24306577197166404, "learning_rate": 6.527087024594727e-08, "loss": 1.6451, "step": 5848 }, { "epoch": 0.8821355855516175, "grad_norm": 0.2511505414286209, "learning_rate": 6.523228576550313e-08, "loss": 1.5301, "step": 5849 }, { "epoch": 0.8822864037402911, "grad_norm": 0.3157981994234282, "learning_rate": 6.519374838401996e-08, "loss": 1.547, "step": 5850 }, { "epoch": 0.8824372219289647, "grad_norm": 0.24197185158085258, "learning_rate": 6.51552581101505e-08, "loss": 1.5305, "step": 5851 }, { "epoch": 0.8825880401176381, "grad_norm": 0.2780996962107519, "learning_rate": 6.511681495253696e-08, "loss": 1.5642, "step": 5852 }, { "epoch": 0.8827388583063117, "grad_norm": 0.25352910001836493, "learning_rate": 6.507841891981095e-08, "loss": 1.534, "step": 5853 }, { "epoch": 0.8828896764949853, "grad_norm": 0.25044494272911494, "learning_rate": 6.504007002059343e-08, "loss": 1.5529, "step": 5854 }, { "epoch": 0.8830404946836589, "grad_norm": 0.3117023973963776, "learning_rate": 6.500176826349495e-08, "loss": 1.5422, "step": 5855 }, { "epoch": 0.8831913128723324, "grad_norm": 0.2442724461387855, "learning_rate": 6.496351365711531e-08, "loss": 1.5985, "step": 5856 }, { "epoch": 0.883342131061006, "grad_norm": 0.270081945555682, "learning_rate": 6.492530621004377e-08, "loss": 1.494, "step": 5857 }, { "epoch": 0.8834929492496795, "grad_norm": 0.2752048956448727, "learning_rate": 6.488714593085907e-08, "loss": 1.5454, "step": 5858 }, { "epoch": 0.883643767438353, "grad_norm": 0.25020273816589, "learning_rate": 6.484903282812927e-08, "loss": 1.5419, "step": 5859 }, { "epoch": 0.8837945856270266, "grad_norm": 0.2552255559663703, "learning_rate": 6.481096691041184e-08, "loss": 1.5507, "step": 5860 }, { "epoch": 0.8839454038157002, "grad_norm": 0.24406927446969745, "learning_rate": 6.477294818625375e-08, "loss": 1.5815, "step": 5861 }, { "epoch": 0.8840962220043738, "grad_norm": 0.2691424640015323, "learning_rate": 6.473497666419127e-08, "loss": 1.5694, "step": 5862 }, { "epoch": 0.8842470401930472, "grad_norm": 0.27891157110759063, "learning_rate": 6.469705235275016e-08, "loss": 1.5966, "step": 5863 }, { "epoch": 0.8843978583817208, "grad_norm": 0.4048163210053456, "learning_rate": 6.465917526044541e-08, "loss": 1.6641, "step": 5864 }, { "epoch": 0.8845486765703944, "grad_norm": 0.2517174592052388, "learning_rate": 6.462134539578169e-08, "loss": 1.6265, "step": 5865 }, { "epoch": 0.884699494759068, "grad_norm": 0.2535782327644837, "learning_rate": 6.458356276725281e-08, "loss": 1.5737, "step": 5866 }, { "epoch": 0.8848503129477415, "grad_norm": 0.24205301714222377, "learning_rate": 6.454582738334208e-08, "loss": 1.589, "step": 5867 }, { "epoch": 0.8850011311364151, "grad_norm": 0.2564795886088914, "learning_rate": 6.450813925252221e-08, "loss": 1.5431, "step": 5868 }, { "epoch": 0.8851519493250886, "grad_norm": 0.4133754335843493, "learning_rate": 6.447049838325529e-08, "loss": 1.5507, "step": 5869 }, { "epoch": 0.8853027675137621, "grad_norm": 0.28005206464773075, "learning_rate": 6.443290478399276e-08, "loss": 1.5409, "step": 5870 }, { "epoch": 0.8854535857024357, "grad_norm": 0.5400376931836294, "learning_rate": 6.439535846317553e-08, "loss": 1.5197, "step": 5871 }, { "epoch": 0.8856044038911093, "grad_norm": 0.2735622427700534, "learning_rate": 6.435785942923379e-08, "loss": 1.6257, "step": 5872 }, { "epoch": 0.8857552220797829, "grad_norm": 0.26715549035195923, "learning_rate": 6.432040769058716e-08, "loss": 1.541, "step": 5873 }, { "epoch": 0.8859060402684564, "grad_norm": 0.2550453636354242, "learning_rate": 6.42830032556447e-08, "loss": 1.6178, "step": 5874 }, { "epoch": 0.8860568584571299, "grad_norm": 0.24667162238930262, "learning_rate": 6.424564613280478e-08, "loss": 1.566, "step": 5875 }, { "epoch": 0.8862076766458035, "grad_norm": 0.25575405213987346, "learning_rate": 6.420833633045514e-08, "loss": 1.5986, "step": 5876 }, { "epoch": 0.886358494834477, "grad_norm": 0.2567939555373869, "learning_rate": 6.41710738569729e-08, "loss": 1.6104, "step": 5877 }, { "epoch": 0.8865093130231506, "grad_norm": 0.27743083094419724, "learning_rate": 6.413385872072462e-08, "loss": 1.5155, "step": 5878 }, { "epoch": 0.8866601312118242, "grad_norm": 0.25766363740830706, "learning_rate": 6.409669093006618e-08, "loss": 1.509, "step": 5879 }, { "epoch": 0.8868109494004977, "grad_norm": 0.29953726419607446, "learning_rate": 6.405957049334275e-08, "loss": 1.5885, "step": 5880 }, { "epoch": 0.8869617675891712, "grad_norm": 0.24196780546874763, "learning_rate": 6.402249741888905e-08, "loss": 1.5239, "step": 5881 }, { "epoch": 0.8871125857778448, "grad_norm": 0.24233031263300328, "learning_rate": 6.398547171502903e-08, "loss": 1.464, "step": 5882 }, { "epoch": 0.8872634039665184, "grad_norm": 0.24816442235875175, "learning_rate": 6.3948493390076e-08, "loss": 1.5234, "step": 5883 }, { "epoch": 0.887414222155192, "grad_norm": 0.2480019620615259, "learning_rate": 6.391156245233272e-08, "loss": 1.5351, "step": 5884 }, { "epoch": 0.8875650403438655, "grad_norm": 0.2560935364839055, "learning_rate": 6.387467891009125e-08, "loss": 1.6397, "step": 5885 }, { "epoch": 0.887715858532539, "grad_norm": 0.269378814178076, "learning_rate": 6.383784277163299e-08, "loss": 1.5915, "step": 5886 }, { "epoch": 0.8878666767212126, "grad_norm": 0.2667927821700778, "learning_rate": 6.380105404522875e-08, "loss": 1.6394, "step": 5887 }, { "epoch": 0.8880174949098861, "grad_norm": 0.24908878957242192, "learning_rate": 6.37643127391387e-08, "loss": 1.6094, "step": 5888 }, { "epoch": 0.8881683130985597, "grad_norm": 0.4882189504331334, "learning_rate": 6.372761886161231e-08, "loss": 1.5773, "step": 5889 }, { "epoch": 0.8883191312872333, "grad_norm": 0.25313376930267606, "learning_rate": 6.369097242088837e-08, "loss": 1.5588, "step": 5890 }, { "epoch": 0.8884699494759067, "grad_norm": 0.2658884687807718, "learning_rate": 6.365437342519517e-08, "loss": 1.523, "step": 5891 }, { "epoch": 0.8886207676645803, "grad_norm": 0.2529329522885053, "learning_rate": 6.361782188275018e-08, "loss": 1.5984, "step": 5892 }, { "epoch": 0.8887715858532539, "grad_norm": 0.2774872541482239, "learning_rate": 6.358131780176028e-08, "loss": 1.5754, "step": 5893 }, { "epoch": 0.8889224040419275, "grad_norm": 0.2421467165011948, "learning_rate": 6.354486119042178e-08, "loss": 1.485, "step": 5894 }, { "epoch": 0.889073222230601, "grad_norm": 0.2715653990408799, "learning_rate": 6.350845205692016e-08, "loss": 1.4851, "step": 5895 }, { "epoch": 0.8892240404192746, "grad_norm": 0.32996512731212596, "learning_rate": 6.347209040943034e-08, "loss": 1.5237, "step": 5896 }, { "epoch": 0.8893748586079481, "grad_norm": 0.2811997761623907, "learning_rate": 6.343577625611666e-08, "loss": 1.5832, "step": 5897 }, { "epoch": 0.8895256767966216, "grad_norm": 0.25437689079194636, "learning_rate": 6.339950960513261e-08, "loss": 1.5377, "step": 5898 }, { "epoch": 0.8896764949852952, "grad_norm": 0.24800163959629032, "learning_rate": 6.336329046462115e-08, "loss": 1.4737, "step": 5899 }, { "epoch": 0.8898273131739688, "grad_norm": 0.28929413175546753, "learning_rate": 6.332711884271452e-08, "loss": 1.5879, "step": 5900 }, { "epoch": 0.8899781313626424, "grad_norm": 0.2530746149930727, "learning_rate": 6.32909947475343e-08, "loss": 1.5705, "step": 5901 }, { "epoch": 0.8901289495513159, "grad_norm": 0.262257833317664, "learning_rate": 6.325491818719144e-08, "loss": 1.5289, "step": 5902 }, { "epoch": 0.8902797677399894, "grad_norm": 0.2548884487560699, "learning_rate": 6.321888916978609e-08, "loss": 1.5929, "step": 5903 }, { "epoch": 0.890430585928663, "grad_norm": 0.25979559417848014, "learning_rate": 6.31829077034079e-08, "loss": 1.6308, "step": 5904 }, { "epoch": 0.8905814041173366, "grad_norm": 0.24462400630588177, "learning_rate": 6.314697379613569e-08, "loss": 1.5462, "step": 5905 }, { "epoch": 0.8907322223060101, "grad_norm": 0.25649500321946517, "learning_rate": 6.311108745603775e-08, "loss": 1.6261, "step": 5906 }, { "epoch": 0.8908830404946837, "grad_norm": 0.3051374242499988, "learning_rate": 6.307524869117154e-08, "loss": 1.5937, "step": 5907 }, { "epoch": 0.8910338586833572, "grad_norm": 0.26413628832939406, "learning_rate": 6.303945750958393e-08, "loss": 1.5781, "step": 5908 }, { "epoch": 0.8911846768720307, "grad_norm": 0.2699633938984024, "learning_rate": 6.300371391931113e-08, "loss": 1.5348, "step": 5909 }, { "epoch": 0.8913354950607043, "grad_norm": 0.28555157419570487, "learning_rate": 6.296801792837848e-08, "loss": 1.4669, "step": 5910 }, { "epoch": 0.8914863132493779, "grad_norm": 0.48857873667811774, "learning_rate": 6.293236954480092e-08, "loss": 1.5307, "step": 5911 }, { "epoch": 0.8916371314380515, "grad_norm": 0.23834644553922518, "learning_rate": 6.28967687765825e-08, "loss": 1.5111, "step": 5912 }, { "epoch": 0.891787949626725, "grad_norm": 0.26603501578231303, "learning_rate": 6.286121563171657e-08, "loss": 1.5718, "step": 5913 }, { "epoch": 0.8919387678153985, "grad_norm": 0.23842106316087344, "learning_rate": 6.282571011818594e-08, "loss": 1.4927, "step": 5914 }, { "epoch": 0.8920895860040721, "grad_norm": 0.29860749122389607, "learning_rate": 6.279025224396257e-08, "loss": 1.5237, "step": 5915 }, { "epoch": 0.8922404041927456, "grad_norm": 0.36973073453293803, "learning_rate": 6.275484201700779e-08, "loss": 1.5553, "step": 5916 }, { "epoch": 0.8923912223814192, "grad_norm": 0.26957002690688664, "learning_rate": 6.271947944527226e-08, "loss": 1.5176, "step": 5917 }, { "epoch": 0.8925420405700928, "grad_norm": 0.24833599270790296, "learning_rate": 6.268416453669588e-08, "loss": 1.6262, "step": 5918 }, { "epoch": 0.8926928587587664, "grad_norm": 0.26736278994547186, "learning_rate": 6.264889729920788e-08, "loss": 1.4807, "step": 5919 }, { "epoch": 0.8928436769474398, "grad_norm": 0.2528725015623935, "learning_rate": 6.261367774072677e-08, "loss": 1.5732, "step": 5920 }, { "epoch": 0.8929944951361134, "grad_norm": 0.25935914936506105, "learning_rate": 6.25785058691604e-08, "loss": 1.4699, "step": 5921 }, { "epoch": 0.893145313324787, "grad_norm": 0.25700196910473727, "learning_rate": 6.254338169240588e-08, "loss": 1.5718, "step": 5922 }, { "epoch": 0.8932961315134605, "grad_norm": 0.2612362352666459, "learning_rate": 6.250830521834953e-08, "loss": 1.4954, "step": 5923 }, { "epoch": 0.8934469497021341, "grad_norm": 0.2398806807867513, "learning_rate": 6.247327645486715e-08, "loss": 1.5542, "step": 5924 }, { "epoch": 0.8935977678908076, "grad_norm": 0.24741778039983103, "learning_rate": 6.243829540982365e-08, "loss": 1.5585, "step": 5925 }, { "epoch": 0.8937485860794812, "grad_norm": 0.2533496331004564, "learning_rate": 6.24033620910733e-08, "loss": 1.5317, "step": 5926 }, { "epoch": 0.8938994042681547, "grad_norm": 0.24478317756933246, "learning_rate": 6.236847650645969e-08, "loss": 1.5864, "step": 5927 }, { "epoch": 0.8940502224568283, "grad_norm": 0.25150332003977416, "learning_rate": 6.233363866381562e-08, "loss": 1.5466, "step": 5928 }, { "epoch": 0.8942010406455019, "grad_norm": 0.2718123301282069, "learning_rate": 6.229884857096318e-08, "loss": 1.5986, "step": 5929 }, { "epoch": 0.8943518588341755, "grad_norm": 0.4184994268005829, "learning_rate": 6.226410623571375e-08, "loss": 1.6133, "step": 5930 }, { "epoch": 0.8945026770228489, "grad_norm": 0.2862373432969868, "learning_rate": 6.222941166586807e-08, "loss": 1.569, "step": 5931 }, { "epoch": 0.8946534952115225, "grad_norm": 0.28116572971179365, "learning_rate": 6.219476486921604e-08, "loss": 1.5196, "step": 5932 }, { "epoch": 0.8948043134001961, "grad_norm": 0.27920990887066083, "learning_rate": 6.216016585353684e-08, "loss": 1.5565, "step": 5933 }, { "epoch": 0.8949551315888696, "grad_norm": 0.9468327930177274, "learning_rate": 6.212561462659899e-08, "loss": 1.4884, "step": 5934 }, { "epoch": 0.8951059497775432, "grad_norm": 0.24085009761212245, "learning_rate": 6.209111119616026e-08, "loss": 1.5676, "step": 5935 }, { "epoch": 0.8952567679662167, "grad_norm": 0.24640198307092454, "learning_rate": 6.205665556996763e-08, "loss": 1.5443, "step": 5936 }, { "epoch": 0.8954075861548902, "grad_norm": 0.244946177856438, "learning_rate": 6.202224775575743e-08, "loss": 1.5786, "step": 5937 }, { "epoch": 0.8955584043435638, "grad_norm": 0.23629801333322237, "learning_rate": 6.198788776125523e-08, "loss": 1.5061, "step": 5938 }, { "epoch": 0.8957092225322374, "grad_norm": 0.24437752940312316, "learning_rate": 6.195357559417579e-08, "loss": 1.5267, "step": 5939 }, { "epoch": 0.895860040720911, "grad_norm": 1.1115769329679674, "learning_rate": 6.191931126222327e-08, "loss": 1.5187, "step": 5940 }, { "epoch": 0.8960108589095845, "grad_norm": 0.25349389272698614, "learning_rate": 6.188509477309095e-08, "loss": 1.5622, "step": 5941 }, { "epoch": 0.896161677098258, "grad_norm": 0.45245419843398993, "learning_rate": 6.185092613446144e-08, "loss": 1.5551, "step": 5942 }, { "epoch": 0.8963124952869316, "grad_norm": 0.27820637443519197, "learning_rate": 6.181680535400658e-08, "loss": 1.6205, "step": 5943 }, { "epoch": 0.8964633134756052, "grad_norm": 0.26015948602270705, "learning_rate": 6.178273243938754e-08, "loss": 1.5722, "step": 5944 }, { "epoch": 0.8966141316642787, "grad_norm": 0.2531641812074329, "learning_rate": 6.174870739825462e-08, "loss": 1.5779, "step": 5945 }, { "epoch": 0.8967649498529523, "grad_norm": 0.28896923356037835, "learning_rate": 6.171473023824745e-08, "loss": 1.5222, "step": 5946 }, { "epoch": 0.8969157680416259, "grad_norm": 0.257840125587561, "learning_rate": 6.168080096699491e-08, "loss": 1.5753, "step": 5947 }, { "epoch": 0.8970665862302993, "grad_norm": 0.2601253932992487, "learning_rate": 6.164691959211508e-08, "loss": 1.5733, "step": 5948 }, { "epoch": 0.8972174044189729, "grad_norm": 0.2693931957959471, "learning_rate": 6.161308612121533e-08, "loss": 1.5236, "step": 5949 }, { "epoch": 0.8973682226076465, "grad_norm": 0.4050835473138554, "learning_rate": 6.157930056189226e-08, "loss": 1.565, "step": 5950 }, { "epoch": 0.8975190407963201, "grad_norm": 0.24394173590491194, "learning_rate": 6.154556292173172e-08, "loss": 1.4795, "step": 5951 }, { "epoch": 0.8976698589849936, "grad_norm": 0.26346869726216715, "learning_rate": 6.151187320830875e-08, "loss": 1.541, "step": 5952 }, { "epoch": 0.8978206771736671, "grad_norm": 0.25805070681206316, "learning_rate": 6.147823142918775e-08, "loss": 1.531, "step": 5953 }, { "epoch": 0.8979714953623407, "grad_norm": 0.24566885665807253, "learning_rate": 6.144463759192224e-08, "loss": 1.5557, "step": 5954 }, { "epoch": 0.8981223135510142, "grad_norm": 0.2520165272941859, "learning_rate": 6.1411091704055e-08, "loss": 1.6103, "step": 5955 }, { "epoch": 0.8982731317396878, "grad_norm": 0.24379749398124398, "learning_rate": 6.137759377311803e-08, "loss": 1.4831, "step": 5956 }, { "epoch": 0.8984239499283614, "grad_norm": 0.2479661131005401, "learning_rate": 6.134414380663268e-08, "loss": 1.5335, "step": 5957 }, { "epoch": 0.898574768117035, "grad_norm": 0.3180718221807806, "learning_rate": 6.131074181210937e-08, "loss": 1.5028, "step": 5958 }, { "epoch": 0.8987255863057084, "grad_norm": 0.3582739368580261, "learning_rate": 6.127738779704781e-08, "loss": 1.5343, "step": 5959 }, { "epoch": 0.898876404494382, "grad_norm": 0.24072622519868206, "learning_rate": 6.124408176893704e-08, "loss": 1.4762, "step": 5960 }, { "epoch": 0.8990272226830556, "grad_norm": 0.24225971651789271, "learning_rate": 6.121082373525516e-08, "loss": 1.5625, "step": 5961 }, { "epoch": 0.8991780408717291, "grad_norm": 0.2878400417245367, "learning_rate": 6.117761370346952e-08, "loss": 1.5547, "step": 5962 }, { "epoch": 0.8993288590604027, "grad_norm": 0.25902823185013885, "learning_rate": 6.114445168103686e-08, "loss": 1.5859, "step": 5963 }, { "epoch": 0.8994796772490763, "grad_norm": 0.2565759083430694, "learning_rate": 6.111133767540296e-08, "loss": 1.527, "step": 5964 }, { "epoch": 0.8996304954377498, "grad_norm": 0.25825035292048387, "learning_rate": 6.107827169400286e-08, "loss": 1.6749, "step": 5965 }, { "epoch": 0.8997813136264233, "grad_norm": 0.24646332618767774, "learning_rate": 6.104525374426087e-08, "loss": 1.5141, "step": 5966 }, { "epoch": 0.8999321318150969, "grad_norm": 0.28275712404453535, "learning_rate": 6.101228383359045e-08, "loss": 1.5549, "step": 5967 }, { "epoch": 0.9000829500037705, "grad_norm": 0.24842922555258082, "learning_rate": 6.097936196939434e-08, "loss": 1.5833, "step": 5968 }, { "epoch": 0.900233768192444, "grad_norm": 0.3588249034386644, "learning_rate": 6.094648815906439e-08, "loss": 1.5332, "step": 5969 }, { "epoch": 0.9003845863811175, "grad_norm": 0.2641650885366285, "learning_rate": 6.091366240998182e-08, "loss": 1.6196, "step": 5970 }, { "epoch": 0.9005354045697911, "grad_norm": 0.24696161829150579, "learning_rate": 6.088088472951694e-08, "loss": 1.5711, "step": 5971 }, { "epoch": 0.9006862227584647, "grad_norm": 0.24249422449486577, "learning_rate": 6.084815512502922e-08, "loss": 1.5507, "step": 5972 }, { "epoch": 0.9008370409471382, "grad_norm": 0.24074282720599494, "learning_rate": 6.081547360386748e-08, "loss": 1.5016, "step": 5973 }, { "epoch": 0.9009878591358118, "grad_norm": 0.27379028533114474, "learning_rate": 6.07828401733697e-08, "loss": 1.5807, "step": 5974 }, { "epoch": 0.9011386773244854, "grad_norm": 0.24662719532027477, "learning_rate": 6.075025484086294e-08, "loss": 1.526, "step": 5975 }, { "epoch": 0.9012894955131588, "grad_norm": 0.26352249578185005, "learning_rate": 6.071771761366364e-08, "loss": 1.5176, "step": 5976 }, { "epoch": 0.9014403137018324, "grad_norm": 0.25389674701605053, "learning_rate": 6.068522849907734e-08, "loss": 1.5477, "step": 5977 }, { "epoch": 0.901591131890506, "grad_norm": 0.2621310667880105, "learning_rate": 6.065278750439873e-08, "loss": 1.5737, "step": 5978 }, { "epoch": 0.9017419500791796, "grad_norm": 0.2616883665866148, "learning_rate": 6.062039463691185e-08, "loss": 1.5583, "step": 5979 }, { "epoch": 0.9018927682678531, "grad_norm": 0.24728104360191377, "learning_rate": 6.058804990388979e-08, "loss": 1.526, "step": 5980 }, { "epoch": 0.9020435864565266, "grad_norm": 0.2559460115090124, "learning_rate": 6.055575331259488e-08, "loss": 1.5865, "step": 5981 }, { "epoch": 0.9021944046452002, "grad_norm": 0.25760239630524945, "learning_rate": 6.052350487027865e-08, "loss": 1.57, "step": 5982 }, { "epoch": 0.9023452228338738, "grad_norm": 0.24738618795501038, "learning_rate": 6.049130458418183e-08, "loss": 1.6156, "step": 5983 }, { "epoch": 0.9024960410225473, "grad_norm": 0.26018903050986375, "learning_rate": 6.045915246153435e-08, "loss": 1.561, "step": 5984 }, { "epoch": 0.9026468592112209, "grad_norm": 0.2568670949435877, "learning_rate": 6.042704850955521e-08, "loss": 1.541, "step": 5985 }, { "epoch": 0.9027976773998945, "grad_norm": 0.25089162499423023, "learning_rate": 6.039499273545277e-08, "loss": 1.5407, "step": 5986 }, { "epoch": 0.9029484955885679, "grad_norm": 0.24381195120656876, "learning_rate": 6.036298514642445e-08, "loss": 1.5889, "step": 5987 }, { "epoch": 0.9030993137772415, "grad_norm": 0.25755710349757727, "learning_rate": 6.033102574965685e-08, "loss": 1.5078, "step": 5988 }, { "epoch": 0.9032501319659151, "grad_norm": 0.23761719968154357, "learning_rate": 6.029911455232585e-08, "loss": 1.4897, "step": 5989 }, { "epoch": 0.9034009501545887, "grad_norm": 0.24978975522047717, "learning_rate": 6.026725156159643e-08, "loss": 1.6201, "step": 5990 }, { "epoch": 0.9035517683432622, "grad_norm": 0.2730426495004039, "learning_rate": 6.023543678462268e-08, "loss": 1.5745, "step": 5991 }, { "epoch": 0.9037025865319358, "grad_norm": 0.2529225158454245, "learning_rate": 6.020367022854805e-08, "loss": 1.5771, "step": 5992 }, { "epoch": 0.9038534047206093, "grad_norm": 0.2624770484136695, "learning_rate": 6.017195190050502e-08, "loss": 1.526, "step": 5993 }, { "epoch": 0.9040042229092828, "grad_norm": 0.24752289297805732, "learning_rate": 6.014028180761527e-08, "loss": 1.5887, "step": 5994 }, { "epoch": 0.9041550410979564, "grad_norm": 0.25359951881522097, "learning_rate": 6.010865995698965e-08, "loss": 1.5637, "step": 5995 }, { "epoch": 0.90430585928663, "grad_norm": 0.24977479265531782, "learning_rate": 6.007708635572821e-08, "loss": 1.675, "step": 5996 }, { "epoch": 0.9044566774753036, "grad_norm": 0.31466477706062923, "learning_rate": 6.004556101092012e-08, "loss": 1.5225, "step": 5997 }, { "epoch": 0.904607495663977, "grad_norm": 0.24303541638011292, "learning_rate": 6.001408392964375e-08, "loss": 1.5208, "step": 5998 }, { "epoch": 0.9047583138526506, "grad_norm": 0.6739544452758239, "learning_rate": 5.998265511896662e-08, "loss": 1.5446, "step": 5999 }, { "epoch": 0.9049091320413242, "grad_norm": 0.25687482238901066, "learning_rate": 5.995127458594542e-08, "loss": 1.5913, "step": 6000 }, { "epoch": 0.9050599502299977, "grad_norm": 0.2550897498231508, "learning_rate": 5.991994233762597e-08, "loss": 1.5659, "step": 6001 }, { "epoch": 0.9052107684186713, "grad_norm": 0.24268037160138126, "learning_rate": 5.988865838104332e-08, "loss": 1.5651, "step": 6002 }, { "epoch": 0.9053615866073449, "grad_norm": 0.2731439566265382, "learning_rate": 5.985742272322157e-08, "loss": 1.5762, "step": 6003 }, { "epoch": 0.9055124047960184, "grad_norm": 0.26038850236812383, "learning_rate": 5.982623537117405e-08, "loss": 1.5946, "step": 6004 }, { "epoch": 0.9056632229846919, "grad_norm": 0.8607046633015673, "learning_rate": 5.979509633190327e-08, "loss": 1.5304, "step": 6005 }, { "epoch": 0.9058140411733655, "grad_norm": 0.25976877570104584, "learning_rate": 5.976400561240083e-08, "loss": 1.575, "step": 6006 }, { "epoch": 0.9059648593620391, "grad_norm": 0.30493413653552726, "learning_rate": 5.97329632196475e-08, "loss": 1.4739, "step": 6007 }, { "epoch": 0.9061156775507127, "grad_norm": 0.26836670517825983, "learning_rate": 5.970196916061314e-08, "loss": 1.6054, "step": 6008 }, { "epoch": 0.9062664957393862, "grad_norm": 0.23988289191523413, "learning_rate": 5.967102344225691e-08, "loss": 1.5178, "step": 6009 }, { "epoch": 0.9064173139280597, "grad_norm": 0.30141474342371144, "learning_rate": 5.964012607152701e-08, "loss": 1.5522, "step": 6010 }, { "epoch": 0.9065681321167333, "grad_norm": 0.2545175356890916, "learning_rate": 5.960927705536076e-08, "loss": 1.5099, "step": 6011 }, { "epoch": 0.9067189503054068, "grad_norm": 0.26200645955094654, "learning_rate": 5.957847640068467e-08, "loss": 1.5973, "step": 6012 }, { "epoch": 0.9068697684940804, "grad_norm": 0.25436559423044286, "learning_rate": 5.954772411441442e-08, "loss": 1.5871, "step": 6013 }, { "epoch": 0.907020586682754, "grad_norm": 0.2684570896237277, "learning_rate": 5.9517020203454756e-08, "loss": 1.5906, "step": 6014 }, { "epoch": 0.9071714048714274, "grad_norm": 0.44423987142588256, "learning_rate": 5.948636467469963e-08, "loss": 1.5618, "step": 6015 }, { "epoch": 0.907322223060101, "grad_norm": 0.2465686813729949, "learning_rate": 5.945575753503207e-08, "loss": 1.5585, "step": 6016 }, { "epoch": 0.9074730412487746, "grad_norm": 0.25285875225744253, "learning_rate": 5.942519879132427e-08, "loss": 1.566, "step": 6017 }, { "epoch": 0.9076238594374482, "grad_norm": 0.24574371258077005, "learning_rate": 5.93946884504376e-08, "loss": 1.6016, "step": 6018 }, { "epoch": 0.9077746776261217, "grad_norm": 0.24671212121303002, "learning_rate": 5.93642265192225e-08, "loss": 1.5013, "step": 6019 }, { "epoch": 0.9079254958147953, "grad_norm": 0.2734618189493248, "learning_rate": 5.933381300451855e-08, "loss": 1.5508, "step": 6020 }, { "epoch": 0.9080763140034688, "grad_norm": 0.24606026666908068, "learning_rate": 5.9303447913154435e-08, "loss": 1.5824, "step": 6021 }, { "epoch": 0.9082271321921424, "grad_norm": 0.27087054216315615, "learning_rate": 5.927313125194809e-08, "loss": 1.5711, "step": 6022 }, { "epoch": 0.9083779503808159, "grad_norm": 1.7962946270160636, "learning_rate": 5.924286302770642e-08, "loss": 1.523, "step": 6023 }, { "epoch": 0.9085287685694895, "grad_norm": 0.2423753250984084, "learning_rate": 5.921264324722553e-08, "loss": 1.5427, "step": 6024 }, { "epoch": 0.9086795867581631, "grad_norm": 0.26772680116993963, "learning_rate": 5.918247191729067e-08, "loss": 1.4842, "step": 6025 }, { "epoch": 0.9088304049468365, "grad_norm": 0.24826463739719304, "learning_rate": 5.915234904467617e-08, "loss": 1.5398, "step": 6026 }, { "epoch": 0.9089812231355101, "grad_norm": 0.29586232205972984, "learning_rate": 5.912227463614545e-08, "loss": 1.5409, "step": 6027 }, { "epoch": 0.9091320413241837, "grad_norm": 0.2599860801557601, "learning_rate": 5.9092248698451196e-08, "loss": 1.5961, "step": 6028 }, { "epoch": 0.9092828595128573, "grad_norm": 0.35821856761478, "learning_rate": 5.906227123833501e-08, "loss": 1.5799, "step": 6029 }, { "epoch": 0.9094336777015308, "grad_norm": 0.25207841952714977, "learning_rate": 5.9032342262527716e-08, "loss": 1.6431, "step": 6030 }, { "epoch": 0.9095844958902044, "grad_norm": 0.28798786232057827, "learning_rate": 5.900246177774926e-08, "loss": 1.4955, "step": 6031 }, { "epoch": 0.9097353140788779, "grad_norm": 0.24850564745268564, "learning_rate": 5.897262979070868e-08, "loss": 1.5698, "step": 6032 }, { "epoch": 0.9098861322675514, "grad_norm": 0.2941221192303597, "learning_rate": 5.894284630810415e-08, "loss": 1.5519, "step": 6033 }, { "epoch": 0.910036950456225, "grad_norm": 0.24259196567563732, "learning_rate": 5.8913111336622856e-08, "loss": 1.6027, "step": 6034 }, { "epoch": 0.9101877686448986, "grad_norm": 0.25153935046572595, "learning_rate": 5.888342488294125e-08, "loss": 1.5018, "step": 6035 }, { "epoch": 0.9103385868335722, "grad_norm": 0.24229883035317848, "learning_rate": 5.885378695372476e-08, "loss": 1.4775, "step": 6036 }, { "epoch": 0.9104894050222457, "grad_norm": 0.24241717043334063, "learning_rate": 5.8824197555627936e-08, "loss": 1.5488, "step": 6037 }, { "epoch": 0.9106402232109192, "grad_norm": 0.24568885905359372, "learning_rate": 5.8794656695294546e-08, "loss": 1.5211, "step": 6038 }, { "epoch": 0.9107910413995928, "grad_norm": 0.23744917362961723, "learning_rate": 5.8765164379357314e-08, "loss": 1.6052, "step": 6039 }, { "epoch": 0.9109418595882663, "grad_norm": 0.36932078031473947, "learning_rate": 5.873572061443811e-08, "loss": 1.5795, "step": 6040 }, { "epoch": 0.9110926777769399, "grad_norm": 0.2542230004886954, "learning_rate": 5.8706325407147956e-08, "loss": 1.6133, "step": 6041 }, { "epoch": 0.9112434959656135, "grad_norm": 0.255384496145479, "learning_rate": 5.867697876408694e-08, "loss": 1.5637, "step": 6042 }, { "epoch": 0.911394314154287, "grad_norm": 0.25485060766998097, "learning_rate": 5.8647680691844184e-08, "loss": 1.6338, "step": 6043 }, { "epoch": 0.9115451323429605, "grad_norm": 0.25237897567980183, "learning_rate": 5.8618431196997984e-08, "loss": 1.5547, "step": 6044 }, { "epoch": 0.9116959505316341, "grad_norm": 0.24409432431768954, "learning_rate": 5.8589230286115716e-08, "loss": 1.5827, "step": 6045 }, { "epoch": 0.9118467687203077, "grad_norm": 0.2570925994521639, "learning_rate": 5.856007796575385e-08, "loss": 1.5491, "step": 6046 }, { "epoch": 0.9119975869089813, "grad_norm": 0.24830464335674102, "learning_rate": 5.8530974242457856e-08, "loss": 1.4652, "step": 6047 }, { "epoch": 0.9121484050976548, "grad_norm": 0.24781908325718147, "learning_rate": 5.8501919122762464e-08, "loss": 1.6006, "step": 6048 }, { "epoch": 0.9122992232863283, "grad_norm": 0.26083921342285155, "learning_rate": 5.847291261319134e-08, "loss": 1.5819, "step": 6049 }, { "epoch": 0.9124500414750019, "grad_norm": 0.26309633869145815, "learning_rate": 5.8443954720257283e-08, "loss": 1.553, "step": 6050 }, { "epoch": 0.9126008596636754, "grad_norm": 0.26281025692831333, "learning_rate": 5.841504545046222e-08, "loss": 1.5374, "step": 6051 }, { "epoch": 0.912751677852349, "grad_norm": 0.27750393946945345, "learning_rate": 5.83861848102971e-08, "loss": 1.5446, "step": 6052 }, { "epoch": 0.9129024960410226, "grad_norm": 0.2799515575723963, "learning_rate": 5.835737280624197e-08, "loss": 1.5626, "step": 6053 }, { "epoch": 0.9130533142296962, "grad_norm": 0.24894486614996936, "learning_rate": 5.8328609444766006e-08, "loss": 1.5599, "step": 6054 }, { "epoch": 0.9132041324183696, "grad_norm": 0.26019381642413064, "learning_rate": 5.829989473232737e-08, "loss": 1.5944, "step": 6055 }, { "epoch": 0.9133549506070432, "grad_norm": 0.2759237853661695, "learning_rate": 5.827122867537337e-08, "loss": 1.5644, "step": 6056 }, { "epoch": 0.9135057687957168, "grad_norm": 0.24814657412343966, "learning_rate": 5.8242611280340356e-08, "loss": 1.5764, "step": 6057 }, { "epoch": 0.9136565869843903, "grad_norm": 0.25487268213850733, "learning_rate": 5.821404255365381e-08, "loss": 1.5767, "step": 6058 }, { "epoch": 0.9138074051730639, "grad_norm": 0.2446630422502796, "learning_rate": 5.818552250172822e-08, "loss": 1.545, "step": 6059 }, { "epoch": 0.9139582233617374, "grad_norm": 0.26215823714021624, "learning_rate": 5.815705113096712e-08, "loss": 1.6177, "step": 6060 }, { "epoch": 0.914109041550411, "grad_norm": 0.35947069849148555, "learning_rate": 5.812862844776324e-08, "loss": 1.5917, "step": 6061 }, { "epoch": 0.9142598597390845, "grad_norm": 0.308709803926604, "learning_rate": 5.8100254458498285e-08, "loss": 1.5329, "step": 6062 }, { "epoch": 0.9144106779277581, "grad_norm": 0.24201885723949065, "learning_rate": 5.8071929169542974e-08, "loss": 1.6049, "step": 6063 }, { "epoch": 0.9145614961164317, "grad_norm": 0.2542398848923103, "learning_rate": 5.804365258725724e-08, "loss": 1.5361, "step": 6064 }, { "epoch": 0.9147123143051052, "grad_norm": 0.24749220381791429, "learning_rate": 5.801542471798998e-08, "loss": 1.5734, "step": 6065 }, { "epoch": 0.9148631324937787, "grad_norm": 0.2399673199320219, "learning_rate": 5.7987245568079114e-08, "loss": 1.5079, "step": 6066 }, { "epoch": 0.9150139506824523, "grad_norm": 0.25543942985532075, "learning_rate": 5.795911514385176e-08, "loss": 1.5887, "step": 6067 }, { "epoch": 0.9151647688711259, "grad_norm": 0.25005576353674086, "learning_rate": 5.793103345162397e-08, "loss": 1.5817, "step": 6068 }, { "epoch": 0.9153155870597994, "grad_norm": 0.2469560749520833, "learning_rate": 5.790300049770094e-08, "loss": 1.5127, "step": 6069 }, { "epoch": 0.915466405248473, "grad_norm": 0.2501041361963282, "learning_rate": 5.7875016288376826e-08, "loss": 1.616, "step": 6070 }, { "epoch": 0.9156172234371465, "grad_norm": 0.24059136281338953, "learning_rate": 5.7847080829934966e-08, "loss": 1.6286, "step": 6071 }, { "epoch": 0.91576804162582, "grad_norm": 0.24840412852701096, "learning_rate": 5.781919412864765e-08, "loss": 1.5391, "step": 6072 }, { "epoch": 0.9159188598144936, "grad_norm": 0.2560422391892665, "learning_rate": 5.779135619077623e-08, "loss": 1.6219, "step": 6073 }, { "epoch": 0.9160696780031672, "grad_norm": 0.6623667693400862, "learning_rate": 5.7763567022571196e-08, "loss": 1.4886, "step": 6074 }, { "epoch": 0.9162204961918408, "grad_norm": 0.2664057578498538, "learning_rate": 5.7735826630271974e-08, "loss": 1.5858, "step": 6075 }, { "epoch": 0.9163713143805143, "grad_norm": 0.3096052738479129, "learning_rate": 5.770813502010708e-08, "loss": 1.5794, "step": 6076 }, { "epoch": 0.9165221325691878, "grad_norm": 0.31916402504883584, "learning_rate": 5.768049219829415e-08, "loss": 1.5336, "step": 6077 }, { "epoch": 0.9166729507578614, "grad_norm": 0.2563832597165492, "learning_rate": 5.765289817103976e-08, "loss": 1.554, "step": 6078 }, { "epoch": 0.916823768946535, "grad_norm": 0.2352107286443002, "learning_rate": 5.7625352944539537e-08, "loss": 1.5427, "step": 6079 }, { "epoch": 0.9169745871352085, "grad_norm": 0.24655627148991396, "learning_rate": 5.759785652497827e-08, "loss": 1.5758, "step": 6080 }, { "epoch": 0.9171254053238821, "grad_norm": 0.25237182893359833, "learning_rate": 5.757040891852967e-08, "loss": 1.6457, "step": 6081 }, { "epoch": 0.9172762235125557, "grad_norm": 0.2429829153276654, "learning_rate": 5.754301013135651e-08, "loss": 1.5361, "step": 6082 }, { "epoch": 0.9174270417012291, "grad_norm": 0.25485848289645746, "learning_rate": 5.75156601696106e-08, "loss": 1.5314, "step": 6083 }, { "epoch": 0.9175778598899027, "grad_norm": 0.23571301769763325, "learning_rate": 5.7488359039432835e-08, "loss": 1.4689, "step": 6084 }, { "epoch": 0.9177286780785763, "grad_norm": 0.24958446516321273, "learning_rate": 5.746110674695312e-08, "loss": 1.5411, "step": 6085 }, { "epoch": 0.9178794962672499, "grad_norm": 0.25595237502687773, "learning_rate": 5.743390329829035e-08, "loss": 1.5244, "step": 6086 }, { "epoch": 0.9180303144559234, "grad_norm": 0.24543998320526575, "learning_rate": 5.740674869955254e-08, "loss": 1.5927, "step": 6087 }, { "epoch": 0.9181811326445969, "grad_norm": 0.2595006942873366, "learning_rate": 5.737964295683665e-08, "loss": 1.5793, "step": 6088 }, { "epoch": 0.9183319508332705, "grad_norm": 0.634422558558343, "learning_rate": 5.73525860762287e-08, "loss": 1.5654, "step": 6089 }, { "epoch": 0.918482769021944, "grad_norm": 0.2561577816220454, "learning_rate": 5.73255780638038e-08, "loss": 1.5743, "step": 6090 }, { "epoch": 0.9186335872106176, "grad_norm": 0.256582214734705, "learning_rate": 5.7298618925626e-08, "loss": 1.5056, "step": 6091 }, { "epoch": 0.9187844053992912, "grad_norm": 0.29479209097127596, "learning_rate": 5.7271708667748377e-08, "loss": 1.4916, "step": 6092 }, { "epoch": 0.9189352235879648, "grad_norm": 0.3003603630013369, "learning_rate": 5.724484729621317e-08, "loss": 1.558, "step": 6093 }, { "epoch": 0.9190860417766382, "grad_norm": 0.26160301110829026, "learning_rate": 5.7218034817051436e-08, "loss": 1.4789, "step": 6094 }, { "epoch": 0.9192368599653118, "grad_norm": 0.24774684920350054, "learning_rate": 5.719127123628343e-08, "loss": 1.5946, "step": 6095 }, { "epoch": 0.9193876781539854, "grad_norm": 0.28108399934166406, "learning_rate": 5.716455655991827e-08, "loss": 1.5268, "step": 6096 }, { "epoch": 0.9195384963426589, "grad_norm": 0.5761951650568155, "learning_rate": 5.713789079395427e-08, "loss": 1.5801, "step": 6097 }, { "epoch": 0.9196893145313325, "grad_norm": 0.25754475229133966, "learning_rate": 5.711127394437863e-08, "loss": 1.5637, "step": 6098 }, { "epoch": 0.9198401327200061, "grad_norm": 0.25139716251584615, "learning_rate": 5.7084706017167595e-08, "loss": 1.5676, "step": 6099 }, { "epoch": 0.9199909509086796, "grad_norm": 0.28185865101427054, "learning_rate": 5.705818701828646e-08, "loss": 1.6325, "step": 6100 }, { "epoch": 0.9201417690973531, "grad_norm": 0.26628385922175346, "learning_rate": 5.7031716953689516e-08, "loss": 1.5633, "step": 6101 }, { "epoch": 0.9202925872860267, "grad_norm": 0.3687397497971009, "learning_rate": 5.7005295829320044e-08, "loss": 1.5356, "step": 6102 }, { "epoch": 0.9204434054747003, "grad_norm": 0.28602796011570303, "learning_rate": 5.697892365111037e-08, "loss": 1.5567, "step": 6103 }, { "epoch": 0.9205942236633738, "grad_norm": 0.2538349209624199, "learning_rate": 5.695260042498182e-08, "loss": 1.533, "step": 6104 }, { "epoch": 0.9207450418520473, "grad_norm": 0.26657271123144255, "learning_rate": 5.6926326156844695e-08, "loss": 1.6001, "step": 6105 }, { "epoch": 0.9208958600407209, "grad_norm": 0.24861917653118323, "learning_rate": 5.690010085259839e-08, "loss": 1.5704, "step": 6106 }, { "epoch": 0.9210466782293945, "grad_norm": 0.24530394043827866, "learning_rate": 5.6873924518131185e-08, "loss": 1.6085, "step": 6107 }, { "epoch": 0.921197496418068, "grad_norm": 0.24313647967223873, "learning_rate": 5.684779715932047e-08, "loss": 1.5621, "step": 6108 }, { "epoch": 0.9213483146067416, "grad_norm": 0.2504009150214136, "learning_rate": 5.6821718782032566e-08, "loss": 1.4987, "step": 6109 }, { "epoch": 0.9214991327954152, "grad_norm": 0.6787606527205317, "learning_rate": 5.679568939212289e-08, "loss": 1.5836, "step": 6110 }, { "epoch": 0.9216499509840886, "grad_norm": 0.2876594125286204, "learning_rate": 5.676970899543574e-08, "loss": 1.5046, "step": 6111 }, { "epoch": 0.9218007691727622, "grad_norm": 2.551721847230167, "learning_rate": 5.674377759780446e-08, "loss": 1.5481, "step": 6112 }, { "epoch": 0.9219515873614358, "grad_norm": 0.3347547508301126, "learning_rate": 5.671789520505145e-08, "loss": 1.5646, "step": 6113 }, { "epoch": 0.9221024055501094, "grad_norm": 0.2469328025595555, "learning_rate": 5.669206182298805e-08, "loss": 1.542, "step": 6114 }, { "epoch": 0.9222532237387829, "grad_norm": 0.24934031772475193, "learning_rate": 5.6666277457414625e-08, "loss": 1.5075, "step": 6115 }, { "epoch": 0.9224040419274564, "grad_norm": 0.28294027549133693, "learning_rate": 5.6640542114120455e-08, "loss": 1.5542, "step": 6116 }, { "epoch": 0.92255486011613, "grad_norm": 0.2564436610759413, "learning_rate": 5.661485579888392e-08, "loss": 1.5324, "step": 6117 }, { "epoch": 0.9227056783048035, "grad_norm": 0.2584982097210025, "learning_rate": 5.658921851747234e-08, "loss": 1.5662, "step": 6118 }, { "epoch": 0.9228564964934771, "grad_norm": 0.2670956480863373, "learning_rate": 5.656363027564201e-08, "loss": 1.596, "step": 6119 }, { "epoch": 0.9230073146821507, "grad_norm": 0.30653116963810917, "learning_rate": 5.6538091079138294e-08, "loss": 1.5695, "step": 6120 }, { "epoch": 0.9231581328708243, "grad_norm": 0.30199507780830126, "learning_rate": 5.6512600933695425e-08, "loss": 1.5549, "step": 6121 }, { "epoch": 0.9233089510594977, "grad_norm": 0.33741167082112555, "learning_rate": 5.6487159845036685e-08, "loss": 1.5277, "step": 6122 }, { "epoch": 0.9234597692481713, "grad_norm": 0.275068483440525, "learning_rate": 5.6461767818874364e-08, "loss": 1.5245, "step": 6123 }, { "epoch": 0.9236105874368449, "grad_norm": 0.24574270325486824, "learning_rate": 5.643642486090973e-08, "loss": 1.5421, "step": 6124 }, { "epoch": 0.9237614056255185, "grad_norm": 0.2477649885538568, "learning_rate": 5.6411130976832994e-08, "loss": 1.5655, "step": 6125 }, { "epoch": 0.923912223814192, "grad_norm": 0.24572262725711227, "learning_rate": 5.638588617232331e-08, "loss": 1.5446, "step": 6126 }, { "epoch": 0.9240630420028656, "grad_norm": 0.24912195102215598, "learning_rate": 5.6360690453048983e-08, "loss": 1.5682, "step": 6127 }, { "epoch": 0.9242138601915391, "grad_norm": 0.25792672085752477, "learning_rate": 5.6335543824667114e-08, "loss": 1.491, "step": 6128 }, { "epoch": 0.9243646783802126, "grad_norm": 0.3338943665700372, "learning_rate": 5.6310446292823845e-08, "loss": 1.5448, "step": 6129 }, { "epoch": 0.9245154965688862, "grad_norm": 0.30488090147190133, "learning_rate": 5.6285397863154355e-08, "loss": 1.5588, "step": 6130 }, { "epoch": 0.9246663147575598, "grad_norm": 0.24595625574750304, "learning_rate": 5.6260398541282723e-08, "loss": 1.5927, "step": 6131 }, { "epoch": 0.9248171329462334, "grad_norm": 0.23974432099050957, "learning_rate": 5.623544833282197e-08, "loss": 1.5094, "step": 6132 }, { "epoch": 0.9249679511349068, "grad_norm": 0.2549180339482979, "learning_rate": 5.6210547243374235e-08, "loss": 1.6182, "step": 6133 }, { "epoch": 0.9251187693235804, "grad_norm": 0.2537976703144186, "learning_rate": 5.61856952785305e-08, "loss": 1.6009, "step": 6134 }, { "epoch": 0.925269587512254, "grad_norm": 0.24465920281166262, "learning_rate": 5.6160892443870716e-08, "loss": 1.5955, "step": 6135 }, { "epoch": 0.9254204057009275, "grad_norm": 0.25195229045779016, "learning_rate": 5.613613874496392e-08, "loss": 1.5537, "step": 6136 }, { "epoch": 0.9255712238896011, "grad_norm": 0.24869817633114924, "learning_rate": 5.6111434187368e-08, "loss": 1.5412, "step": 6137 }, { "epoch": 0.9257220420782747, "grad_norm": 0.271284202605858, "learning_rate": 5.608677877662981e-08, "loss": 1.545, "step": 6138 }, { "epoch": 0.9258728602669481, "grad_norm": 0.27163866870959363, "learning_rate": 5.606217251828525e-08, "loss": 1.5452, "step": 6139 }, { "epoch": 0.9260236784556217, "grad_norm": 0.46977552446701015, "learning_rate": 5.6037615417859155e-08, "loss": 1.5388, "step": 6140 }, { "epoch": 0.9261744966442953, "grad_norm": 0.36444870166684856, "learning_rate": 5.601310748086528e-08, "loss": 1.5172, "step": 6141 }, { "epoch": 0.9263253148329689, "grad_norm": 0.24999061175916848, "learning_rate": 5.5988648712806355e-08, "loss": 1.5342, "step": 6142 }, { "epoch": 0.9264761330216424, "grad_norm": 0.24147710000217487, "learning_rate": 5.596423911917414e-08, "loss": 1.5201, "step": 6143 }, { "epoch": 0.926626951210316, "grad_norm": 0.26529601088070126, "learning_rate": 5.5939878705449263e-08, "loss": 1.5234, "step": 6144 }, { "epoch": 0.9267777693989895, "grad_norm": 0.2450064676081888, "learning_rate": 5.591556747710131e-08, "loss": 1.5023, "step": 6145 }, { "epoch": 0.926928587587663, "grad_norm": 0.5562495178275069, "learning_rate": 5.589130543958897e-08, "loss": 1.5791, "step": 6146 }, { "epoch": 0.9270794057763366, "grad_norm": 0.25271078865290386, "learning_rate": 5.5867092598359674e-08, "loss": 1.5522, "step": 6147 }, { "epoch": 0.9272302239650102, "grad_norm": 0.2510375488276909, "learning_rate": 5.584292895884995e-08, "loss": 1.5708, "step": 6148 }, { "epoch": 0.9273810421536838, "grad_norm": 0.30069052420596476, "learning_rate": 5.581881452648523e-08, "loss": 1.5017, "step": 6149 }, { "epoch": 0.9275318603423572, "grad_norm": 0.25405929708733727, "learning_rate": 5.579474930667992e-08, "loss": 1.5602, "step": 6150 }, { "epoch": 0.9276826785310308, "grad_norm": 0.24886204828205324, "learning_rate": 5.577073330483736e-08, "loss": 1.556, "step": 6151 }, { "epoch": 0.9278334967197044, "grad_norm": 0.24788735210464843, "learning_rate": 5.574676652634979e-08, "loss": 1.5254, "step": 6152 }, { "epoch": 0.927984314908378, "grad_norm": 0.2626661539030793, "learning_rate": 5.572284897659854e-08, "loss": 1.5193, "step": 6153 }, { "epoch": 0.9281351330970515, "grad_norm": 0.26222135209071656, "learning_rate": 5.5698980660953747e-08, "loss": 1.5583, "step": 6154 }, { "epoch": 0.9282859512857251, "grad_norm": 0.316263266609963, "learning_rate": 5.5675161584774516e-08, "loss": 1.5786, "step": 6155 }, { "epoch": 0.9284367694743986, "grad_norm": 0.2543644512752816, "learning_rate": 5.565139175340899e-08, "loss": 1.6077, "step": 6156 }, { "epoch": 0.9285875876630721, "grad_norm": 0.26142964558477094, "learning_rate": 5.562767117219414e-08, "loss": 1.4931, "step": 6157 }, { "epoch": 0.9287384058517457, "grad_norm": 0.25780942757545783, "learning_rate": 5.5603999846455915e-08, "loss": 1.4687, "step": 6158 }, { "epoch": 0.9288892240404193, "grad_norm": 0.24910206966222853, "learning_rate": 5.558037778150929e-08, "loss": 1.5521, "step": 6159 }, { "epoch": 0.9290400422290929, "grad_norm": 0.2610805885124634, "learning_rate": 5.555680498265804e-08, "loss": 1.586, "step": 6160 }, { "epoch": 0.9291908604177663, "grad_norm": 0.24786750742412972, "learning_rate": 5.5533281455194955e-08, "loss": 1.6002, "step": 6161 }, { "epoch": 0.9293416786064399, "grad_norm": 0.27917077529733814, "learning_rate": 5.550980720440176e-08, "loss": 1.5492, "step": 6162 }, { "epoch": 0.9294924967951135, "grad_norm": 0.6410057671884332, "learning_rate": 5.5486382235549103e-08, "loss": 1.5816, "step": 6163 }, { "epoch": 0.929643314983787, "grad_norm": 0.2511566309452938, "learning_rate": 5.5463006553896596e-08, "loss": 1.5545, "step": 6164 }, { "epoch": 0.9297941331724606, "grad_norm": 0.2737661737146725, "learning_rate": 5.543968016469273e-08, "loss": 1.5745, "step": 6165 }, { "epoch": 0.9299449513611342, "grad_norm": 0.3797270236127085, "learning_rate": 5.5416403073174975e-08, "loss": 1.6144, "step": 6166 }, { "epoch": 0.9300957695498077, "grad_norm": 0.31918928141620784, "learning_rate": 5.5393175284569716e-08, "loss": 1.6324, "step": 6167 }, { "epoch": 0.9302465877384812, "grad_norm": 0.24958868757458863, "learning_rate": 5.5369996804092246e-08, "loss": 1.5633, "step": 6168 }, { "epoch": 0.9303974059271548, "grad_norm": 0.25361093030951837, "learning_rate": 5.534686763694684e-08, "loss": 1.552, "step": 6169 }, { "epoch": 0.9305482241158284, "grad_norm": 0.26060521044377927, "learning_rate": 5.532378778832665e-08, "loss": 1.5283, "step": 6170 }, { "epoch": 0.930699042304502, "grad_norm": 0.441136825411557, "learning_rate": 5.530075726341377e-08, "loss": 1.5484, "step": 6171 }, { "epoch": 0.9308498604931755, "grad_norm": 0.2464745890853672, "learning_rate": 5.527777606737926e-08, "loss": 1.5102, "step": 6172 }, { "epoch": 0.931000678681849, "grad_norm": 0.24944079204088596, "learning_rate": 5.525484420538303e-08, "loss": 1.523, "step": 6173 }, { "epoch": 0.9311514968705226, "grad_norm": 0.2875126059730783, "learning_rate": 5.523196168257397e-08, "loss": 1.6301, "step": 6174 }, { "epoch": 0.9313023150591961, "grad_norm": 0.26474669232214115, "learning_rate": 5.520912850408985e-08, "loss": 1.589, "step": 6175 }, { "epoch": 0.9314531332478697, "grad_norm": 0.26217816594758014, "learning_rate": 5.518634467505739e-08, "loss": 1.553, "step": 6176 }, { "epoch": 0.9316039514365433, "grad_norm": 0.2718608358446988, "learning_rate": 5.5163610200592256e-08, "loss": 1.5838, "step": 6177 }, { "epoch": 0.9317547696252167, "grad_norm": 0.24415865400433565, "learning_rate": 5.514092508579894e-08, "loss": 1.5133, "step": 6178 }, { "epoch": 0.9319055878138903, "grad_norm": 0.2411641486293077, "learning_rate": 5.5118289335771e-08, "loss": 1.5839, "step": 6179 }, { "epoch": 0.9320564060025639, "grad_norm": 0.2514668725290416, "learning_rate": 5.509570295559074e-08, "loss": 1.5972, "step": 6180 }, { "epoch": 0.9322072241912375, "grad_norm": 0.2759880586716973, "learning_rate": 5.5073165950329465e-08, "loss": 1.5304, "step": 6181 }, { "epoch": 0.932358042379911, "grad_norm": 0.25605097901499024, "learning_rate": 5.505067832504744e-08, "loss": 1.5246, "step": 6182 }, { "epoch": 0.9325088605685846, "grad_norm": 0.29444578427717816, "learning_rate": 5.502824008479376e-08, "loss": 1.5881, "step": 6183 }, { "epoch": 0.9326596787572581, "grad_norm": 0.24273982291992666, "learning_rate": 5.5005851234606446e-08, "loss": 1.5598, "step": 6184 }, { "epoch": 0.9328104969459317, "grad_norm": 0.25299211156493445, "learning_rate": 5.4983511779512494e-08, "loss": 1.595, "step": 6185 }, { "epoch": 0.9329613151346052, "grad_norm": 0.25732622340646677, "learning_rate": 5.4961221724527716e-08, "loss": 1.6029, "step": 6186 }, { "epoch": 0.9331121333232788, "grad_norm": 0.8128019546387675, "learning_rate": 5.493898107465691e-08, "loss": 1.5349, "step": 6187 }, { "epoch": 0.9332629515119524, "grad_norm": 0.27992600807371115, "learning_rate": 5.4916789834893725e-08, "loss": 1.6154, "step": 6188 }, { "epoch": 0.933413769700626, "grad_norm": 0.2542356440560896, "learning_rate": 5.489464801022077e-08, "loss": 1.5421, "step": 6189 }, { "epoch": 0.9335645878892994, "grad_norm": 0.26740460030730834, "learning_rate": 5.487255560560952e-08, "loss": 1.6063, "step": 6190 }, { "epoch": 0.933715406077973, "grad_norm": 0.272674547101346, "learning_rate": 5.4850512626020335e-08, "loss": 1.6055, "step": 6191 }, { "epoch": 0.9338662242666466, "grad_norm": 0.24536914204213503, "learning_rate": 5.482851907640254e-08, "loss": 1.572, "step": 6192 }, { "epoch": 0.9340170424553201, "grad_norm": 0.2449252209481925, "learning_rate": 5.4806574961694314e-08, "loss": 1.5289, "step": 6193 }, { "epoch": 0.9341678606439937, "grad_norm": 0.24055151366290678, "learning_rate": 5.478468028682272e-08, "loss": 1.5498, "step": 6194 }, { "epoch": 0.9343186788326672, "grad_norm": 0.2527859810550758, "learning_rate": 5.476283505670381e-08, "loss": 1.5134, "step": 6195 }, { "epoch": 0.9344694970213407, "grad_norm": 0.3577246499396631, "learning_rate": 5.474103927624247e-08, "loss": 1.5736, "step": 6196 }, { "epoch": 0.9346203152100143, "grad_norm": 0.2766852958914957, "learning_rate": 5.471929295033241e-08, "loss": 1.6239, "step": 6197 }, { "epoch": 0.9347711333986879, "grad_norm": 0.24795491996332725, "learning_rate": 5.4697596083856396e-08, "loss": 1.5726, "step": 6198 }, { "epoch": 0.9349219515873615, "grad_norm": 0.26782304778646604, "learning_rate": 5.4675948681685984e-08, "loss": 1.5763, "step": 6199 }, { "epoch": 0.935072769776035, "grad_norm": 0.26596988390834714, "learning_rate": 5.465435074868161e-08, "loss": 1.5687, "step": 6200 }, { "epoch": 0.9352235879647085, "grad_norm": 0.2580184881996988, "learning_rate": 5.463280228969268e-08, "loss": 1.5726, "step": 6201 }, { "epoch": 0.9353744061533821, "grad_norm": 0.24960174972988566, "learning_rate": 5.4611303309557454e-08, "loss": 1.5639, "step": 6202 }, { "epoch": 0.9355252243420557, "grad_norm": 0.24331613996923387, "learning_rate": 5.4589853813103055e-08, "loss": 1.5338, "step": 6203 }, { "epoch": 0.9356760425307292, "grad_norm": 0.2879551135187979, "learning_rate": 5.456845380514551e-08, "loss": 1.5685, "step": 6204 }, { "epoch": 0.9358268607194028, "grad_norm": 0.25161922629399475, "learning_rate": 5.454710329048977e-08, "loss": 1.4967, "step": 6205 }, { "epoch": 0.9359776789080763, "grad_norm": 0.26340960169106836, "learning_rate": 5.452580227392966e-08, "loss": 1.5289, "step": 6206 }, { "epoch": 0.9361284970967498, "grad_norm": 0.2670862257291208, "learning_rate": 5.450455076024781e-08, "loss": 1.6131, "step": 6207 }, { "epoch": 0.9362793152854234, "grad_norm": 0.24782221497804563, "learning_rate": 5.4483348754215895e-08, "loss": 1.5653, "step": 6208 }, { "epoch": 0.936430133474097, "grad_norm": 0.2808602684717364, "learning_rate": 5.446219626059433e-08, "loss": 1.5309, "step": 6209 }, { "epoch": 0.9365809516627706, "grad_norm": 0.24441080991582628, "learning_rate": 5.444109328413246e-08, "loss": 1.5623, "step": 6210 }, { "epoch": 0.9367317698514441, "grad_norm": 0.29055086813288683, "learning_rate": 5.442003982956855e-08, "loss": 1.5119, "step": 6211 }, { "epoch": 0.9368825880401176, "grad_norm": 0.2522579754654987, "learning_rate": 5.439903590162971e-08, "loss": 1.5241, "step": 6212 }, { "epoch": 0.9370334062287912, "grad_norm": 0.25334183102849595, "learning_rate": 5.43780815050319e-08, "loss": 1.5799, "step": 6213 }, { "epoch": 0.9371842244174647, "grad_norm": 2.436419004177727, "learning_rate": 5.435717664448001e-08, "loss": 1.5813, "step": 6214 }, { "epoch": 0.9373350426061383, "grad_norm": 0.2828523731284309, "learning_rate": 5.4336321324667806e-08, "loss": 1.5146, "step": 6215 }, { "epoch": 0.9374858607948119, "grad_norm": 0.27876418593965085, "learning_rate": 5.431551555027789e-08, "loss": 1.6188, "step": 6216 }, { "epoch": 0.9376366789834855, "grad_norm": 0.258647266375581, "learning_rate": 5.429475932598177e-08, "loss": 1.5489, "step": 6217 }, { "epoch": 0.9377874971721589, "grad_norm": 0.4395462622142405, "learning_rate": 5.427405265643983e-08, "loss": 1.5212, "step": 6218 }, { "epoch": 0.9379383153608325, "grad_norm": 0.25527207182841355, "learning_rate": 5.4253395546301344e-08, "loss": 1.5375, "step": 6219 }, { "epoch": 0.9380891335495061, "grad_norm": 0.25692760225637207, "learning_rate": 5.4232788000204385e-08, "loss": 1.5743, "step": 6220 }, { "epoch": 0.9382399517381796, "grad_norm": 0.2527388733388531, "learning_rate": 5.4212230022775965e-08, "loss": 1.5302, "step": 6221 }, { "epoch": 0.9383907699268532, "grad_norm": 0.24386827293627816, "learning_rate": 5.4191721618631976e-08, "loss": 1.603, "step": 6222 }, { "epoch": 0.9385415881155267, "grad_norm": 0.2502629611784765, "learning_rate": 5.4171262792377134e-08, "loss": 1.5227, "step": 6223 }, { "epoch": 0.9386924063042003, "grad_norm": 0.2495200719724515, "learning_rate": 5.415085354860503e-08, "loss": 1.5439, "step": 6224 }, { "epoch": 0.9388432244928738, "grad_norm": 0.24395521270584844, "learning_rate": 5.413049389189813e-08, "loss": 1.5743, "step": 6225 }, { "epoch": 0.9389940426815474, "grad_norm": 0.2528655609824242, "learning_rate": 5.4110183826827805e-08, "loss": 1.5886, "step": 6226 }, { "epoch": 0.939144860870221, "grad_norm": 0.2535177596142676, "learning_rate": 5.4089923357954206e-08, "loss": 1.5847, "step": 6227 }, { "epoch": 0.9392956790588946, "grad_norm": 0.30616125366794783, "learning_rate": 5.406971248982644e-08, "loss": 1.5544, "step": 6228 }, { "epoch": 0.939446497247568, "grad_norm": 0.24170407278087794, "learning_rate": 5.404955122698243e-08, "loss": 1.5934, "step": 6229 }, { "epoch": 0.9395973154362416, "grad_norm": 0.25291719608092217, "learning_rate": 5.4029439573948954e-08, "loss": 1.5598, "step": 6230 }, { "epoch": 0.9397481336249152, "grad_norm": 0.2725490134475271, "learning_rate": 5.400937753524165e-08, "loss": 1.5739, "step": 6231 }, { "epoch": 0.9398989518135887, "grad_norm": 0.30230082036390676, "learning_rate": 5.3989365115365075e-08, "loss": 1.5173, "step": 6232 }, { "epoch": 0.9400497700022623, "grad_norm": 0.2721094377001865, "learning_rate": 5.396940231881256e-08, "loss": 1.5834, "step": 6233 }, { "epoch": 0.9402005881909359, "grad_norm": 0.30026085064325764, "learning_rate": 5.394948915006634e-08, "loss": 1.6117, "step": 6234 }, { "epoch": 0.9403514063796093, "grad_norm": 0.2593433685604889, "learning_rate": 5.392962561359751e-08, "loss": 1.5859, "step": 6235 }, { "epoch": 0.9405022245682829, "grad_norm": 0.27293940297424424, "learning_rate": 5.390981171386603e-08, "loss": 1.5537, "step": 6236 }, { "epoch": 0.9406530427569565, "grad_norm": 0.6238862258928658, "learning_rate": 5.389004745532065e-08, "loss": 1.5627, "step": 6237 }, { "epoch": 0.9408038609456301, "grad_norm": 0.29140578643943493, "learning_rate": 5.387033284239907e-08, "loss": 1.6062, "step": 6238 }, { "epoch": 0.9409546791343036, "grad_norm": 0.2581601694967583, "learning_rate": 5.385066787952777e-08, "loss": 1.5281, "step": 6239 }, { "epoch": 0.9411054973229771, "grad_norm": 0.27090139343038255, "learning_rate": 5.3831052571122093e-08, "loss": 1.5505, "step": 6240 }, { "epoch": 0.9412563155116507, "grad_norm": 0.25486575308346454, "learning_rate": 5.3811486921586287e-08, "loss": 1.5933, "step": 6241 }, { "epoch": 0.9414071337003243, "grad_norm": 0.3990119272110088, "learning_rate": 5.37919709353134e-08, "loss": 1.5544, "step": 6242 }, { "epoch": 0.9415579518889978, "grad_norm": 0.31640864216127074, "learning_rate": 5.37725046166853e-08, "loss": 1.5405, "step": 6243 }, { "epoch": 0.9417087700776714, "grad_norm": 0.36875348906357763, "learning_rate": 5.3753087970072784e-08, "loss": 1.6093, "step": 6244 }, { "epoch": 0.941859588266345, "grad_norm": 0.2535016803140429, "learning_rate": 5.3733720999835434e-08, "loss": 1.5158, "step": 6245 }, { "epoch": 0.9420104064550184, "grad_norm": 0.2656723034956448, "learning_rate": 5.37144037103217e-08, "loss": 1.5191, "step": 6246 }, { "epoch": 0.942161224643692, "grad_norm": 0.25601902569295426, "learning_rate": 5.36951361058689e-08, "loss": 1.6099, "step": 6247 }, { "epoch": 0.9423120428323656, "grad_norm": 0.24039116671233865, "learning_rate": 5.367591819080316e-08, "loss": 1.5488, "step": 6248 }, { "epoch": 0.9424628610210392, "grad_norm": 0.2705119376049466, "learning_rate": 5.3656749969439443e-08, "loss": 1.5515, "step": 6249 }, { "epoch": 0.9426136792097127, "grad_norm": 0.2826894483285141, "learning_rate": 5.363763144608157e-08, "loss": 1.5691, "step": 6250 }, { "epoch": 0.9427644973983862, "grad_norm": 0.25073840366910916, "learning_rate": 5.361856262502224e-08, "loss": 1.5692, "step": 6251 }, { "epoch": 0.9429153155870598, "grad_norm": 0.36303998778681995, "learning_rate": 5.359954351054295e-08, "loss": 1.5497, "step": 6252 }, { "epoch": 0.9430661337757333, "grad_norm": 0.276823763660622, "learning_rate": 5.3580574106914006e-08, "loss": 1.5727, "step": 6253 }, { "epoch": 0.9432169519644069, "grad_norm": 0.3197162806628283, "learning_rate": 5.356165441839465e-08, "loss": 1.6099, "step": 6254 }, { "epoch": 0.9433677701530805, "grad_norm": 0.4140003917037914, "learning_rate": 5.3542784449232866e-08, "loss": 1.5471, "step": 6255 }, { "epoch": 0.9435185883417541, "grad_norm": 0.23637027304146924, "learning_rate": 5.3523964203665504e-08, "loss": 1.5509, "step": 6256 }, { "epoch": 0.9436694065304275, "grad_norm": 0.25411725979810995, "learning_rate": 5.350519368591828e-08, "loss": 1.5274, "step": 6257 }, { "epoch": 0.9438202247191011, "grad_norm": 0.2571056548657803, "learning_rate": 5.348647290020575e-08, "loss": 1.5255, "step": 6258 }, { "epoch": 0.9439710429077747, "grad_norm": 0.891663348681135, "learning_rate": 5.346780185073119e-08, "loss": 1.5706, "step": 6259 }, { "epoch": 0.9441218610964482, "grad_norm": 0.24728698455839027, "learning_rate": 5.344918054168689e-08, "loss": 1.6282, "step": 6260 }, { "epoch": 0.9442726792851218, "grad_norm": 0.26295350236768716, "learning_rate": 5.343060897725381e-08, "loss": 1.544, "step": 6261 }, { "epoch": 0.9444234974737954, "grad_norm": 0.4401728429434684, "learning_rate": 5.341208716160183e-08, "loss": 1.4944, "step": 6262 }, { "epoch": 0.9445743156624689, "grad_norm": 0.3103928113123362, "learning_rate": 5.339361509888962e-08, "loss": 1.5241, "step": 6263 }, { "epoch": 0.9447251338511424, "grad_norm": 0.2529679821462462, "learning_rate": 5.337519279326473e-08, "loss": 1.4817, "step": 6264 }, { "epoch": 0.944875952039816, "grad_norm": 0.2446506531939954, "learning_rate": 5.335682024886348e-08, "loss": 1.5688, "step": 6265 }, { "epoch": 0.9450267702284896, "grad_norm": 0.26458469173429827, "learning_rate": 5.3338497469811033e-08, "loss": 1.5002, "step": 6266 }, { "epoch": 0.9451775884171632, "grad_norm": 0.25643614837676953, "learning_rate": 5.3320224460221424e-08, "loss": 1.5558, "step": 6267 }, { "epoch": 0.9453284066058366, "grad_norm": 0.2539040408097631, "learning_rate": 5.3302001224197445e-08, "loss": 1.6157, "step": 6268 }, { "epoch": 0.9454792247945102, "grad_norm": 0.26683606113407243, "learning_rate": 5.3283827765830754e-08, "loss": 1.6008, "step": 6269 }, { "epoch": 0.9456300429831838, "grad_norm": 0.26095663501899063, "learning_rate": 5.32657040892018e-08, "loss": 1.581, "step": 6270 }, { "epoch": 0.9457808611718573, "grad_norm": 0.2856783576919806, "learning_rate": 5.3247630198379916e-08, "loss": 1.5888, "step": 6271 }, { "epoch": 0.9459316793605309, "grad_norm": 0.24943877739150502, "learning_rate": 5.322960609742317e-08, "loss": 1.4971, "step": 6272 }, { "epoch": 0.9460824975492045, "grad_norm": 0.25485051413068144, "learning_rate": 5.321163179037857e-08, "loss": 1.5837, "step": 6273 }, { "epoch": 0.946233315737878, "grad_norm": 0.2666569322243275, "learning_rate": 5.319370728128178e-08, "loss": 1.5049, "step": 6274 }, { "epoch": 0.9463841339265515, "grad_norm": 0.2442776208036276, "learning_rate": 5.317583257415743e-08, "loss": 1.5863, "step": 6275 }, { "epoch": 0.9465349521152251, "grad_norm": 0.254146941158463, "learning_rate": 5.31580076730189e-08, "loss": 1.5502, "step": 6276 }, { "epoch": 0.9466857703038987, "grad_norm": 0.2517459606134991, "learning_rate": 5.314023258186843e-08, "loss": 1.575, "step": 6277 }, { "epoch": 0.9468365884925722, "grad_norm": 0.2983302017671054, "learning_rate": 5.312250730469697e-08, "loss": 1.539, "step": 6278 }, { "epoch": 0.9469874066812458, "grad_norm": 0.26748833623162827, "learning_rate": 5.310483184548442e-08, "loss": 1.5791, "step": 6279 }, { "epoch": 0.9471382248699193, "grad_norm": 0.2768742628332462, "learning_rate": 5.308720620819943e-08, "loss": 1.5507, "step": 6280 }, { "epoch": 0.9472890430585929, "grad_norm": 0.38464340768160965, "learning_rate": 5.3069630396799485e-08, "loss": 1.5766, "step": 6281 }, { "epoch": 0.9474398612472664, "grad_norm": 0.23792739536809543, "learning_rate": 5.3052104415230795e-08, "loss": 1.5254, "step": 6282 }, { "epoch": 0.94759067943594, "grad_norm": 0.26708907903881945, "learning_rate": 5.303462826742852e-08, "loss": 1.6363, "step": 6283 }, { "epoch": 0.9477414976246136, "grad_norm": 0.25006308858650866, "learning_rate": 5.3017201957316576e-08, "loss": 1.5392, "step": 6284 }, { "epoch": 0.947892315813287, "grad_norm": 0.2500468930788792, "learning_rate": 5.29998254888076e-08, "loss": 1.5147, "step": 6285 }, { "epoch": 0.9480431340019606, "grad_norm": 0.2394152106733528, "learning_rate": 5.2982498865803195e-08, "loss": 1.5139, "step": 6286 }, { "epoch": 0.9481939521906342, "grad_norm": 0.32357770060991164, "learning_rate": 5.296522209219362e-08, "loss": 1.5668, "step": 6287 }, { "epoch": 0.9483447703793078, "grad_norm": 0.33410105145228264, "learning_rate": 5.2947995171858095e-08, "loss": 1.5436, "step": 6288 }, { "epoch": 0.9484955885679813, "grad_norm": 0.2512225546358416, "learning_rate": 5.2930818108664494e-08, "loss": 1.533, "step": 6289 }, { "epoch": 0.9486464067566549, "grad_norm": 0.28751801968688, "learning_rate": 5.2913690906469613e-08, "loss": 1.5494, "step": 6290 }, { "epoch": 0.9487972249453284, "grad_norm": 0.2515027708999194, "learning_rate": 5.2896613569119e-08, "loss": 1.4793, "step": 6291 }, { "epoch": 0.9489480431340019, "grad_norm": 0.26909642061522543, "learning_rate": 5.287958610044697e-08, "loss": 1.5931, "step": 6292 }, { "epoch": 0.9490988613226755, "grad_norm": 0.2835281984574101, "learning_rate": 5.286260850427675e-08, "loss": 1.5816, "step": 6293 }, { "epoch": 0.9492496795113491, "grad_norm": 0.251470368186942, "learning_rate": 5.284568078442029e-08, "loss": 1.6253, "step": 6294 }, { "epoch": 0.9494004977000227, "grad_norm": 0.2515840504511767, "learning_rate": 5.2828802944678316e-08, "loss": 1.5558, "step": 6295 }, { "epoch": 0.9495513158886961, "grad_norm": 0.24579037686156516, "learning_rate": 5.281197498884042e-08, "loss": 1.5177, "step": 6296 }, { "epoch": 0.9497021340773697, "grad_norm": 0.48143361296846493, "learning_rate": 5.2795196920684983e-08, "loss": 1.5907, "step": 6297 }, { "epoch": 0.9498529522660433, "grad_norm": 0.36402142735239346, "learning_rate": 5.277846874397915e-08, "loss": 1.5661, "step": 6298 }, { "epoch": 0.9500037704547168, "grad_norm": 0.8513052715793634, "learning_rate": 5.27617904624789e-08, "loss": 1.5928, "step": 6299 }, { "epoch": 0.9501545886433904, "grad_norm": 0.2492730562768886, "learning_rate": 5.274516207992898e-08, "loss": 1.5783, "step": 6300 }, { "epoch": 0.950305406832064, "grad_norm": 0.2483441833704336, "learning_rate": 5.272858360006294e-08, "loss": 1.5933, "step": 6301 }, { "epoch": 0.9504562250207375, "grad_norm": 0.4504628252228296, "learning_rate": 5.271205502660316e-08, "loss": 1.5785, "step": 6302 }, { "epoch": 0.950607043209411, "grad_norm": 0.2497635116932171, "learning_rate": 5.269557636326074e-08, "loss": 1.5573, "step": 6303 }, { "epoch": 0.9507578613980846, "grad_norm": 0.6568417764575003, "learning_rate": 5.267914761373567e-08, "loss": 1.5554, "step": 6304 }, { "epoch": 0.9509086795867582, "grad_norm": 0.2397379571816978, "learning_rate": 5.266276878171664e-08, "loss": 1.5802, "step": 6305 }, { "epoch": 0.9510594977754318, "grad_norm": 0.26934854200352987, "learning_rate": 5.264643987088122e-08, "loss": 1.4862, "step": 6306 }, { "epoch": 0.9512103159641053, "grad_norm": 0.25181008089940937, "learning_rate": 5.263016088489568e-08, "loss": 1.5685, "step": 6307 }, { "epoch": 0.9513611341527788, "grad_norm": 0.2421353533048425, "learning_rate": 5.261393182741514e-08, "loss": 1.556, "step": 6308 }, { "epoch": 0.9515119523414524, "grad_norm": 0.251096429171215, "learning_rate": 5.2597752702083514e-08, "loss": 1.4922, "step": 6309 }, { "epoch": 0.9516627705301259, "grad_norm": 0.37880347467434033, "learning_rate": 5.258162351253348e-08, "loss": 1.5302, "step": 6310 }, { "epoch": 0.9518135887187995, "grad_norm": 0.255749362520709, "learning_rate": 5.2565544262386504e-08, "loss": 1.5452, "step": 6311 }, { "epoch": 0.9519644069074731, "grad_norm": 0.24572970039643124, "learning_rate": 5.254951495525285e-08, "loss": 1.5086, "step": 6312 }, { "epoch": 0.9521152250961465, "grad_norm": 0.2671105386365278, "learning_rate": 5.2533535594731544e-08, "loss": 1.5678, "step": 6313 }, { "epoch": 0.9522660432848201, "grad_norm": 0.26509869032789696, "learning_rate": 5.251760618441047e-08, "loss": 1.5863, "step": 6314 }, { "epoch": 0.9524168614734937, "grad_norm": 0.24674607540460267, "learning_rate": 5.250172672786618e-08, "loss": 1.5591, "step": 6315 }, { "epoch": 0.9525676796621673, "grad_norm": 0.2810966899454014, "learning_rate": 5.248589722866413e-08, "loss": 1.5854, "step": 6316 }, { "epoch": 0.9527184978508408, "grad_norm": 0.2552064134059601, "learning_rate": 5.247011769035848e-08, "loss": 1.5454, "step": 6317 }, { "epoch": 0.9528693160395144, "grad_norm": 0.2616743545212081, "learning_rate": 5.2454388116492156e-08, "loss": 1.5682, "step": 6318 }, { "epoch": 0.9530201342281879, "grad_norm": 0.24562816546877292, "learning_rate": 5.2438708510596976e-08, "loss": 1.5482, "step": 6319 }, { "epoch": 0.9531709524168615, "grad_norm": 0.25582222827139806, "learning_rate": 5.242307887619344e-08, "loss": 1.5549, "step": 6320 }, { "epoch": 0.953321770605535, "grad_norm": 0.25225601203377085, "learning_rate": 5.2407499216790834e-08, "loss": 1.5699, "step": 6321 }, { "epoch": 0.9534725887942086, "grad_norm": 0.24564441907729032, "learning_rate": 5.2391969535887276e-08, "loss": 1.643, "step": 6322 }, { "epoch": 0.9536234069828822, "grad_norm": 0.24492164355094914, "learning_rate": 5.237648983696961e-08, "loss": 1.5292, "step": 6323 }, { "epoch": 0.9537742251715557, "grad_norm": 0.2706662227904134, "learning_rate": 5.23610601235135e-08, "loss": 1.5519, "step": 6324 }, { "epoch": 0.9539250433602292, "grad_norm": 0.25127336107480863, "learning_rate": 5.234568039898336e-08, "loss": 1.6202, "step": 6325 }, { "epoch": 0.9540758615489028, "grad_norm": 0.24323633765490138, "learning_rate": 5.233035066683238e-08, "loss": 1.564, "step": 6326 }, { "epoch": 0.9542266797375764, "grad_norm": 0.3904116632792208, "learning_rate": 5.2315070930502523e-08, "loss": 1.5072, "step": 6327 }, { "epoch": 0.9543774979262499, "grad_norm": 0.24535791701400553, "learning_rate": 5.229984119342452e-08, "loss": 1.6105, "step": 6328 }, { "epoch": 0.9545283161149235, "grad_norm": 0.24983407654198528, "learning_rate": 5.2284661459017956e-08, "loss": 1.6082, "step": 6329 }, { "epoch": 0.954679134303597, "grad_norm": 0.2735020691691619, "learning_rate": 5.2269531730691056e-08, "loss": 1.61, "step": 6330 }, { "epoch": 0.9548299524922705, "grad_norm": 0.3316571241912269, "learning_rate": 5.2254452011840936e-08, "loss": 1.6008, "step": 6331 }, { "epoch": 0.9549807706809441, "grad_norm": 0.24588425535404385, "learning_rate": 5.223942230585339e-08, "loss": 1.5382, "step": 6332 }, { "epoch": 0.9551315888696177, "grad_norm": 0.2410493398534231, "learning_rate": 5.222444261610305e-08, "loss": 1.5555, "step": 6333 }, { "epoch": 0.9552824070582913, "grad_norm": 0.27248429010886993, "learning_rate": 5.220951294595327e-08, "loss": 1.5324, "step": 6334 }, { "epoch": 0.9554332252469648, "grad_norm": 0.2531146404786899, "learning_rate": 5.219463329875622e-08, "loss": 1.5819, "step": 6335 }, { "epoch": 0.9555840434356383, "grad_norm": 0.3264049251970853, "learning_rate": 5.21798036778528e-08, "loss": 1.5493, "step": 6336 }, { "epoch": 0.9557348616243119, "grad_norm": 0.2673903371298123, "learning_rate": 5.216502408657271e-08, "loss": 1.5503, "step": 6337 }, { "epoch": 0.9558856798129854, "grad_norm": 0.25067687301545893, "learning_rate": 5.215029452823435e-08, "loss": 1.542, "step": 6338 }, { "epoch": 0.956036498001659, "grad_norm": 0.24137452966892348, "learning_rate": 5.213561500614498e-08, "loss": 1.5222, "step": 6339 }, { "epoch": 0.9561873161903326, "grad_norm": 0.2985998380684002, "learning_rate": 5.21209855236006e-08, "loss": 1.5531, "step": 6340 }, { "epoch": 0.956338134379006, "grad_norm": 0.2575262759632998, "learning_rate": 5.21064060838859e-08, "loss": 1.5532, "step": 6341 }, { "epoch": 0.9564889525676796, "grad_norm": 0.26226393673791865, "learning_rate": 5.2091876690274414e-08, "loss": 1.5486, "step": 6342 }, { "epoch": 0.9566397707563532, "grad_norm": 0.25781586813383506, "learning_rate": 5.2077397346028425e-08, "loss": 1.6117, "step": 6343 }, { "epoch": 0.9567905889450268, "grad_norm": 0.24805595398722974, "learning_rate": 5.2062968054398935e-08, "loss": 1.5863, "step": 6344 }, { "epoch": 0.9569414071337004, "grad_norm": 0.24439234848724492, "learning_rate": 5.2048588818625784e-08, "loss": 1.6135, "step": 6345 }, { "epoch": 0.9570922253223739, "grad_norm": 0.4701836790860895, "learning_rate": 5.2034259641937486e-08, "loss": 1.5165, "step": 6346 }, { "epoch": 0.9572430435110474, "grad_norm": 0.24201946522129109, "learning_rate": 5.201998052755139e-08, "loss": 1.5647, "step": 6347 }, { "epoch": 0.957393861699721, "grad_norm": 0.24220374354816995, "learning_rate": 5.200575147867356e-08, "loss": 1.4992, "step": 6348 }, { "epoch": 0.9575446798883945, "grad_norm": 0.2541763196845834, "learning_rate": 5.199157249849884e-08, "loss": 1.5549, "step": 6349 }, { "epoch": 0.9576954980770681, "grad_norm": 0.23701110036162357, "learning_rate": 5.19774435902108e-08, "loss": 1.5567, "step": 6350 }, { "epoch": 0.9578463162657417, "grad_norm": 0.2466884716259338, "learning_rate": 5.1963364756981814e-08, "loss": 1.6295, "step": 6351 }, { "epoch": 0.9579971344544153, "grad_norm": 0.2565904248345054, "learning_rate": 5.1949336001973e-08, "loss": 1.58, "step": 6352 }, { "epoch": 0.9581479526430887, "grad_norm": 0.26585377051506953, "learning_rate": 5.193535732833418e-08, "loss": 1.5371, "step": 6353 }, { "epoch": 0.9582987708317623, "grad_norm": 0.26269672466200317, "learning_rate": 5.192142873920401e-08, "loss": 1.6572, "step": 6354 }, { "epoch": 0.9584495890204359, "grad_norm": 0.2683391523084997, "learning_rate": 5.1907550237709877e-08, "loss": 1.594, "step": 6355 }, { "epoch": 0.9586004072091094, "grad_norm": 0.25262581475921414, "learning_rate": 5.189372182696786e-08, "loss": 1.5137, "step": 6356 }, { "epoch": 0.958751225397783, "grad_norm": 0.2730807174740343, "learning_rate": 5.187994351008288e-08, "loss": 1.5568, "step": 6357 }, { "epoch": 0.9589020435864565, "grad_norm": 0.25856596758541345, "learning_rate": 5.1866215290148544e-08, "loss": 1.5104, "step": 6358 }, { "epoch": 0.95905286177513, "grad_norm": 0.6752805259594179, "learning_rate": 5.1852537170247246e-08, "loss": 1.5019, "step": 6359 }, { "epoch": 0.9592036799638036, "grad_norm": 0.27077413598273226, "learning_rate": 5.183890915345015e-08, "loss": 1.5844, "step": 6360 }, { "epoch": 0.9593544981524772, "grad_norm": 0.2620727508929476, "learning_rate": 5.1825331242817094e-08, "loss": 1.5553, "step": 6361 }, { "epoch": 0.9595053163411508, "grad_norm": 0.24557656249048038, "learning_rate": 5.181180344139676e-08, "loss": 1.5322, "step": 6362 }, { "epoch": 0.9596561345298243, "grad_norm": 0.24812302527421484, "learning_rate": 5.179832575222651e-08, "loss": 1.6082, "step": 6363 }, { "epoch": 0.9598069527184978, "grad_norm": 0.26450100421447964, "learning_rate": 5.1784898178332455e-08, "loss": 1.5796, "step": 6364 }, { "epoch": 0.9599577709071714, "grad_norm": 0.24817179902484185, "learning_rate": 5.177152072272952e-08, "loss": 1.5181, "step": 6365 }, { "epoch": 0.960108589095845, "grad_norm": 0.31622965878024756, "learning_rate": 5.175819338842133e-08, "loss": 1.5046, "step": 6366 }, { "epoch": 0.9602594072845185, "grad_norm": 0.2484048976941183, "learning_rate": 5.174491617840023e-08, "loss": 1.547, "step": 6367 }, { "epoch": 0.9604102254731921, "grad_norm": 0.24385330701567645, "learning_rate": 5.1731689095647344e-08, "loss": 1.5372, "step": 6368 }, { "epoch": 0.9605610436618656, "grad_norm": 0.2516939386965962, "learning_rate": 5.171851214313254e-08, "loss": 1.5466, "step": 6369 }, { "epoch": 0.9607118618505391, "grad_norm": 0.2556617679977314, "learning_rate": 5.170538532381446e-08, "loss": 1.619, "step": 6370 }, { "epoch": 0.9608626800392127, "grad_norm": 0.25530410544496274, "learning_rate": 5.1692308640640416e-08, "loss": 1.5737, "step": 6371 }, { "epoch": 0.9610134982278863, "grad_norm": 0.25699963969499623, "learning_rate": 5.1679282096546535e-08, "loss": 1.6024, "step": 6372 }, { "epoch": 0.9611643164165599, "grad_norm": 0.25245499108689606, "learning_rate": 5.1666305694457646e-08, "loss": 1.6041, "step": 6373 }, { "epoch": 0.9613151346052334, "grad_norm": 0.2495170948349133, "learning_rate": 5.16533794372873e-08, "loss": 1.5568, "step": 6374 }, { "epoch": 0.9614659527939069, "grad_norm": 0.2518591639384356, "learning_rate": 5.164050332793786e-08, "loss": 1.6363, "step": 6375 }, { "epoch": 0.9616167709825805, "grad_norm": 0.28856757112588033, "learning_rate": 5.162767736930037e-08, "loss": 1.5222, "step": 6376 }, { "epoch": 0.961767589171254, "grad_norm": 0.2541937128631411, "learning_rate": 5.161490156425461e-08, "loss": 1.5362, "step": 6377 }, { "epoch": 0.9619184073599276, "grad_norm": 0.338367967854621, "learning_rate": 5.1602175915669166e-08, "loss": 1.6243, "step": 6378 }, { "epoch": 0.9620692255486012, "grad_norm": 0.3627880610386085, "learning_rate": 5.158950042640128e-08, "loss": 1.5381, "step": 6379 }, { "epoch": 0.9622200437372748, "grad_norm": 0.282055593543598, "learning_rate": 5.1576875099297006e-08, "loss": 1.5844, "step": 6380 }, { "epoch": 0.9623708619259482, "grad_norm": 0.25053425306125043, "learning_rate": 5.156429993719104e-08, "loss": 1.5137, "step": 6381 }, { "epoch": 0.9625216801146218, "grad_norm": 0.44214031666076936, "learning_rate": 5.155177494290691e-08, "loss": 1.5772, "step": 6382 }, { "epoch": 0.9626724983032954, "grad_norm": 0.2482561354350461, "learning_rate": 5.1539300119256864e-08, "loss": 1.5061, "step": 6383 }, { "epoch": 0.962823316491969, "grad_norm": 0.26087108410790427, "learning_rate": 5.152687546904182e-08, "loss": 1.554, "step": 6384 }, { "epoch": 0.9629741346806425, "grad_norm": 0.283677071517818, "learning_rate": 5.151450099505147e-08, "loss": 1.5427, "step": 6385 }, { "epoch": 0.963124952869316, "grad_norm": 0.36027328623910293, "learning_rate": 5.1502176700064304e-08, "loss": 1.5881, "step": 6386 }, { "epoch": 0.9632757710579896, "grad_norm": 0.24136512996506998, "learning_rate": 5.1489902586847397e-08, "loss": 1.5579, "step": 6387 }, { "epoch": 0.9634265892466631, "grad_norm": 0.276326793171099, "learning_rate": 5.147767865815673e-08, "loss": 1.5579, "step": 6388 }, { "epoch": 0.9635774074353367, "grad_norm": 0.2661932526640572, "learning_rate": 5.14655049167369e-08, "loss": 1.5575, "step": 6389 }, { "epoch": 0.9637282256240103, "grad_norm": 0.2768456724796517, "learning_rate": 5.1453381365321224e-08, "loss": 1.5478, "step": 6390 }, { "epoch": 0.9638790438126839, "grad_norm": 0.2672029887206416, "learning_rate": 5.1441308006631854e-08, "loss": 1.5583, "step": 6391 }, { "epoch": 0.9640298620013573, "grad_norm": 0.28307822253509946, "learning_rate": 5.1429284843379576e-08, "loss": 1.5894, "step": 6392 }, { "epoch": 0.9641806801900309, "grad_norm": 0.2584585522942879, "learning_rate": 5.141731187826395e-08, "loss": 1.5169, "step": 6393 }, { "epoch": 0.9643314983787045, "grad_norm": 0.2556591975295269, "learning_rate": 5.140538911397326e-08, "loss": 1.5851, "step": 6394 }, { "epoch": 0.964482316567378, "grad_norm": 0.2720527790539065, "learning_rate": 5.139351655318451e-08, "loss": 1.561, "step": 6395 }, { "epoch": 0.9646331347560516, "grad_norm": 0.28976825110477883, "learning_rate": 5.138169419856344e-08, "loss": 1.519, "step": 6396 }, { "epoch": 0.9647839529447252, "grad_norm": 3.967530858341636, "learning_rate": 5.136992205276449e-08, "loss": 1.5251, "step": 6397 }, { "epoch": 0.9649347711333986, "grad_norm": 0.24041622479641886, "learning_rate": 5.135820011843088e-08, "loss": 1.5575, "step": 6398 }, { "epoch": 0.9650855893220722, "grad_norm": 0.2605820726704934, "learning_rate": 5.13465283981945e-08, "loss": 1.579, "step": 6399 }, { "epoch": 0.9652364075107458, "grad_norm": 0.2560085767952434, "learning_rate": 5.1334906894676e-08, "loss": 1.5716, "step": 6400 }, { "epoch": 0.9653872256994194, "grad_norm": 0.24748805942426388, "learning_rate": 5.1323335610484776e-08, "loss": 1.5731, "step": 6401 }, { "epoch": 0.965538043888093, "grad_norm": 0.2654173067765607, "learning_rate": 5.131181454821887e-08, "loss": 1.6424, "step": 6402 }, { "epoch": 0.9656888620767664, "grad_norm": 0.3101653209941184, "learning_rate": 5.1300343710465113e-08, "loss": 1.5601, "step": 6403 }, { "epoch": 0.96583968026544, "grad_norm": 0.24537041958505784, "learning_rate": 5.128892309979907e-08, "loss": 1.598, "step": 6404 }, { "epoch": 0.9659904984541136, "grad_norm": 0.2949769881137243, "learning_rate": 5.1277552718784966e-08, "loss": 1.5294, "step": 6405 }, { "epoch": 0.9661413166427871, "grad_norm": 0.3355136951414702, "learning_rate": 5.12662325699758e-08, "loss": 1.613, "step": 6406 }, { "epoch": 0.9662921348314607, "grad_norm": 0.24091864982323638, "learning_rate": 5.1254962655913256e-08, "loss": 1.5554, "step": 6407 }, { "epoch": 0.9664429530201343, "grad_norm": 0.2687209411320317, "learning_rate": 5.124374297912777e-08, "loss": 1.4844, "step": 6408 }, { "epoch": 0.9665937712088077, "grad_norm": 0.26312670298435775, "learning_rate": 5.123257354213851e-08, "loss": 1.5538, "step": 6409 }, { "epoch": 0.9667445893974813, "grad_norm": 0.28145274288958727, "learning_rate": 5.122145434745329e-08, "loss": 1.437, "step": 6410 }, { "epoch": 0.9668954075861549, "grad_norm": 0.2838846037459194, "learning_rate": 5.1210385397568745e-08, "loss": 1.586, "step": 6411 }, { "epoch": 0.9670462257748285, "grad_norm": 0.2528764946796678, "learning_rate": 5.119936669497015e-08, "loss": 1.6836, "step": 6412 }, { "epoch": 0.967197043963502, "grad_norm": 0.2562476919374889, "learning_rate": 5.118839824213151e-08, "loss": 1.5663, "step": 6413 }, { "epoch": 0.9673478621521755, "grad_norm": 0.24206565595635035, "learning_rate": 5.1177480041515576e-08, "loss": 1.5931, "step": 6414 }, { "epoch": 0.9674986803408491, "grad_norm": 0.2629260321961326, "learning_rate": 5.1166612095573824e-08, "loss": 1.5754, "step": 6415 }, { "epoch": 0.9676494985295226, "grad_norm": 0.25671472522188277, "learning_rate": 5.1155794406746385e-08, "loss": 1.5733, "step": 6416 }, { "epoch": 0.9678003167181962, "grad_norm": 0.4533755136092952, "learning_rate": 5.11450269774622e-08, "loss": 1.5637, "step": 6417 }, { "epoch": 0.9679511349068698, "grad_norm": 0.25153301510486026, "learning_rate": 5.11343098101388e-08, "loss": 1.5059, "step": 6418 }, { "epoch": 0.9681019530955434, "grad_norm": 0.2870340121695763, "learning_rate": 5.112364290718255e-08, "loss": 1.6028, "step": 6419 }, { "epoch": 0.9682527712842168, "grad_norm": 0.2654495116873873, "learning_rate": 5.111302627098848e-08, "loss": 1.5071, "step": 6420 }, { "epoch": 0.9684035894728904, "grad_norm": 0.24384356461296303, "learning_rate": 5.110245990394031e-08, "loss": 1.5198, "step": 6421 }, { "epoch": 0.968554407661564, "grad_norm": 0.24775464926176569, "learning_rate": 5.1091943808410505e-08, "loss": 1.5057, "step": 6422 }, { "epoch": 0.9687052258502376, "grad_norm": 0.2634349039600724, "learning_rate": 5.108147798676023e-08, "loss": 1.5917, "step": 6423 }, { "epoch": 0.9688560440389111, "grad_norm": 0.2475589464139171, "learning_rate": 5.107106244133939e-08, "loss": 1.5126, "step": 6424 }, { "epoch": 0.9690068622275847, "grad_norm": 0.2500812810143837, "learning_rate": 5.1060697174486566e-08, "loss": 1.5617, "step": 6425 }, { "epoch": 0.9691576804162582, "grad_norm": 0.26121275475871664, "learning_rate": 5.105038218852906e-08, "loss": 1.557, "step": 6426 }, { "epoch": 0.9693084986049317, "grad_norm": 0.27842912631507954, "learning_rate": 5.104011748578287e-08, "loss": 1.5295, "step": 6427 }, { "epoch": 0.9694593167936053, "grad_norm": 0.5338303687886577, "learning_rate": 5.102990306855276e-08, "loss": 1.5238, "step": 6428 }, { "epoch": 0.9696101349822789, "grad_norm": 0.25920039211508383, "learning_rate": 5.101973893913211e-08, "loss": 1.5476, "step": 6429 }, { "epoch": 0.9697609531709525, "grad_norm": 0.29257572169992907, "learning_rate": 5.100962509980314e-08, "loss": 1.5912, "step": 6430 }, { "epoch": 0.9699117713596259, "grad_norm": 0.4974711068219046, "learning_rate": 5.0999561552836636e-08, "loss": 1.578, "step": 6431 }, { "epoch": 0.9700625895482995, "grad_norm": 0.2692008434386123, "learning_rate": 5.098954830049217e-08, "loss": 1.5373, "step": 6432 }, { "epoch": 0.9702134077369731, "grad_norm": 0.24493602251064997, "learning_rate": 5.0979585345018e-08, "loss": 1.5222, "step": 6433 }, { "epoch": 0.9703642259256466, "grad_norm": 0.24580616475390038, "learning_rate": 5.0969672688651134e-08, "loss": 1.5055, "step": 6434 }, { "epoch": 0.9705150441143202, "grad_norm": 0.40525484455776534, "learning_rate": 5.095981033361724e-08, "loss": 1.5819, "step": 6435 }, { "epoch": 0.9706658623029938, "grad_norm": 0.5323926365254915, "learning_rate": 5.094999828213068e-08, "loss": 1.5993, "step": 6436 }, { "epoch": 0.9708166804916672, "grad_norm": 0.2758624089121172, "learning_rate": 5.0940236536394586e-08, "loss": 1.5756, "step": 6437 }, { "epoch": 0.9709674986803408, "grad_norm": 0.30537970805051545, "learning_rate": 5.093052509860073e-08, "loss": 1.5215, "step": 6438 }, { "epoch": 0.9711183168690144, "grad_norm": 0.25306210515763267, "learning_rate": 5.0920863970929584e-08, "loss": 1.5335, "step": 6439 }, { "epoch": 0.971269135057688, "grad_norm": 0.2793419447136214, "learning_rate": 5.091125315555039e-08, "loss": 1.5796, "step": 6440 }, { "epoch": 0.9714199532463615, "grad_norm": 0.2635853250708283, "learning_rate": 5.090169265462107e-08, "loss": 1.5372, "step": 6441 }, { "epoch": 0.9715707714350351, "grad_norm": 0.24773665496139735, "learning_rate": 5.0892182470288205e-08, "loss": 1.5705, "step": 6442 }, { "epoch": 0.9717215896237086, "grad_norm": 0.2602379534612917, "learning_rate": 5.08827226046871e-08, "loss": 1.6155, "step": 6443 }, { "epoch": 0.9718724078123822, "grad_norm": 0.24904634275831725, "learning_rate": 5.0873313059941794e-08, "loss": 1.5483, "step": 6444 }, { "epoch": 0.9720232260010557, "grad_norm": 0.2558030518578314, "learning_rate": 5.086395383816501e-08, "loss": 1.5874, "step": 6445 }, { "epoch": 0.9721740441897293, "grad_norm": 0.25354510496858634, "learning_rate": 5.0854644941458136e-08, "loss": 1.5827, "step": 6446 }, { "epoch": 0.9723248623784029, "grad_norm": 0.250452222296652, "learning_rate": 5.084538637191133e-08, "loss": 1.5173, "step": 6447 }, { "epoch": 0.9724756805670763, "grad_norm": 0.2603913148074543, "learning_rate": 5.083617813160337e-08, "loss": 1.5508, "step": 6448 }, { "epoch": 0.9726264987557499, "grad_norm": 0.40930176727111084, "learning_rate": 5.082702022260179e-08, "loss": 1.5538, "step": 6449 }, { "epoch": 0.9727773169444235, "grad_norm": 0.2497056167405697, "learning_rate": 5.081791264696281e-08, "loss": 1.4941, "step": 6450 }, { "epoch": 0.9729281351330971, "grad_norm": 0.2550678396598978, "learning_rate": 5.080885540673135e-08, "loss": 1.5985, "step": 6451 }, { "epoch": 0.9730789533217706, "grad_norm": 0.2710859709151231, "learning_rate": 5.079984850394102e-08, "loss": 1.5682, "step": 6452 }, { "epoch": 0.9732297715104442, "grad_norm": 0.24717971177406345, "learning_rate": 5.079089194061412e-08, "loss": 1.5253, "step": 6453 }, { "epoch": 0.9733805896991177, "grad_norm": 0.24035360506745604, "learning_rate": 5.078198571876169e-08, "loss": 1.5121, "step": 6454 }, { "epoch": 0.9735314078877912, "grad_norm": 0.24981177472656527, "learning_rate": 5.077312984038339e-08, "loss": 1.5128, "step": 6455 }, { "epoch": 0.9736822260764648, "grad_norm": 0.2522474695358754, "learning_rate": 5.076432430746767e-08, "loss": 1.581, "step": 6456 }, { "epoch": 0.9738330442651384, "grad_norm": 0.2617174417485487, "learning_rate": 5.075556912199161e-08, "loss": 1.5678, "step": 6457 }, { "epoch": 0.973983862453812, "grad_norm": 0.24953025638107984, "learning_rate": 5.0746864285920995e-08, "loss": 1.587, "step": 6458 }, { "epoch": 0.9741346806424854, "grad_norm": 0.261470405145867, "learning_rate": 5.0738209801210296e-08, "loss": 1.5478, "step": 6459 }, { "epoch": 0.974285498831159, "grad_norm": 0.2520795520193516, "learning_rate": 5.0729605669802766e-08, "loss": 1.5086, "step": 6460 }, { "epoch": 0.9744363170198326, "grad_norm": 0.3244120204281703, "learning_rate": 5.0721051893630216e-08, "loss": 1.6022, "step": 6461 }, { "epoch": 0.9745871352085062, "grad_norm": 0.26256172500630426, "learning_rate": 5.071254847461322e-08, "loss": 1.5372, "step": 6462 }, { "epoch": 0.9747379533971797, "grad_norm": 0.23743723766173824, "learning_rate": 5.0704095414661084e-08, "loss": 1.5135, "step": 6463 }, { "epoch": 0.9748887715858533, "grad_norm": 0.29242581161455566, "learning_rate": 5.0695692715671756e-08, "loss": 1.553, "step": 6464 }, { "epoch": 0.9750395897745268, "grad_norm": 0.3117616067572284, "learning_rate": 5.0687340379531864e-08, "loss": 1.5107, "step": 6465 }, { "epoch": 0.9751904079632003, "grad_norm": 0.24530123805488938, "learning_rate": 5.067903840811679e-08, "loss": 1.5374, "step": 6466 }, { "epoch": 0.9753412261518739, "grad_norm": 0.24600277932465653, "learning_rate": 5.0670786803290514e-08, "loss": 1.5453, "step": 6467 }, { "epoch": 0.9754920443405475, "grad_norm": 0.24954792101445722, "learning_rate": 5.066258556690581e-08, "loss": 1.5429, "step": 6468 }, { "epoch": 0.975642862529221, "grad_norm": 0.3665210065866042, "learning_rate": 5.065443470080404e-08, "loss": 1.5777, "step": 6469 }, { "epoch": 0.9757936807178946, "grad_norm": 0.24878039696491128, "learning_rate": 5.0646334206815386e-08, "loss": 1.5602, "step": 6470 }, { "epoch": 0.9759444989065681, "grad_norm": 0.271514195335307, "learning_rate": 5.063828408675858e-08, "loss": 1.5214, "step": 6471 }, { "epoch": 0.9760953170952417, "grad_norm": 0.2663042992823365, "learning_rate": 5.063028434244113e-08, "loss": 1.5173, "step": 6472 }, { "epoch": 0.9762461352839152, "grad_norm": 0.23886729520101155, "learning_rate": 5.062233497565922e-08, "loss": 1.5354, "step": 6473 }, { "epoch": 0.9763969534725888, "grad_norm": 0.26097968730758986, "learning_rate": 5.0614435988197694e-08, "loss": 1.5131, "step": 6474 }, { "epoch": 0.9765477716612624, "grad_norm": 0.2957295881693974, "learning_rate": 5.060658738183012e-08, "loss": 1.563, "step": 6475 }, { "epoch": 0.9766985898499358, "grad_norm": 0.25273515987338896, "learning_rate": 5.059878915831877e-08, "loss": 1.5387, "step": 6476 }, { "epoch": 0.9768494080386094, "grad_norm": 0.24849187140414217, "learning_rate": 5.059104131941452e-08, "loss": 1.5624, "step": 6477 }, { "epoch": 0.977000226227283, "grad_norm": 0.26336256406420006, "learning_rate": 5.058334386685699e-08, "loss": 1.5553, "step": 6478 }, { "epoch": 0.9771510444159566, "grad_norm": 0.2505690502220122, "learning_rate": 5.0575696802374496e-08, "loss": 1.5526, "step": 6479 }, { "epoch": 0.9773018626046301, "grad_norm": 0.23844152313984407, "learning_rate": 5.056810012768405e-08, "loss": 1.5126, "step": 6480 }, { "epoch": 0.9774526807933037, "grad_norm": 0.3296402229519655, "learning_rate": 5.056055384449129e-08, "loss": 1.5577, "step": 6481 }, { "epoch": 0.9776034989819772, "grad_norm": 0.251433847339324, "learning_rate": 5.055305795449061e-08, "loss": 1.5756, "step": 6482 }, { "epoch": 0.9777543171706508, "grad_norm": 0.23745387328194084, "learning_rate": 5.054561245936502e-08, "loss": 1.5269, "step": 6483 }, { "epoch": 0.9779051353593243, "grad_norm": 0.2431422102768408, "learning_rate": 5.053821736078627e-08, "loss": 1.5274, "step": 6484 }, { "epoch": 0.9780559535479979, "grad_norm": 0.34551121164950643, "learning_rate": 5.053087266041475e-08, "loss": 1.5602, "step": 6485 }, { "epoch": 0.9782067717366715, "grad_norm": 0.25410630677517887, "learning_rate": 5.052357835989963e-08, "loss": 1.4949, "step": 6486 }, { "epoch": 0.978357589925345, "grad_norm": 0.26531649825766734, "learning_rate": 5.05163344608786e-08, "loss": 1.4913, "step": 6487 }, { "epoch": 0.9785084081140185, "grad_norm": 0.26781315412225526, "learning_rate": 5.05091409649782e-08, "loss": 1.5396, "step": 6488 }, { "epoch": 0.9786592263026921, "grad_norm": 0.2520580299099175, "learning_rate": 5.050199787381354e-08, "loss": 1.5628, "step": 6489 }, { "epoch": 0.9788100444913657, "grad_norm": 0.3804986834247732, "learning_rate": 5.049490518898845e-08, "loss": 1.5316, "step": 6490 }, { "epoch": 0.9789608626800392, "grad_norm": 1.244187494723972, "learning_rate": 5.048786291209548e-08, "loss": 1.5458, "step": 6491 }, { "epoch": 0.9791116808687128, "grad_norm": 0.23424370161157604, "learning_rate": 5.048087104471578e-08, "loss": 1.5631, "step": 6492 }, { "epoch": 0.9792624990573863, "grad_norm": 0.25032519116588015, "learning_rate": 5.047392958841927e-08, "loss": 1.5827, "step": 6493 }, { "epoch": 0.9794133172460598, "grad_norm": 0.24731029803020294, "learning_rate": 5.0467038544764466e-08, "loss": 1.5252, "step": 6494 }, { "epoch": 0.9795641354347334, "grad_norm": 0.2624072760981343, "learning_rate": 5.046019791529864e-08, "loss": 1.6048, "step": 6495 }, { "epoch": 0.979714953623407, "grad_norm": 0.2924639644547172, "learning_rate": 5.045340770155771e-08, "loss": 1.5289, "step": 6496 }, { "epoch": 0.9798657718120806, "grad_norm": 0.24884598029170174, "learning_rate": 5.0446667905066276e-08, "loss": 1.5927, "step": 6497 }, { "epoch": 0.9800165900007541, "grad_norm": 0.27924775476960245, "learning_rate": 5.0439978527337586e-08, "loss": 1.5717, "step": 6498 }, { "epoch": 0.9801674081894276, "grad_norm": 0.25301986563185713, "learning_rate": 5.043333956987366e-08, "loss": 1.5623, "step": 6499 }, { "epoch": 0.9803182263781012, "grad_norm": 0.27688842373854006, "learning_rate": 5.0426751034165076e-08, "loss": 1.5664, "step": 6500 }, { "epoch": 0.9804690445667748, "grad_norm": 0.24445084259797586, "learning_rate": 5.0420212921691204e-08, "loss": 1.59, "step": 6501 }, { "epoch": 0.9806198627554483, "grad_norm": 0.24344584631254262, "learning_rate": 5.041372523392e-08, "loss": 1.5412, "step": 6502 }, { "epoch": 0.9807706809441219, "grad_norm": 0.2526570631672275, "learning_rate": 5.040728797230817e-08, "loss": 1.5631, "step": 6503 }, { "epoch": 0.9809214991327954, "grad_norm": 0.3186108326288003, "learning_rate": 5.040090113830105e-08, "loss": 1.5801, "step": 6504 }, { "epoch": 0.9810723173214689, "grad_norm": 0.2986905201284329, "learning_rate": 5.039456473333268e-08, "loss": 1.5891, "step": 6505 }, { "epoch": 0.9812231355101425, "grad_norm": 0.24819163687733814, "learning_rate": 5.0388278758825745e-08, "loss": 1.5818, "step": 6506 }, { "epoch": 0.9813739536988161, "grad_norm": 0.2576431292514359, "learning_rate": 5.038204321619165e-08, "loss": 1.5484, "step": 6507 }, { "epoch": 0.9815247718874897, "grad_norm": 1.1781340006336096, "learning_rate": 5.037585810683046e-08, "loss": 1.5901, "step": 6508 }, { "epoch": 0.9816755900761632, "grad_norm": 0.25925081343451833, "learning_rate": 5.03697234321309e-08, "loss": 1.5672, "step": 6509 }, { "epoch": 0.9818264082648367, "grad_norm": 0.24633666848558364, "learning_rate": 5.0363639193470386e-08, "loss": 1.5217, "step": 6510 }, { "epoch": 0.9819772264535103, "grad_norm": 0.2390602820930981, "learning_rate": 5.0357605392215e-08, "loss": 1.5268, "step": 6511 }, { "epoch": 0.9821280446421838, "grad_norm": 0.26011007605635605, "learning_rate": 5.0351622029719506e-08, "loss": 1.5172, "step": 6512 }, { "epoch": 0.9822788628308574, "grad_norm": 0.24076616630485573, "learning_rate": 5.034568910732737e-08, "loss": 1.6221, "step": 6513 }, { "epoch": 0.982429681019531, "grad_norm": 0.27744182607071605, "learning_rate": 5.033980662637067e-08, "loss": 1.6523, "step": 6514 }, { "epoch": 0.9825804992082046, "grad_norm": 0.26065840247835165, "learning_rate": 5.03339745881702e-08, "loss": 1.6047, "step": 6515 }, { "epoch": 0.982731317396878, "grad_norm": 0.24838784838301078, "learning_rate": 5.0328192994035456e-08, "loss": 1.5765, "step": 6516 }, { "epoch": 0.9828821355855516, "grad_norm": 0.24992476928306281, "learning_rate": 5.0322461845264545e-08, "loss": 1.5043, "step": 6517 }, { "epoch": 0.9830329537742252, "grad_norm": 0.24636638294071833, "learning_rate": 5.031678114314429e-08, "loss": 1.5504, "step": 6518 }, { "epoch": 0.9831837719628987, "grad_norm": 0.25322163197820124, "learning_rate": 5.031115088895015e-08, "loss": 1.5645, "step": 6519 }, { "epoch": 0.9833345901515723, "grad_norm": 0.28106231255069447, "learning_rate": 5.0305571083946305e-08, "loss": 1.5788, "step": 6520 }, { "epoch": 0.9834854083402458, "grad_norm": 0.2553265807543555, "learning_rate": 5.030004172938559e-08, "loss": 1.5801, "step": 6521 }, { "epoch": 0.9836362265289194, "grad_norm": 0.2637114564187194, "learning_rate": 5.029456282650948e-08, "loss": 1.5569, "step": 6522 }, { "epoch": 0.9837870447175929, "grad_norm": 0.25801532365856095, "learning_rate": 5.028913437654816e-08, "loss": 1.5832, "step": 6523 }, { "epoch": 0.9839378629062665, "grad_norm": 0.24938123649171232, "learning_rate": 5.028375638072049e-08, "loss": 1.5996, "step": 6524 }, { "epoch": 0.9840886810949401, "grad_norm": 0.3509246362741187, "learning_rate": 5.027842884023395e-08, "loss": 1.6541, "step": 6525 }, { "epoch": 0.9842394992836137, "grad_norm": 0.3108705383194768, "learning_rate": 5.027315175628478e-08, "loss": 1.5554, "step": 6526 }, { "epoch": 0.9843903174722871, "grad_norm": 0.24457863554702414, "learning_rate": 5.026792513005779e-08, "loss": 1.5216, "step": 6527 }, { "epoch": 0.9845411356609607, "grad_norm": 0.26069030549313427, "learning_rate": 5.02627489627265e-08, "loss": 1.5288, "step": 6528 }, { "epoch": 0.9846919538496343, "grad_norm": 0.24747632060797806, "learning_rate": 5.025762325545319e-08, "loss": 1.5329, "step": 6529 }, { "epoch": 0.9848427720383078, "grad_norm": 0.2757738150564607, "learning_rate": 5.025254800938867e-08, "loss": 1.522, "step": 6530 }, { "epoch": 0.9849935902269814, "grad_norm": 0.26346233480909437, "learning_rate": 5.024752322567248e-08, "loss": 1.4972, "step": 6531 }, { "epoch": 0.985144408415655, "grad_norm": 0.25503332889486574, "learning_rate": 5.024254890543284e-08, "loss": 1.6004, "step": 6532 }, { "epoch": 0.9852952266043284, "grad_norm": 0.24756468469656975, "learning_rate": 5.023762504978663e-08, "loss": 1.4894, "step": 6533 }, { "epoch": 0.985446044793002, "grad_norm": 0.24112569122639588, "learning_rate": 5.0232751659839414e-08, "loss": 1.5443, "step": 6534 }, { "epoch": 0.9855968629816756, "grad_norm": 0.2719279864027767, "learning_rate": 5.0227928736685394e-08, "loss": 1.6128, "step": 6535 }, { "epoch": 0.9857476811703492, "grad_norm": 1.1656734563879858, "learning_rate": 5.0223156281407456e-08, "loss": 1.568, "step": 6536 }, { "epoch": 0.9858984993590227, "grad_norm": 0.28870201154578834, "learning_rate": 5.0218434295077155e-08, "loss": 1.5104, "step": 6537 }, { "epoch": 0.9860493175476962, "grad_norm": 0.25067612590232297, "learning_rate": 5.0213762778754706e-08, "loss": 1.5537, "step": 6538 }, { "epoch": 0.9862001357363698, "grad_norm": 0.3021000063641636, "learning_rate": 5.0209141733489046e-08, "loss": 1.5324, "step": 6539 }, { "epoch": 0.9863509539250433, "grad_norm": 0.26216923631130623, "learning_rate": 5.020457116031768e-08, "loss": 1.5644, "step": 6540 }, { "epoch": 0.9865017721137169, "grad_norm": 0.2656526626019676, "learning_rate": 5.0200051060266824e-08, "loss": 1.5785, "step": 6541 }, { "epoch": 0.9866525903023905, "grad_norm": 0.24813683444140744, "learning_rate": 5.0195581434351445e-08, "loss": 1.595, "step": 6542 }, { "epoch": 0.9868034084910641, "grad_norm": 0.24865637037750893, "learning_rate": 5.019116228357505e-08, "loss": 1.626, "step": 6543 }, { "epoch": 0.9869542266797375, "grad_norm": 0.24656800678491447, "learning_rate": 5.018679360892989e-08, "loss": 1.6314, "step": 6544 }, { "epoch": 0.9871050448684111, "grad_norm": 0.2483444651125244, "learning_rate": 5.018247541139684e-08, "loss": 1.5515, "step": 6545 }, { "epoch": 0.9872558630570847, "grad_norm": 0.2527243702013985, "learning_rate": 5.017820769194546e-08, "loss": 1.6493, "step": 6546 }, { "epoch": 0.9874066812457583, "grad_norm": 0.2517723592329947, "learning_rate": 5.0173990451533994e-08, "loss": 1.5266, "step": 6547 }, { "epoch": 0.9875574994344318, "grad_norm": 0.25317069918610413, "learning_rate": 5.016982369110933e-08, "loss": 1.56, "step": 6548 }, { "epoch": 0.9877083176231053, "grad_norm": 0.24608514103588505, "learning_rate": 5.016570741160703e-08, "loss": 1.5486, "step": 6549 }, { "epoch": 0.9878591358117789, "grad_norm": 0.2647537292214903, "learning_rate": 5.0161641613951295e-08, "loss": 1.5151, "step": 6550 }, { "epoch": 0.9880099540004524, "grad_norm": 0.24074774570637517, "learning_rate": 5.015762629905507e-08, "loss": 1.5666, "step": 6551 }, { "epoch": 0.988160772189126, "grad_norm": 0.26007525915617, "learning_rate": 5.0153661467819845e-08, "loss": 1.5034, "step": 6552 }, { "epoch": 0.9883115903777996, "grad_norm": 0.2516640942926316, "learning_rate": 5.014974712113589e-08, "loss": 1.5718, "step": 6553 }, { "epoch": 0.9884624085664732, "grad_norm": 0.24720404120112693, "learning_rate": 5.014588325988206e-08, "loss": 1.5559, "step": 6554 }, { "epoch": 0.9886132267551466, "grad_norm": 0.24841952692707817, "learning_rate": 5.0142069884925905e-08, "loss": 1.5363, "step": 6555 }, { "epoch": 0.9887640449438202, "grad_norm": 0.2486734212115933, "learning_rate": 5.013830699712365e-08, "loss": 1.4945, "step": 6556 }, { "epoch": 0.9889148631324938, "grad_norm": 0.24751562722403198, "learning_rate": 5.0134594597320185e-08, "loss": 1.6056, "step": 6557 }, { "epoch": 0.9890656813211673, "grad_norm": 0.261117367244557, "learning_rate": 5.0130932686349036e-08, "loss": 1.4504, "step": 6558 }, { "epoch": 0.9892164995098409, "grad_norm": 0.26202295217714505, "learning_rate": 5.012732126503238e-08, "loss": 1.5029, "step": 6559 }, { "epoch": 0.9893673176985145, "grad_norm": 0.24211227802711202, "learning_rate": 5.012376033418115e-08, "loss": 1.6037, "step": 6560 }, { "epoch": 0.989518135887188, "grad_norm": 0.26180647304427285, "learning_rate": 5.012024989459483e-08, "loss": 1.5926, "step": 6561 }, { "epoch": 0.9896689540758615, "grad_norm": 0.4177446113873971, "learning_rate": 5.011678994706165e-08, "loss": 1.542, "step": 6562 }, { "epoch": 0.9898197722645351, "grad_norm": 0.2539076636184375, "learning_rate": 5.011338049235842e-08, "loss": 1.6005, "step": 6563 }, { "epoch": 0.9899705904532087, "grad_norm": 0.2615796253744187, "learning_rate": 5.011002153125071e-08, "loss": 1.5413, "step": 6564 }, { "epoch": 0.9901214086418823, "grad_norm": 0.2960432928116584, "learning_rate": 5.010671306449267e-08, "loss": 1.631, "step": 6565 }, { "epoch": 0.9902722268305557, "grad_norm": 0.24457914186893268, "learning_rate": 5.010345509282717e-08, "loss": 1.5597, "step": 6566 }, { "epoch": 0.9904230450192293, "grad_norm": 0.24825750469450225, "learning_rate": 5.010024761698572e-08, "loss": 1.5476, "step": 6567 }, { "epoch": 0.9905738632079029, "grad_norm": 0.2572946824640707, "learning_rate": 5.0097090637688494e-08, "loss": 1.5519, "step": 6568 }, { "epoch": 0.9907246813965764, "grad_norm": 0.2859707766855372, "learning_rate": 5.0093984155644316e-08, "loss": 1.6001, "step": 6569 }, { "epoch": 0.99087549958525, "grad_norm": 0.24960053608418814, "learning_rate": 5.009092817155066e-08, "loss": 1.6219, "step": 6570 }, { "epoch": 0.9910263177739236, "grad_norm": 0.24169959370913324, "learning_rate": 5.008792268609373e-08, "loss": 1.5545, "step": 6571 }, { "epoch": 0.991177135962597, "grad_norm": 0.26847186988473953, "learning_rate": 5.00849676999483e-08, "loss": 1.6139, "step": 6572 }, { "epoch": 0.9913279541512706, "grad_norm": 0.24507802090346645, "learning_rate": 5.0082063213777876e-08, "loss": 1.5341, "step": 6573 }, { "epoch": 0.9914787723399442, "grad_norm": 0.24703925950319292, "learning_rate": 5.007920922823461e-08, "loss": 1.5049, "step": 6574 }, { "epoch": 0.9916295905286178, "grad_norm": 0.25278765284185717, "learning_rate": 5.007640574395928e-08, "loss": 1.5668, "step": 6575 }, { "epoch": 0.9917804087172913, "grad_norm": 0.25022451568994536, "learning_rate": 5.007365276158134e-08, "loss": 1.5477, "step": 6576 }, { "epoch": 0.9919312269059649, "grad_norm": 0.26152573903290816, "learning_rate": 5.0070950281718946e-08, "loss": 1.5317, "step": 6577 }, { "epoch": 0.9920820450946384, "grad_norm": 0.3338312726106041, "learning_rate": 5.006829830497889e-08, "loss": 1.5627, "step": 6578 }, { "epoch": 0.992232863283312, "grad_norm": 0.242145683944422, "learning_rate": 5.006569683195659e-08, "loss": 1.5797, "step": 6579 }, { "epoch": 0.9923836814719855, "grad_norm": 0.2458225912180188, "learning_rate": 5.006314586323615e-08, "loss": 1.4727, "step": 6580 }, { "epoch": 0.9925344996606591, "grad_norm": 0.2600251176366966, "learning_rate": 5.006064539939035e-08, "loss": 1.57, "step": 6581 }, { "epoch": 0.9926853178493327, "grad_norm": 0.24992863036321655, "learning_rate": 5.005819544098063e-08, "loss": 1.6949, "step": 6582 }, { "epoch": 0.9928361360380061, "grad_norm": 0.2593715127852107, "learning_rate": 5.005579598855705e-08, "loss": 1.591, "step": 6583 }, { "epoch": 0.9929869542266797, "grad_norm": 0.2599446696035754, "learning_rate": 5.005344704265839e-08, "loss": 1.585, "step": 6584 }, { "epoch": 0.9931377724153533, "grad_norm": 0.3857416727018453, "learning_rate": 5.005114860381201e-08, "loss": 1.5521, "step": 6585 }, { "epoch": 0.9932885906040269, "grad_norm": 0.25227183012276794, "learning_rate": 5.004890067253402e-08, "loss": 1.52, "step": 6586 }, { "epoch": 0.9934394087927004, "grad_norm": 0.270258631240915, "learning_rate": 5.0046703249329124e-08, "loss": 1.5643, "step": 6587 }, { "epoch": 0.993590226981374, "grad_norm": 0.2996266870859598, "learning_rate": 5.004455633469073e-08, "loss": 1.5496, "step": 6588 }, { "epoch": 0.9937410451700475, "grad_norm": 0.25308604768433385, "learning_rate": 5.0042459929100854e-08, "loss": 1.5256, "step": 6589 }, { "epoch": 0.993891863358721, "grad_norm": 0.25207421806595026, "learning_rate": 5.00404140330302e-08, "loss": 1.5814, "step": 6590 }, { "epoch": 0.9940426815473946, "grad_norm": 0.2695325793514296, "learning_rate": 5.003841864693814e-08, "loss": 1.6237, "step": 6591 }, { "epoch": 0.9941934997360682, "grad_norm": 0.620517491915544, "learning_rate": 5.003647377127272e-08, "loss": 1.5484, "step": 6592 }, { "epoch": 0.9943443179247418, "grad_norm": 0.2450898709745292, "learning_rate": 5.003457940647057e-08, "loss": 1.5471, "step": 6593 }, { "epoch": 0.9944951361134152, "grad_norm": 0.2587619496177874, "learning_rate": 5.0032735552957104e-08, "loss": 1.5542, "step": 6594 }, { "epoch": 0.9946459543020888, "grad_norm": 0.25998338882700783, "learning_rate": 5.003094221114625e-08, "loss": 1.6032, "step": 6595 }, { "epoch": 0.9947967724907624, "grad_norm": 0.2450711120072768, "learning_rate": 5.00291993814407e-08, "loss": 1.5722, "step": 6596 }, { "epoch": 0.994947590679436, "grad_norm": 0.29549061519392755, "learning_rate": 5.0027507064231754e-08, "loss": 1.5365, "step": 6597 }, { "epoch": 0.9950984088681095, "grad_norm": 0.24591074110055633, "learning_rate": 5.0025865259899395e-08, "loss": 1.5489, "step": 6598 }, { "epoch": 0.9952492270567831, "grad_norm": 0.2536244535593241, "learning_rate": 5.002427396881227e-08, "loss": 1.5374, "step": 6599 }, { "epoch": 0.9954000452454566, "grad_norm": 0.25277275561801404, "learning_rate": 5.0022733191327645e-08, "loss": 1.5392, "step": 6600 }, { "epoch": 0.9955508634341301, "grad_norm": 0.24256667203681012, "learning_rate": 5.0021242927791477e-08, "loss": 1.5, "step": 6601 }, { "epoch": 0.9957016816228037, "grad_norm": 0.262718000941228, "learning_rate": 5.001980317853838e-08, "loss": 1.4774, "step": 6602 }, { "epoch": 0.9958524998114773, "grad_norm": 0.26014642746601724, "learning_rate": 5.0018413943891636e-08, "loss": 1.5499, "step": 6603 }, { "epoch": 0.9960033180001509, "grad_norm": 0.27238693268092284, "learning_rate": 5.001707522416311e-08, "loss": 1.4962, "step": 6604 }, { "epoch": 0.9961541361888244, "grad_norm": 0.36523839539495767, "learning_rate": 5.001578701965347e-08, "loss": 1.5421, "step": 6605 }, { "epoch": 0.9963049543774979, "grad_norm": 0.26968890414008384, "learning_rate": 5.0014549330651884e-08, "loss": 1.5056, "step": 6606 }, { "epoch": 0.9964557725661715, "grad_norm": 0.25810388348729857, "learning_rate": 5.0013362157436276e-08, "loss": 1.4773, "step": 6607 }, { "epoch": 0.996606590754845, "grad_norm": 0.27029654290389415, "learning_rate": 5.0012225500273184e-08, "loss": 1.6149, "step": 6608 }, { "epoch": 0.9967574089435186, "grad_norm": 0.289186425325794, "learning_rate": 5.001113935941785e-08, "loss": 1.5884, "step": 6609 }, { "epoch": 0.9969082271321922, "grad_norm": 0.25404931837643585, "learning_rate": 5.001010373511414e-08, "loss": 1.4801, "step": 6610 }, { "epoch": 0.9970590453208656, "grad_norm": 0.2837220956254411, "learning_rate": 5.0009118627594556e-08, "loss": 1.5325, "step": 6611 }, { "epoch": 0.9972098635095392, "grad_norm": 0.2474702028118016, "learning_rate": 5.000818403708031e-08, "loss": 1.5786, "step": 6612 }, { "epoch": 0.9973606816982128, "grad_norm": 0.2526014041177881, "learning_rate": 5.000729996378121e-08, "loss": 1.5544, "step": 6613 }, { "epoch": 0.9975114998868864, "grad_norm": 0.26273147726823376, "learning_rate": 5.000646640789582e-08, "loss": 1.4948, "step": 6614 }, { "epoch": 0.9976623180755599, "grad_norm": 0.2950688429471966, "learning_rate": 5.000568336961123e-08, "loss": 1.5897, "step": 6615 }, { "epoch": 0.9978131362642335, "grad_norm": 0.25656955862427994, "learning_rate": 5.0004950849103275e-08, "loss": 1.5216, "step": 6616 }, { "epoch": 0.997963954452907, "grad_norm": 0.2862369388063159, "learning_rate": 5.000426884653646e-08, "loss": 1.5406, "step": 6617 }, { "epoch": 0.9981147726415805, "grad_norm": 0.34020761343391465, "learning_rate": 5.000363736206387e-08, "loss": 1.4978, "step": 6618 }, { "epoch": 0.9982655908302541, "grad_norm": 0.2857148184005516, "learning_rate": 5.00030563958273e-08, "loss": 1.5223, "step": 6619 }, { "epoch": 0.9984164090189277, "grad_norm": 0.30446226275025295, "learning_rate": 5.0002525947957226e-08, "loss": 1.5554, "step": 6620 }, { "epoch": 0.9985672272076013, "grad_norm": 0.2615054343779194, "learning_rate": 5.000204601857272e-08, "loss": 1.5789, "step": 6621 }, { "epoch": 0.9987180453962748, "grad_norm": 0.27178486454582657, "learning_rate": 5.0001616607781566e-08, "loss": 1.5491, "step": 6622 }, { "epoch": 0.9988688635849483, "grad_norm": 0.25279549756065184, "learning_rate": 5.000123771568012e-08, "loss": 1.5493, "step": 6623 }, { "epoch": 0.9990196817736219, "grad_norm": 0.45219351448552675, "learning_rate": 5.0000909342353545e-08, "loss": 1.5669, "step": 6624 }, { "epoch": 0.9991704999622955, "grad_norm": 0.273081492101918, "learning_rate": 5.0000631487875463e-08, "loss": 1.5694, "step": 6625 }, { "epoch": 0.999321318150969, "grad_norm": 0.2470353500890094, "learning_rate": 5.000040415230837e-08, "loss": 1.5355, "step": 6626 }, { "epoch": 0.9994721363396426, "grad_norm": 0.2749090211943406, "learning_rate": 5.000022733570323e-08, "loss": 1.5207, "step": 6627 }, { "epoch": 0.9996229545283161, "grad_norm": 0.2488845965843425, "learning_rate": 5.000010103809979e-08, "loss": 1.564, "step": 6628 }, { "epoch": 0.9997737727169896, "grad_norm": 0.24595496745924694, "learning_rate": 5.000002525952637e-08, "loss": 1.5456, "step": 6629 }, { "epoch": 0.9999245909056632, "grad_norm": 0.2667207051411267, "learning_rate": 5e-08, "loss": 1.5243, "step": 6630 } ], "logging_steps": 1, "max_steps": 6630, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1326, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 8649147427061760.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }