YufeiWeng commited on
Commit
27b4975
·
verified ·
1 Parent(s): 8b2c166

End of training

Browse files
README.md CHANGED
@@ -1,6 +1,8 @@
1
  ---
2
  base_model: microsoft/dit-base-finetuned-rvlcdip
3
  tags:
 
 
4
  - generated_from_trainer
5
  metrics:
6
  - f1
@@ -16,8 +18,8 @@ should probably proofread and complete it, then remove this comment. -->
16
 
17
  This model is a fine-tuned version of [microsoft/dit-base-finetuned-rvlcdip](https://huggingface.co/microsoft/dit-base-finetuned-rvlcdip) on an unknown dataset.
18
  It achieves the following results on the evaluation set:
19
- - Loss: 0.2283
20
- - F1: 0.2395
21
 
22
  ## Model description
23
 
 
1
  ---
2
  base_model: microsoft/dit-base-finetuned-rvlcdip
3
  tags:
4
+ - image-classification
5
+ - vision
6
  - generated_from_trainer
7
  metrics:
8
  - f1
 
18
 
19
  This model is a fine-tuned version of [microsoft/dit-base-finetuned-rvlcdip](https://huggingface.co/microsoft/dit-base-finetuned-rvlcdip) on an unknown dataset.
20
  It achieves the following results on the evaluation set:
21
+ - Loss: 0.1870
22
+ - F1: 0.3826
23
 
24
  ## Model description
25
 
all_results.json CHANGED
@@ -1,13 +1,13 @@
1
  {
2
- "epoch": 0.6942691239585963,
3
- "eval_f1": 0.6133951445650848,
4
- "eval_loss": 0.04044894501566887,
5
- "eval_runtime": 1162.523,
6
- "eval_samples_per_second": 177.426,
7
- "eval_steps_per_second": 2.772,
8
- "total_flos": 1.3639932886745088e+19,
9
- "train_loss": 0.019194319985129618,
10
- "train_runtime": 18605.0451,
11
- "train_samples_per_second": 34.399,
12
- "train_steps_per_second": 0.537
13
  }
 
1
  {
2
+ "epoch": 2.271580010095911,
3
+ "eval_f1": 0.38255795104555035,
4
+ "eval_loss": 0.18703292310237885,
5
+ "eval_runtime": 571.9074,
6
+ "eval_samples_per_second": 360.656,
7
+ "eval_steps_per_second": 2.819,
8
+ "total_flos": 4.4623040438986555e+19,
9
+ "train_loss": 0.13670311906602647,
10
+ "train_runtime": 12745.6466,
11
+ "train_samples_per_second": 301.279,
12
+ "train_steps_per_second": 2.354
13
  }
eval_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.6942691239585963,
3
- "eval_f1": 0.6133951445650848,
4
- "eval_loss": 0.04044894501566887,
5
- "eval_runtime": 1162.523,
6
- "eval_samples_per_second": 177.426,
7
- "eval_steps_per_second": 2.772
8
  }
 
1
  {
2
+ "epoch": 2.271580010095911,
3
+ "eval_f1": 0.38255795104555035,
4
+ "eval_loss": 0.18703292310237885,
5
+ "eval_runtime": 571.9074,
6
+ "eval_samples_per_second": 360.656,
7
+ "eval_steps_per_second": 2.819
8
  }
p_object.json CHANGED
The diff for this file is too large to render. See raw diff
 
prediction_reference.json CHANGED
The diff for this file is too large to render. See raw diff
 
train_results.json CHANGED
@@ -1,8 +1,8 @@
1
  {
2
- "epoch": 0.6942691239585963,
3
- "total_flos": 1.3639932886745088e+19,
4
- "train_loss": 0.019194319985129618,
5
- "train_runtime": 18605.0451,
6
- "train_samples_per_second": 34.399,
7
- "train_steps_per_second": 0.537
8
  }
 
1
  {
2
+ "epoch": 2.271580010095911,
3
+ "total_flos": 4.4623040438986555e+19,
4
+ "train_loss": 0.13670311906602647,
5
+ "train_runtime": 12745.6466,
6
+ "train_samples_per_second": 301.279,
7
+ "train_steps_per_second": 2.354
8
  }
trainer_state.json CHANGED
@@ -1,2452 +1,141 @@
1
  {
2
- "best_metric": 0.6133951445650848,
3
- "best_model_checkpoint": "./step_test_microsoft_dit/checkpoint-2500",
4
- "epoch": 0.6942691239585963,
5
- "eval_steps": 50,
6
- "global_step": 2750,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
- "epoch": 0.0025246149962130774,
13
- "grad_norm": 1.0554239749908447,
14
- "learning_rate": 2.997e-05,
15
- "loss": 0.3197,
16
- "step": 10
17
- },
18
- {
19
- "epoch": 0.005049229992426155,
20
- "grad_norm": 1.5600422620773315,
21
- "learning_rate": 2.994e-05,
22
- "loss": 0.2047,
23
- "step": 20
24
- },
25
- {
26
- "epoch": 0.007573844988639233,
27
- "grad_norm": 2.1541621685028076,
28
- "learning_rate": 2.991e-05,
29
- "loss": 0.1528,
30
- "step": 30
31
- },
32
- {
33
- "epoch": 0.01009845998485231,
34
- "grad_norm": 1.805535078048706,
35
- "learning_rate": 2.9880000000000002e-05,
36
- "loss": 0.1252,
37
- "step": 40
38
- },
39
- {
40
- "epoch": 0.012623074981065387,
41
- "grad_norm": 1.1236392259597778,
42
- "learning_rate": 2.985e-05,
43
- "loss": 0.1165,
44
- "step": 50
45
- },
46
- {
47
- "epoch": 0.012623074981065387,
48
- "eval_f1": 0.4177168854259269,
49
- "eval_loss": 0.06423712521791458,
50
- "eval_runtime": 1142.2038,
51
- "eval_samples_per_second": 180.582,
52
- "eval_steps_per_second": 2.822,
53
- "step": 50
54
- },
55
- {
56
- "epoch": 0.015147689977278465,
57
- "grad_norm": 1.1924934387207031,
58
- "learning_rate": 2.982e-05,
59
- "loss": 0.1029,
60
- "step": 60
61
- },
62
- {
63
- "epoch": 0.017672304973491544,
64
- "grad_norm": 1.225701928138733,
65
- "learning_rate": 2.979e-05,
66
- "loss": 0.117,
67
- "step": 70
68
- },
69
- {
70
- "epoch": 0.02019691996970462,
71
- "grad_norm": 2.702486515045166,
72
- "learning_rate": 2.976e-05,
73
- "loss": 0.1103,
74
- "step": 80
75
- },
76
- {
77
- "epoch": 0.022721534965917698,
78
- "grad_norm": 2.0278918743133545,
79
- "learning_rate": 2.973e-05,
80
- "loss": 0.1054,
81
- "step": 90
82
- },
83
- {
84
- "epoch": 0.025246149962130773,
85
- "grad_norm": 1.9288796186447144,
86
- "learning_rate": 2.97e-05,
87
- "loss": 0.0942,
88
- "step": 100
89
- },
90
- {
91
- "epoch": 0.025246149962130773,
92
- "eval_f1": 0.4771817453963171,
93
- "eval_loss": 0.048530641943216324,
94
- "eval_runtime": 1007.4651,
95
- "eval_samples_per_second": 204.734,
96
- "eval_steps_per_second": 3.199,
97
- "step": 100
98
- },
99
- {
100
- "epoch": 0.027770764958343852,
101
- "grad_norm": 1.4688999652862549,
102
- "learning_rate": 2.967e-05,
103
- "loss": 0.0992,
104
- "step": 110
105
- },
106
- {
107
- "epoch": 0.03029537995455693,
108
- "grad_norm": 1.1097605228424072,
109
- "learning_rate": 2.964e-05,
110
- "loss": 0.1126,
111
- "step": 120
112
- },
113
- {
114
- "epoch": 0.03281999495077001,
115
- "grad_norm": 1.0353784561157227,
116
- "learning_rate": 2.961e-05,
117
- "loss": 0.0949,
118
- "step": 130
119
- },
120
- {
121
- "epoch": 0.03534460994698309,
122
- "grad_norm": 1.7303999662399292,
123
- "learning_rate": 2.958e-05,
124
- "loss": 0.1025,
125
- "step": 140
126
- },
127
- {
128
- "epoch": 0.03786922494319616,
129
- "grad_norm": 1.1177138090133667,
130
- "learning_rate": 2.955e-05,
131
- "loss": 0.1076,
132
- "step": 150
133
- },
134
- {
135
- "epoch": 0.03786922494319616,
136
- "eval_f1": 0.46432628333423487,
137
- "eval_loss": 0.05836363136768341,
138
- "eval_runtime": 980.1012,
139
- "eval_samples_per_second": 210.45,
140
- "eval_steps_per_second": 3.288,
141
- "step": 150
142
- },
143
- {
144
- "epoch": 0.04039383993940924,
145
- "grad_norm": 1.1965147256851196,
146
- "learning_rate": 2.9520000000000002e-05,
147
- "loss": 0.0961,
148
- "step": 160
149
- },
150
- {
151
- "epoch": 0.04291845493562232,
152
- "grad_norm": 1.0545780658721924,
153
- "learning_rate": 2.949e-05,
154
- "loss": 0.104,
155
- "step": 170
156
- },
157
- {
158
- "epoch": 0.045443069931835396,
159
- "grad_norm": 1.8348199129104614,
160
- "learning_rate": 2.946e-05,
161
- "loss": 0.0932,
162
- "step": 180
163
- },
164
- {
165
- "epoch": 0.047967684928048475,
166
- "grad_norm": 1.8478541374206543,
167
- "learning_rate": 2.943e-05,
168
- "loss": 0.1069,
169
- "step": 190
170
- },
171
- {
172
- "epoch": 0.05049229992426155,
173
- "grad_norm": 0.9377999305725098,
174
- "learning_rate": 2.94e-05,
175
- "loss": 0.1103,
176
- "step": 200
177
- },
178
- {
179
- "epoch": 0.05049229992426155,
180
- "eval_f1": NaN,
181
- "eval_loss": 0.044557176530361176,
182
- "eval_runtime": 978.7525,
183
- "eval_samples_per_second": 210.74,
184
- "eval_steps_per_second": 3.293,
185
- "step": 200
186
- },
187
- {
188
- "epoch": 0.053016914920474625,
189
- "grad_norm": 1.6204830408096313,
190
- "learning_rate": 2.9370000000000002e-05,
191
- "loss": 0.1019,
192
- "step": 210
193
- },
194
- {
195
- "epoch": 0.055541529916687704,
196
- "grad_norm": 1.1411000490188599,
197
- "learning_rate": 2.934e-05,
198
- "loss": 0.0969,
199
- "step": 220
200
- },
201
- {
202
- "epoch": 0.05806614491290078,
203
- "grad_norm": 1.1179866790771484,
204
- "learning_rate": 2.931e-05,
205
- "loss": 0.1031,
206
- "step": 230
207
- },
208
- {
209
- "epoch": 0.06059075990911386,
210
- "grad_norm": 1.2155176401138306,
211
- "learning_rate": 2.928e-05,
212
- "loss": 0.0851,
213
- "step": 240
214
- },
215
- {
216
- "epoch": 0.06311537490532694,
217
- "grad_norm": 1.4578701257705688,
218
- "learning_rate": 2.925e-05,
219
- "loss": 0.0873,
220
- "step": 250
221
- },
222
- {
223
- "epoch": 0.06311537490532694,
224
- "eval_f1": 0.5313367950730588,
225
- "eval_loss": 0.05184657499194145,
226
- "eval_runtime": 982.8389,
227
- "eval_samples_per_second": 209.863,
228
- "eval_steps_per_second": 3.279,
229
- "step": 250
230
- },
231
- {
232
- "epoch": 0.06563998990154002,
233
- "grad_norm": 1.2894303798675537,
234
- "learning_rate": 2.922e-05,
235
- "loss": 0.0876,
236
- "step": 260
237
- },
238
- {
239
- "epoch": 0.0681646048977531,
240
- "grad_norm": 0.8404099941253662,
241
- "learning_rate": 2.919e-05,
242
- "loss": 0.0904,
243
- "step": 270
244
- },
245
- {
246
- "epoch": 0.07068921989396618,
247
- "grad_norm": 2.0062506198883057,
248
- "learning_rate": 2.916e-05,
249
- "loss": 0.1009,
250
- "step": 280
251
- },
252
- {
253
- "epoch": 0.07321383489017924,
254
- "grad_norm": 0.8900242447853088,
255
- "learning_rate": 2.913e-05,
256
- "loss": 0.0925,
257
- "step": 290
258
- },
259
- {
260
- "epoch": 0.07573844988639232,
261
- "grad_norm": 1.051013708114624,
262
- "learning_rate": 2.91e-05,
263
- "loss": 0.1053,
264
- "step": 300
265
- },
266
- {
267
- "epoch": 0.07573844988639232,
268
- "eval_f1": 0.532925682031985,
269
- "eval_loss": 0.07359323650598526,
270
- "eval_runtime": 980.9407,
271
- "eval_samples_per_second": 210.27,
272
- "eval_steps_per_second": 3.286,
273
- "step": 300
274
- },
275
- {
276
- "epoch": 0.0782630648826054,
277
- "grad_norm": 0.7765111327171326,
278
- "learning_rate": 2.907e-05,
279
- "loss": 0.0848,
280
- "step": 310
281
- },
282
- {
283
- "epoch": 0.08078767987881848,
284
- "grad_norm": 0.9605777859687805,
285
- "learning_rate": 2.904e-05,
286
- "loss": 0.0746,
287
- "step": 320
288
- },
289
- {
290
- "epoch": 0.08331229487503156,
291
- "grad_norm": 1.9086962938308716,
292
- "learning_rate": 2.901e-05,
293
- "loss": 0.1023,
294
- "step": 330
295
- },
296
- {
297
- "epoch": 0.08583690987124463,
298
- "grad_norm": 1.5782345533370972,
299
- "learning_rate": 2.898e-05,
300
- "loss": 0.0751,
301
- "step": 340
302
- },
303
- {
304
- "epoch": 0.08836152486745771,
305
- "grad_norm": 1.2298818826675415,
306
- "learning_rate": 2.895e-05,
307
- "loss": 0.0797,
308
- "step": 350
309
- },
310
- {
311
- "epoch": 0.08836152486745771,
312
- "eval_f1": 0.5325518588749066,
313
- "eval_loss": 0.07257544994354248,
314
- "eval_runtime": 979.2135,
315
- "eval_samples_per_second": 210.64,
316
- "eval_steps_per_second": 3.291,
317
- "step": 350
318
- },
319
- {
320
- "epoch": 0.09088613986367079,
321
- "grad_norm": 1.1932893991470337,
322
- "learning_rate": 2.892e-05,
323
- "loss": 0.0803,
324
- "step": 360
325
- },
326
- {
327
- "epoch": 0.09341075485988387,
328
- "grad_norm": 0.896007776260376,
329
- "learning_rate": 2.889e-05,
330
- "loss": 0.088,
331
- "step": 370
332
- },
333
- {
334
- "epoch": 0.09593536985609695,
335
- "grad_norm": 2.385890483856201,
336
- "learning_rate": 2.8859999999999998e-05,
337
- "loss": 0.0886,
338
- "step": 380
339
- },
340
- {
341
- "epoch": 0.09845998485231003,
342
- "grad_norm": 0.966077446937561,
343
- "learning_rate": 2.883e-05,
344
- "loss": 0.1038,
345
- "step": 390
346
- },
347
- {
348
- "epoch": 0.1009845998485231,
349
- "grad_norm": 0.969159722328186,
350
- "learning_rate": 2.88e-05,
351
- "loss": 0.0857,
352
- "step": 400
353
- },
354
- {
355
- "epoch": 0.1009845998485231,
356
- "eval_f1": 0.5497736226259776,
357
- "eval_loss": 0.06929118931293488,
358
- "eval_runtime": 978.0405,
359
- "eval_samples_per_second": 210.893,
360
- "eval_steps_per_second": 3.295,
361
- "step": 400
362
- },
363
- {
364
- "epoch": 0.10350921484473617,
365
- "grad_norm": 0.8633397817611694,
366
- "learning_rate": 2.877e-05,
367
- "loss": 0.0895,
368
- "step": 410
369
- },
370
- {
371
- "epoch": 0.10603382984094925,
372
- "grad_norm": 1.163271188735962,
373
- "learning_rate": 2.874e-05,
374
- "loss": 0.0861,
375
- "step": 420
376
- },
377
- {
378
- "epoch": 0.10855844483716233,
379
- "grad_norm": 1.102964997291565,
380
- "learning_rate": 2.871e-05,
381
- "loss": 0.0962,
382
- "step": 430
383
- },
384
- {
385
- "epoch": 0.11108305983337541,
386
- "grad_norm": 1.520044207572937,
387
- "learning_rate": 2.868e-05,
388
- "loss": 0.0981,
389
- "step": 440
390
- },
391
- {
392
- "epoch": 0.11360767482958849,
393
- "grad_norm": 1.8637338876724243,
394
- "learning_rate": 2.865e-05,
395
- "loss": 0.0885,
396
- "step": 450
397
- },
398
- {
399
- "epoch": 0.11360767482958849,
400
- "eval_f1": NaN,
401
- "eval_loss": 0.09174469113349915,
402
- "eval_runtime": 1032.8967,
403
- "eval_samples_per_second": 199.693,
404
- "eval_steps_per_second": 3.12,
405
- "step": 450
406
- },
407
- {
408
- "epoch": 0.11613228982580157,
409
- "grad_norm": 1.1974824666976929,
410
- "learning_rate": 2.862e-05,
411
- "loss": 0.0784,
412
- "step": 460
413
- },
414
- {
415
- "epoch": 0.11865690482201464,
416
- "grad_norm": 1.6933320760726929,
417
- "learning_rate": 2.859e-05,
418
- "loss": 0.078,
419
- "step": 470
420
- },
421
- {
422
- "epoch": 0.12118151981822772,
423
- "grad_norm": 1.7774609327316284,
424
- "learning_rate": 2.856e-05,
425
- "loss": 0.0715,
426
- "step": 480
427
- },
428
- {
429
- "epoch": 0.1237061348144408,
430
- "grad_norm": 0.7675666213035583,
431
- "learning_rate": 2.853e-05,
432
- "loss": 0.0817,
433
- "step": 490
434
- },
435
- {
436
- "epoch": 0.12623074981065388,
437
- "grad_norm": 1.169325590133667,
438
- "learning_rate": 2.8499999999999998e-05,
439
- "loss": 0.102,
440
- "step": 500
441
- },
442
- {
443
- "epoch": 0.12623074981065388,
444
- "eval_f1": 0.5648781658864481,
445
- "eval_loss": 0.057994671165943146,
446
- "eval_runtime": 967.2924,
447
- "eval_samples_per_second": 213.236,
448
- "eval_steps_per_second": 3.332,
449
- "step": 500
450
- },
451
- {
452
- "epoch": 0.12875536480686695,
453
- "grad_norm": 0.9567933678627014,
454
- "learning_rate": 2.847e-05,
455
- "loss": 0.0762,
456
- "step": 510
457
- },
458
- {
459
- "epoch": 0.13127997980308004,
460
- "grad_norm": 0.7539889216423035,
461
- "learning_rate": 2.844e-05,
462
- "loss": 0.0655,
463
- "step": 520
464
- },
465
- {
466
- "epoch": 0.1338045947992931,
467
- "grad_norm": 1.873833179473877,
468
- "learning_rate": 2.841e-05,
469
- "loss": 0.0747,
470
- "step": 530
471
- },
472
- {
473
- "epoch": 0.1363292097955062,
474
- "grad_norm": 0.7834559082984924,
475
- "learning_rate": 2.838e-05,
476
- "loss": 0.0923,
477
- "step": 540
478
- },
479
- {
480
- "epoch": 0.13885382479171926,
481
- "grad_norm": 0.6193771362304688,
482
- "learning_rate": 2.8349999999999998e-05,
483
- "loss": 0.0716,
484
- "step": 550
485
- },
486
- {
487
- "epoch": 0.13885382479171926,
488
- "eval_f1": 0.538135593220339,
489
- "eval_loss": 0.07973095029592514,
490
- "eval_runtime": 974.1593,
491
- "eval_samples_per_second": 211.733,
492
- "eval_steps_per_second": 3.308,
493
- "step": 550
494
- },
495
- {
496
- "epoch": 0.14137843978793235,
497
- "grad_norm": 1.1256766319274902,
498
- "learning_rate": 2.832e-05,
499
- "loss": 0.0798,
500
- "step": 560
501
- },
502
- {
503
- "epoch": 0.14390305478414542,
504
- "grad_norm": 1.0669515132904053,
505
- "learning_rate": 2.829e-05,
506
- "loss": 0.0795,
507
- "step": 570
508
- },
509
- {
510
- "epoch": 0.14642766978035848,
511
- "grad_norm": 1.018234133720398,
512
- "learning_rate": 2.826e-05,
513
- "loss": 0.073,
514
- "step": 580
515
- },
516
- {
517
- "epoch": 0.14895228477657158,
518
- "grad_norm": 1.2367616891860962,
519
- "learning_rate": 2.823e-05,
520
- "loss": 0.0879,
521
- "step": 590
522
- },
523
- {
524
- "epoch": 0.15147689977278464,
525
- "grad_norm": 1.5840317010879517,
526
- "learning_rate": 2.8199999999999998e-05,
527
- "loss": 0.0854,
528
- "step": 600
529
- },
530
- {
531
- "epoch": 0.15147689977278464,
532
- "eval_f1": 0.571752762018513,
533
- "eval_loss": 0.07439474016427994,
534
- "eval_runtime": 970.5653,
535
- "eval_samples_per_second": 212.517,
536
- "eval_steps_per_second": 3.321,
537
- "step": 600
538
- },
539
- {
540
- "epoch": 0.15400151476899773,
541
- "grad_norm": 0.5361483097076416,
542
- "learning_rate": 2.817e-05,
543
- "loss": 0.0854,
544
- "step": 610
545
- },
546
- {
547
- "epoch": 0.1565261297652108,
548
- "grad_norm": 0.9658698439598083,
549
- "learning_rate": 2.8139999999999998e-05,
550
- "loss": 0.095,
551
- "step": 620
552
- },
553
- {
554
- "epoch": 0.1590507447614239,
555
- "grad_norm": 0.820649266242981,
556
- "learning_rate": 2.8110000000000004e-05,
557
- "loss": 0.0921,
558
- "step": 630
559
- },
560
- {
561
- "epoch": 0.16157535975763695,
562
- "grad_norm": 1.1583890914916992,
563
- "learning_rate": 2.8080000000000002e-05,
564
- "loss": 0.077,
565
- "step": 640
566
- },
567
- {
568
- "epoch": 0.16409997475385005,
569
- "grad_norm": 0.8755506277084351,
570
- "learning_rate": 2.805e-05,
571
- "loss": 0.089,
572
- "step": 650
573
- },
574
- {
575
- "epoch": 0.16409997475385005,
576
- "eval_f1": 0.5789600675594161,
577
- "eval_loss": 0.0503680482506752,
578
- "eval_runtime": 976.7796,
579
- "eval_samples_per_second": 211.165,
580
- "eval_steps_per_second": 3.3,
581
- "step": 650
582
- },
583
- {
584
- "epoch": 0.1666245897500631,
585
- "grad_norm": 0.5073147416114807,
586
- "learning_rate": 2.8020000000000003e-05,
587
- "loss": 0.0784,
588
- "step": 660
589
- },
590
- {
591
- "epoch": 0.1691492047462762,
592
- "grad_norm": 1.0332393646240234,
593
- "learning_rate": 2.799e-05,
594
- "loss": 0.0906,
595
- "step": 670
596
- },
597
- {
598
- "epoch": 0.17167381974248927,
599
- "grad_norm": 1.1538151502609253,
600
- "learning_rate": 2.7960000000000003e-05,
601
- "loss": 0.0799,
602
- "step": 680
603
- },
604
- {
605
- "epoch": 0.17419843473870233,
606
- "grad_norm": 1.2075843811035156,
607
- "learning_rate": 2.7930000000000002e-05,
608
- "loss": 0.0795,
609
- "step": 690
610
- },
611
- {
612
- "epoch": 0.17672304973491543,
613
- "grad_norm": 2.1169683933258057,
614
- "learning_rate": 2.79e-05,
615
- "loss": 0.0721,
616
- "step": 700
617
- },
618
- {
619
- "epoch": 0.17672304973491543,
620
- "eval_f1": 0.5727175590644663,
621
- "eval_loss": 0.0618172287940979,
622
- "eval_runtime": 975.6558,
623
- "eval_samples_per_second": 211.409,
624
- "eval_steps_per_second": 3.303,
625
- "step": 700
626
- },
627
- {
628
- "epoch": 0.1792476647311285,
629
- "grad_norm": 1.3094089031219482,
630
- "learning_rate": 2.7870000000000003e-05,
631
- "loss": 0.0723,
632
- "step": 710
633
- },
634
- {
635
- "epoch": 0.18177227972734158,
636
- "grad_norm": 0.9937088489532471,
637
- "learning_rate": 2.784e-05,
638
- "loss": 0.0704,
639
- "step": 720
640
- },
641
- {
642
- "epoch": 0.18429689472355465,
643
- "grad_norm": 0.6464220881462097,
644
- "learning_rate": 2.7810000000000003e-05,
645
- "loss": 0.0731,
646
- "step": 730
647
- },
648
- {
649
- "epoch": 0.18682150971976774,
650
- "grad_norm": 0.5544419288635254,
651
- "learning_rate": 2.778e-05,
652
- "loss": 0.0894,
653
- "step": 740
654
- },
655
- {
656
- "epoch": 0.1893461247159808,
657
- "grad_norm": 0.6369556188583374,
658
- "learning_rate": 2.7750000000000004e-05,
659
- "loss": 0.0721,
660
- "step": 750
661
- },
662
- {
663
- "epoch": 0.1893461247159808,
664
- "eval_f1": 0.5904197411394702,
665
- "eval_loss": 0.07033708691596985,
666
- "eval_runtime": 967.5811,
667
- "eval_samples_per_second": 213.173,
668
- "eval_steps_per_second": 3.331,
669
- "step": 750
670
- },
671
- {
672
- "epoch": 0.1918707397121939,
673
- "grad_norm": 2.0700013637542725,
674
- "learning_rate": 2.7720000000000002e-05,
675
- "loss": 0.0831,
676
- "step": 760
677
- },
678
- {
679
- "epoch": 0.19439535470840696,
680
- "grad_norm": 0.765533983707428,
681
- "learning_rate": 2.769e-05,
682
- "loss": 0.0707,
683
- "step": 770
684
- },
685
- {
686
- "epoch": 0.19691996970462006,
687
- "grad_norm": 1.6104159355163574,
688
- "learning_rate": 2.7660000000000003e-05,
689
- "loss": 0.073,
690
- "step": 780
691
- },
692
- {
693
- "epoch": 0.19944458470083312,
694
- "grad_norm": 1.1069729328155518,
695
- "learning_rate": 2.763e-05,
696
- "loss": 0.0702,
697
- "step": 790
698
- },
699
- {
700
- "epoch": 0.2019691996970462,
701
- "grad_norm": 1.6577630043029785,
702
- "learning_rate": 2.7600000000000003e-05,
703
- "loss": 0.0865,
704
- "step": 800
705
- },
706
- {
707
- "epoch": 0.2019691996970462,
708
- "eval_f1": 0.5952780441035476,
709
- "eval_loss": 0.058820515871047974,
710
- "eval_runtime": 917.4267,
711
- "eval_samples_per_second": 224.827,
712
- "eval_steps_per_second": 3.513,
713
- "step": 800
714
- },
715
- {
716
- "epoch": 0.20449381469325928,
717
- "grad_norm": 1.5197840929031372,
718
- "learning_rate": 2.7570000000000002e-05,
719
- "loss": 0.0846,
720
- "step": 810
721
- },
722
- {
723
- "epoch": 0.20701842968947234,
724
- "grad_norm": 1.1758556365966797,
725
- "learning_rate": 2.754e-05,
726
- "loss": 0.0813,
727
- "step": 820
728
- },
729
- {
730
- "epoch": 0.20954304468568544,
731
- "grad_norm": 0.5016022324562073,
732
- "learning_rate": 2.7510000000000003e-05,
733
- "loss": 0.0718,
734
- "step": 830
735
- },
736
- {
737
- "epoch": 0.2120676596818985,
738
- "grad_norm": 1.3600627183914185,
739
- "learning_rate": 2.748e-05,
740
- "loss": 0.0942,
741
- "step": 840
742
- },
743
- {
744
- "epoch": 0.2145922746781116,
745
- "grad_norm": 0.6990534067153931,
746
- "learning_rate": 2.7450000000000003e-05,
747
- "loss": 0.0767,
748
- "step": 850
749
- },
750
- {
751
- "epoch": 0.2145922746781116,
752
- "eval_f1": 0.5918155918155918,
753
- "eval_loss": 0.04372716695070267,
754
- "eval_runtime": 913.4291,
755
- "eval_samples_per_second": 225.811,
756
- "eval_steps_per_second": 3.528,
757
- "step": 850
758
- },
759
- {
760
- "epoch": 0.21711688967432466,
761
- "grad_norm": 1.0468288660049438,
762
- "learning_rate": 2.7420000000000002e-05,
763
- "loss": 0.0805,
764
- "step": 860
765
- },
766
- {
767
- "epoch": 0.21964150467053775,
768
- "grad_norm": 1.2046771049499512,
769
- "learning_rate": 2.739e-05,
770
- "loss": 0.0879,
771
- "step": 870
772
- },
773
- {
774
- "epoch": 0.22216611966675082,
775
- "grad_norm": 0.9044977426528931,
776
- "learning_rate": 2.7360000000000002e-05,
777
- "loss": 0.0597,
778
- "step": 880
779
- },
780
- {
781
- "epoch": 0.2246907346629639,
782
- "grad_norm": 1.145572304725647,
783
- "learning_rate": 2.733e-05,
784
- "loss": 0.1007,
785
- "step": 890
786
- },
787
- {
788
- "epoch": 0.22721534965917697,
789
- "grad_norm": 1.058166742324829,
790
- "learning_rate": 2.7300000000000003e-05,
791
- "loss": 0.0773,
792
- "step": 900
793
- },
794
- {
795
- "epoch": 0.22721534965917697,
796
- "eval_f1": 0.5956852791878172,
797
- "eval_loss": 0.05675825849175453,
798
- "eval_runtime": 923.1927,
799
- "eval_samples_per_second": 223.422,
800
- "eval_steps_per_second": 3.491,
801
- "step": 900
802
- },
803
- {
804
- "epoch": 0.22973996465539007,
805
- "grad_norm": 0.7665570974349976,
806
- "learning_rate": 2.727e-05,
807
- "loss": 0.084,
808
- "step": 910
809
- },
810
- {
811
- "epoch": 0.23226457965160313,
812
- "grad_norm": 0.8884145021438599,
813
- "learning_rate": 2.724e-05,
814
- "loss": 0.0748,
815
- "step": 920
816
- },
817
- {
818
- "epoch": 0.2347891946478162,
819
- "grad_norm": 0.7132917046546936,
820
- "learning_rate": 2.7210000000000002e-05,
821
- "loss": 0.0861,
822
- "step": 930
823
- },
824
- {
825
- "epoch": 0.2373138096440293,
826
- "grad_norm": 1.3353750705718994,
827
- "learning_rate": 2.718e-05,
828
- "loss": 0.091,
829
- "step": 940
830
- },
831
- {
832
- "epoch": 0.23983842464024235,
833
- "grad_norm": 1.216691255569458,
834
- "learning_rate": 2.7150000000000003e-05,
835
- "loss": 0.0748,
836
- "step": 950
837
- },
838
- {
839
- "epoch": 0.23983842464024235,
840
- "eval_f1": 0.5942299042601041,
841
- "eval_loss": 0.04645048826932907,
842
- "eval_runtime": 919.478,
843
- "eval_samples_per_second": 224.325,
844
- "eval_steps_per_second": 3.505,
845
- "step": 950
846
- },
847
- {
848
- "epoch": 0.24236303963645545,
849
- "grad_norm": 1.0420501232147217,
850
- "learning_rate": 2.712e-05,
851
- "loss": 0.0953,
852
- "step": 960
853
- },
854
- {
855
- "epoch": 0.2448876546326685,
856
- "grad_norm": 1.1488158702850342,
857
- "learning_rate": 2.709e-05,
858
- "loss": 0.0796,
859
- "step": 970
860
- },
861
- {
862
- "epoch": 0.2474122696288816,
863
- "grad_norm": 0.7872379422187805,
864
- "learning_rate": 2.7060000000000002e-05,
865
- "loss": 0.0844,
866
- "step": 980
867
- },
868
- {
869
- "epoch": 0.24993688462509467,
870
- "grad_norm": 0.9102885127067566,
871
- "learning_rate": 2.703e-05,
872
- "loss": 0.0792,
873
- "step": 990
874
- },
875
- {
876
- "epoch": 0.25246149962130776,
877
- "grad_norm": 1.040650486946106,
878
- "learning_rate": 2.7000000000000002e-05,
879
- "loss": 0.0761,
880
- "step": 1000
881
- },
882
- {
883
- "epoch": 0.25246149962130776,
884
- "eval_f1": NaN,
885
- "eval_loss": 0.06595388799905777,
886
- "eval_runtime": 948.4123,
887
- "eval_samples_per_second": 217.481,
888
- "eval_steps_per_second": 3.398,
889
- "step": 1000
890
- },
891
- {
892
- "epoch": 0.25498611461752085,
893
- "grad_norm": 1.0717836618423462,
894
- "learning_rate": 2.697e-05,
895
- "loss": 0.0569,
896
- "step": 1010
897
- },
898
- {
899
- "epoch": 0.2575107296137339,
900
- "grad_norm": 0.7504699230194092,
901
- "learning_rate": 2.6940000000000003e-05,
902
- "loss": 0.072,
903
- "step": 1020
904
- },
905
- {
906
- "epoch": 0.260035344609947,
907
- "grad_norm": 0.9767778515815735,
908
- "learning_rate": 2.691e-05,
909
- "loss": 0.0658,
910
- "step": 1030
911
- },
912
- {
913
- "epoch": 0.2625599596061601,
914
- "grad_norm": 0.5905674695968628,
915
- "learning_rate": 2.688e-05,
916
- "loss": 0.0775,
917
- "step": 1040
918
- },
919
- {
920
- "epoch": 0.2650845746023731,
921
- "grad_norm": 1.6352293491363525,
922
- "learning_rate": 2.6850000000000002e-05,
923
- "loss": 0.0855,
924
- "step": 1050
925
- },
926
- {
927
- "epoch": 0.2650845746023731,
928
- "eval_f1": 0.5963938973647711,
929
- "eval_loss": 0.04910014942288399,
930
- "eval_runtime": 986.8376,
931
- "eval_samples_per_second": 209.013,
932
- "eval_steps_per_second": 3.266,
933
- "step": 1050
934
- },
935
- {
936
- "epoch": 0.2676091895985862,
937
- "grad_norm": 0.6634190082550049,
938
- "learning_rate": 2.682e-05,
939
- "loss": 0.0741,
940
- "step": 1060
941
- },
942
- {
943
- "epoch": 0.2701338045947993,
944
- "grad_norm": 0.5896914601325989,
945
- "learning_rate": 2.6790000000000003e-05,
946
- "loss": 0.0713,
947
- "step": 1070
948
- },
949
- {
950
- "epoch": 0.2726584195910124,
951
- "grad_norm": 1.3768564462661743,
952
- "learning_rate": 2.676e-05,
953
- "loss": 0.0684,
954
- "step": 1080
955
- },
956
- {
957
- "epoch": 0.27518303458722543,
958
- "grad_norm": 0.7323074340820312,
959
- "learning_rate": 2.673e-05,
960
- "loss": 0.084,
961
- "step": 1090
962
- },
963
- {
964
- "epoch": 0.2777076495834385,
965
- "grad_norm": 0.6660707592964172,
966
- "learning_rate": 2.6700000000000002e-05,
967
- "loss": 0.0832,
968
- "step": 1100
969
- },
970
- {
971
- "epoch": 0.2777076495834385,
972
- "eval_f1": 0.6048397002825205,
973
- "eval_loss": 0.049847185611724854,
974
- "eval_runtime": 967.6797,
975
- "eval_samples_per_second": 213.151,
976
- "eval_steps_per_second": 3.331,
977
- "step": 1100
978
- },
979
- {
980
- "epoch": 0.2802322645796516,
981
- "grad_norm": 1.425309419631958,
982
- "learning_rate": 2.667e-05,
983
- "loss": 0.0793,
984
- "step": 1110
985
- },
986
- {
987
- "epoch": 0.2827568795758647,
988
- "grad_norm": 1.3583918809890747,
989
- "learning_rate": 2.6640000000000002e-05,
990
- "loss": 0.0808,
991
- "step": 1120
992
- },
993
- {
994
- "epoch": 0.28528149457207774,
995
- "grad_norm": 1.1851533651351929,
996
- "learning_rate": 2.661e-05,
997
- "loss": 0.0738,
998
- "step": 1130
999
- },
1000
- {
1001
- "epoch": 0.28780610956829084,
1002
- "grad_norm": 1.4497005939483643,
1003
- "learning_rate": 2.658e-05,
1004
- "loss": 0.078,
1005
- "step": 1140
1006
- },
1007
- {
1008
- "epoch": 0.29033072456450393,
1009
- "grad_norm": 1.4407027959823608,
1010
- "learning_rate": 2.655e-05,
1011
- "loss": 0.0821,
1012
- "step": 1150
1013
- },
1014
- {
1015
- "epoch": 0.29033072456450393,
1016
- "eval_f1": 0.6031633616619453,
1017
- "eval_loss": 0.059650588780641556,
1018
- "eval_runtime": 962.0892,
1019
- "eval_samples_per_second": 214.39,
1020
- "eval_steps_per_second": 3.35,
1021
- "step": 1150
1022
- },
1023
- {
1024
- "epoch": 0.29285533956071697,
1025
- "grad_norm": 1.0721668004989624,
1026
- "learning_rate": 2.652e-05,
1027
- "loss": 0.0706,
1028
- "step": 1160
1029
- },
1030
- {
1031
- "epoch": 0.29537995455693006,
1032
- "grad_norm": 1.1033729314804077,
1033
- "learning_rate": 2.6490000000000002e-05,
1034
- "loss": 0.0737,
1035
- "step": 1170
1036
- },
1037
- {
1038
- "epoch": 0.29790456955314315,
1039
- "grad_norm": 0.9764577746391296,
1040
- "learning_rate": 2.646e-05,
1041
- "loss": 0.0743,
1042
- "step": 1180
1043
- },
1044
- {
1045
- "epoch": 0.30042918454935624,
1046
- "grad_norm": 1.2160297632217407,
1047
- "learning_rate": 2.643e-05,
1048
- "loss": 0.0768,
1049
- "step": 1190
1050
- },
1051
- {
1052
- "epoch": 0.3029537995455693,
1053
- "grad_norm": 0.8387085795402527,
1054
- "learning_rate": 2.64e-05,
1055
- "loss": 0.0715,
1056
- "step": 1200
1057
- },
1058
- {
1059
- "epoch": 0.3029537995455693,
1060
- "eval_f1": NaN,
1061
- "eval_loss": 0.06428094953298569,
1062
- "eval_runtime": 961.1037,
1063
- "eval_samples_per_second": 214.61,
1064
- "eval_steps_per_second": 3.353,
1065
- "step": 1200
1066
- },
1067
- {
1068
- "epoch": 0.3054784145417824,
1069
- "grad_norm": 1.061087727546692,
1070
- "learning_rate": 2.637e-05,
1071
- "loss": 0.0672,
1072
- "step": 1210
1073
- },
1074
- {
1075
- "epoch": 0.30800302953799547,
1076
- "grad_norm": 0.6768150925636292,
1077
- "learning_rate": 2.6340000000000002e-05,
1078
- "loss": 0.0762,
1079
- "step": 1220
1080
- },
1081
- {
1082
- "epoch": 0.31052764453420856,
1083
- "grad_norm": 0.7020296454429626,
1084
- "learning_rate": 2.631e-05,
1085
- "loss": 0.0838,
1086
- "step": 1230
1087
- },
1088
- {
1089
- "epoch": 0.3130522595304216,
1090
- "grad_norm": 0.9264736175537109,
1091
- "learning_rate": 2.628e-05,
1092
- "loss": 0.0769,
1093
- "step": 1240
1094
- },
1095
- {
1096
- "epoch": 0.3155768745266347,
1097
- "grad_norm": 0.657778799533844,
1098
- "learning_rate": 2.625e-05,
1099
- "loss": 0.085,
1100
- "step": 1250
1101
- },
1102
- {
1103
- "epoch": 0.3155768745266347,
1104
- "eval_f1": 0.6054250016184373,
1105
- "eval_loss": 0.06593530625104904,
1106
- "eval_runtime": 970.0262,
1107
- "eval_samples_per_second": 212.635,
1108
- "eval_steps_per_second": 3.323,
1109
- "step": 1250
1110
- },
1111
- {
1112
- "epoch": 0.3181014895228478,
1113
- "grad_norm": 0.6904731392860413,
1114
- "learning_rate": 2.622e-05,
1115
- "loss": 0.0736,
1116
- "step": 1260
1117
- },
1118
- {
1119
- "epoch": 0.3206261045190608,
1120
- "grad_norm": 1.4745820760726929,
1121
- "learning_rate": 2.619e-05,
1122
- "loss": 0.0832,
1123
- "step": 1270
1124
- },
1125
- {
1126
- "epoch": 0.3231507195152739,
1127
- "grad_norm": 1.0614553689956665,
1128
- "learning_rate": 2.616e-05,
1129
- "loss": 0.0781,
1130
- "step": 1280
1131
- },
1132
- {
1133
- "epoch": 0.325675334511487,
1134
- "grad_norm": 1.2228913307189941,
1135
- "learning_rate": 2.6130000000000002e-05,
1136
- "loss": 0.0872,
1137
- "step": 1290
1138
- },
1139
- {
1140
- "epoch": 0.3281999495077001,
1141
- "grad_norm": 0.9905760288238525,
1142
- "learning_rate": 2.61e-05,
1143
- "loss": 0.0826,
1144
- "step": 1300
1145
- },
1146
- {
1147
- "epoch": 0.3281999495077001,
1148
- "eval_f1": 0.6011740745177908,
1149
- "eval_loss": 0.05560224503278732,
1150
- "eval_runtime": 964.1962,
1151
- "eval_samples_per_second": 213.921,
1152
- "eval_steps_per_second": 3.343,
1153
- "step": 1300
1154
- },
1155
- {
1156
- "epoch": 0.33072456450391313,
1157
- "grad_norm": 1.1195616722106934,
1158
- "learning_rate": 2.607e-05,
1159
- "loss": 0.0751,
1160
- "step": 1310
1161
- },
1162
- {
1163
- "epoch": 0.3332491795001262,
1164
- "grad_norm": 0.9830445647239685,
1165
- "learning_rate": 2.604e-05,
1166
- "loss": 0.0694,
1167
- "step": 1320
1168
- },
1169
- {
1170
- "epoch": 0.3357737944963393,
1171
- "grad_norm": 1.7140698432922363,
1172
- "learning_rate": 2.601e-05,
1173
- "loss": 0.0694,
1174
- "step": 1330
1175
- },
1176
- {
1177
- "epoch": 0.3382984094925524,
1178
- "grad_norm": 0.9545607566833496,
1179
- "learning_rate": 2.5980000000000002e-05,
1180
- "loss": 0.0626,
1181
- "step": 1340
1182
- },
1183
- {
1184
- "epoch": 0.34082302448876545,
1185
- "grad_norm": 0.8236456513404846,
1186
- "learning_rate": 2.595e-05,
1187
- "loss": 0.064,
1188
- "step": 1350
1189
- },
1190
- {
1191
- "epoch": 0.34082302448876545,
1192
- "eval_f1": NaN,
1193
- "eval_loss": 0.0564185306429863,
1194
- "eval_runtime": 1030.8018,
1195
- "eval_samples_per_second": 200.099,
1196
- "eval_steps_per_second": 3.127,
1197
- "step": 1350
1198
- },
1199
- {
1200
- "epoch": 0.34334763948497854,
1201
- "grad_norm": 1.0344712734222412,
1202
- "learning_rate": 2.592e-05,
1203
- "loss": 0.074,
1204
- "step": 1360
1205
- },
1206
- {
1207
- "epoch": 0.34587225448119163,
1208
- "grad_norm": 1.647894024848938,
1209
- "learning_rate": 2.589e-05,
1210
- "loss": 0.0756,
1211
- "step": 1370
1212
- },
1213
- {
1214
- "epoch": 0.34839686947740467,
1215
- "grad_norm": 1.0268642902374268,
1216
- "learning_rate": 2.586e-05,
1217
- "loss": 0.064,
1218
- "step": 1380
1219
- },
1220
- {
1221
- "epoch": 0.35092148447361776,
1222
- "grad_norm": 0.6588199734687805,
1223
- "learning_rate": 2.5830000000000002e-05,
1224
- "loss": 0.0685,
1225
- "step": 1390
1226
- },
1227
- {
1228
- "epoch": 0.35344609946983085,
1229
- "grad_norm": 0.8278918862342834,
1230
- "learning_rate": 2.58e-05,
1231
- "loss": 0.0854,
1232
- "step": 1400
1233
- },
1234
- {
1235
- "epoch": 0.35344609946983085,
1236
- "eval_f1": NaN,
1237
- "eval_loss": 0.05516933649778366,
1238
- "eval_runtime": 1032.3177,
1239
- "eval_samples_per_second": 199.805,
1240
- "eval_steps_per_second": 3.122,
1241
- "step": 1400
1242
- },
1243
- {
1244
- "epoch": 0.35597071446604395,
1245
- "grad_norm": 0.4216013550758362,
1246
- "learning_rate": 2.577e-05,
1247
- "loss": 0.0785,
1248
- "step": 1410
1249
- },
1250
- {
1251
- "epoch": 0.358495329462257,
1252
- "grad_norm": 0.9567118287086487,
1253
- "learning_rate": 2.574e-05,
1254
- "loss": 0.089,
1255
- "step": 1420
1256
- },
1257
- {
1258
- "epoch": 0.3610199444584701,
1259
- "grad_norm": 1.3202637434005737,
1260
- "learning_rate": 2.571e-05,
1261
- "loss": 0.0884,
1262
- "step": 1430
1263
- },
1264
- {
1265
- "epoch": 0.36354455945468317,
1266
- "grad_norm": 1.3245704174041748,
1267
- "learning_rate": 2.568e-05,
1268
- "loss": 0.0739,
1269
- "step": 1440
1270
- },
1271
- {
1272
- "epoch": 0.36606917445089626,
1273
- "grad_norm": 0.6416196823120117,
1274
- "learning_rate": 2.565e-05,
1275
- "loss": 0.0702,
1276
- "step": 1450
1277
- },
1278
- {
1279
- "epoch": 0.36606917445089626,
1280
- "eval_f1": 0.6061020319393525,
1281
- "eval_loss": 0.06748606264591217,
1282
- "eval_runtime": 999.3826,
1283
- "eval_samples_per_second": 206.389,
1284
- "eval_steps_per_second": 3.225,
1285
- "step": 1450
1286
- },
1287
- {
1288
- "epoch": 0.3685937894471093,
1289
- "grad_norm": 0.9312785267829895,
1290
- "learning_rate": 2.562e-05,
1291
- "loss": 0.0674,
1292
- "step": 1460
1293
- },
1294
- {
1295
- "epoch": 0.3711184044433224,
1296
- "grad_norm": 0.9092572927474976,
1297
- "learning_rate": 2.559e-05,
1298
- "loss": 0.0676,
1299
- "step": 1470
1300
- },
1301
- {
1302
- "epoch": 0.3736430194395355,
1303
- "grad_norm": 1.4935100078582764,
1304
- "learning_rate": 2.556e-05,
1305
- "loss": 0.0712,
1306
- "step": 1480
1307
- },
1308
- {
1309
- "epoch": 0.3761676344357485,
1310
- "grad_norm": 0.9569060802459717,
1311
- "learning_rate": 2.553e-05,
1312
- "loss": 0.0747,
1313
- "step": 1490
1314
- },
1315
- {
1316
- "epoch": 0.3786922494319616,
1317
- "grad_norm": 0.947384774684906,
1318
- "learning_rate": 2.55e-05,
1319
- "loss": 0.0771,
1320
- "step": 1500
1321
- },
1322
- {
1323
- "epoch": 0.3786922494319616,
1324
- "eval_f1": NaN,
1325
- "eval_loss": 0.057753585278987885,
1326
- "eval_runtime": 1000.0105,
1327
- "eval_samples_per_second": 206.26,
1328
- "eval_steps_per_second": 3.223,
1329
- "step": 1500
1330
- },
1331
- {
1332
- "epoch": 0.3812168644281747,
1333
- "grad_norm": 0.6996080875396729,
1334
- "learning_rate": 2.547e-05,
1335
- "loss": 0.0696,
1336
- "step": 1510
1337
- },
1338
- {
1339
- "epoch": 0.3837414794243878,
1340
- "grad_norm": 0.5415595173835754,
1341
- "learning_rate": 2.544e-05,
1342
- "loss": 0.0757,
1343
- "step": 1520
1344
- },
1345
- {
1346
- "epoch": 0.38626609442060084,
1347
- "grad_norm": 0.5137012600898743,
1348
- "learning_rate": 2.541e-05,
1349
- "loss": 0.0621,
1350
- "step": 1530
1351
- },
1352
- {
1353
- "epoch": 0.38879070941681393,
1354
- "grad_norm": 0.9606865048408508,
1355
- "learning_rate": 2.538e-05,
1356
- "loss": 0.073,
1357
- "step": 1540
1358
- },
1359
- {
1360
- "epoch": 0.391315324413027,
1361
- "grad_norm": 1.1751604080200195,
1362
- "learning_rate": 2.535e-05,
1363
- "loss": 0.08,
1364
- "step": 1550
1365
- },
1366
- {
1367
- "epoch": 0.391315324413027,
1368
- "eval_f1": NaN,
1369
- "eval_loss": 0.0491572804749012,
1370
- "eval_runtime": 963.3777,
1371
- "eval_samples_per_second": 214.103,
1372
- "eval_steps_per_second": 3.346,
1373
- "step": 1550
1374
- },
1375
- {
1376
- "epoch": 0.3938399394092401,
1377
- "grad_norm": 0.935338020324707,
1378
- "learning_rate": 2.5319999999999998e-05,
1379
- "loss": 0.0729,
1380
- "step": 1560
1381
- },
1382
- {
1383
- "epoch": 0.39636455440545315,
1384
- "grad_norm": 0.7157814502716064,
1385
- "learning_rate": 2.529e-05,
1386
- "loss": 0.0719,
1387
- "step": 1570
1388
- },
1389
- {
1390
- "epoch": 0.39888916940166624,
1391
- "grad_norm": 0.6739543676376343,
1392
- "learning_rate": 2.526e-05,
1393
- "loss": 0.0631,
1394
- "step": 1580
1395
- },
1396
- {
1397
- "epoch": 0.40141378439787934,
1398
- "grad_norm": 0.4896785318851471,
1399
- "learning_rate": 2.523e-05,
1400
- "loss": 0.0746,
1401
- "step": 1590
1402
- },
1403
- {
1404
- "epoch": 0.4039383993940924,
1405
- "grad_norm": 0.7619987726211548,
1406
- "learning_rate": 2.52e-05,
1407
- "loss": 0.0804,
1408
- "step": 1600
1409
- },
1410
- {
1411
- "epoch": 0.4039383993940924,
1412
- "eval_f1": 0.6111605289687482,
1413
- "eval_loss": 0.05378127843141556,
1414
- "eval_runtime": 958.466,
1415
- "eval_samples_per_second": 215.2,
1416
- "eval_steps_per_second": 3.363,
1417
- "step": 1600
1418
- },
1419
- {
1420
- "epoch": 0.40646301439030547,
1421
- "grad_norm": 0.7464210987091064,
1422
- "learning_rate": 2.517e-05,
1423
- "loss": 0.0707,
1424
- "step": 1610
1425
- },
1426
- {
1427
- "epoch": 0.40898762938651856,
1428
- "grad_norm": 0.6707102656364441,
1429
- "learning_rate": 2.514e-05,
1430
- "loss": 0.0671,
1431
- "step": 1620
1432
- },
1433
- {
1434
- "epoch": 0.41151224438273165,
1435
- "grad_norm": 1.246846079826355,
1436
- "learning_rate": 2.511e-05,
1437
- "loss": 0.0627,
1438
- "step": 1630
1439
- },
1440
- {
1441
- "epoch": 0.4140368593789447,
1442
- "grad_norm": 0.9796457886695862,
1443
- "learning_rate": 2.508e-05,
1444
- "loss": 0.0677,
1445
- "step": 1640
1446
- },
1447
- {
1448
- "epoch": 0.4165614743751578,
1449
- "grad_norm": 0.9717236161231995,
1450
- "learning_rate": 2.505e-05,
1451
- "loss": 0.083,
1452
- "step": 1650
1453
- },
1454
- {
1455
- "epoch": 0.4165614743751578,
1456
- "eval_f1": 0.6047686163965234,
1457
- "eval_loss": 0.057900335639715195,
1458
- "eval_runtime": 962.7843,
1459
- "eval_samples_per_second": 214.235,
1460
- "eval_steps_per_second": 3.348,
1461
- "step": 1650
1462
- },
1463
- {
1464
- "epoch": 0.4190860893713709,
1465
- "grad_norm": 1.1706446409225464,
1466
- "learning_rate": 2.502e-05,
1467
- "loss": 0.0764,
1468
- "step": 1660
1469
- },
1470
- {
1471
- "epoch": 0.42161070436758397,
1472
- "grad_norm": 0.45280393958091736,
1473
- "learning_rate": 2.499e-05,
1474
- "loss": 0.0682,
1475
- "step": 1670
1476
- },
1477
- {
1478
- "epoch": 0.424135319363797,
1479
- "grad_norm": 1.0100760459899902,
1480
- "learning_rate": 2.4959999999999998e-05,
1481
- "loss": 0.0892,
1482
- "step": 1680
1483
- },
1484
- {
1485
- "epoch": 0.4266599343600101,
1486
- "grad_norm": 1.0506736040115356,
1487
- "learning_rate": 2.493e-05,
1488
- "loss": 0.0666,
1489
- "step": 1690
1490
- },
1491
- {
1492
- "epoch": 0.4291845493562232,
1493
- "grad_norm": 0.7978639006614685,
1494
- "learning_rate": 2.49e-05,
1495
- "loss": 0.0701,
1496
- "step": 1700
1497
- },
1498
- {
1499
- "epoch": 0.4291845493562232,
1500
- "eval_f1": 0.6044656147662996,
1501
- "eval_loss": 0.06738731265068054,
1502
- "eval_runtime": 1102.7864,
1503
- "eval_samples_per_second": 187.037,
1504
- "eval_steps_per_second": 2.923,
1505
- "step": 1700
1506
- },
1507
- {
1508
- "epoch": 0.4317091643524363,
1509
- "grad_norm": 1.121317982673645,
1510
- "learning_rate": 2.487e-05,
1511
- "loss": 0.0771,
1512
- "step": 1710
1513
- },
1514
- {
1515
- "epoch": 0.4342337793486493,
1516
- "grad_norm": 1.0836131572723389,
1517
- "learning_rate": 2.484e-05,
1518
- "loss": 0.0719,
1519
- "step": 1720
1520
- },
1521
- {
1522
- "epoch": 0.4367583943448624,
1523
- "grad_norm": 0.61658775806427,
1524
- "learning_rate": 2.4809999999999998e-05,
1525
- "loss": 0.0681,
1526
- "step": 1730
1527
- },
1528
- {
1529
- "epoch": 0.4392830093410755,
1530
- "grad_norm": 0.647393524646759,
1531
- "learning_rate": 2.478e-05,
1532
- "loss": 0.0668,
1533
- "step": 1740
1534
- },
1535
- {
1536
- "epoch": 0.44180762433728854,
1537
- "grad_norm": 0.782483696937561,
1538
- "learning_rate": 2.475e-05,
1539
- "loss": 0.0721,
1540
- "step": 1750
1541
- },
1542
- {
1543
- "epoch": 0.44180762433728854,
1544
- "eval_f1": 0.5979155238617663,
1545
- "eval_loss": 0.04912808537483215,
1546
- "eval_runtime": 1171.9033,
1547
- "eval_samples_per_second": 176.006,
1548
- "eval_steps_per_second": 2.75,
1549
- "step": 1750
1550
- },
1551
- {
1552
- "epoch": 0.44433223933350163,
1553
- "grad_norm": 0.4538789987564087,
1554
- "learning_rate": 2.472e-05,
1555
- "loss": 0.0641,
1556
- "step": 1760
1557
- },
1558
- {
1559
- "epoch": 0.4468568543297147,
1560
- "grad_norm": 0.7954159379005432,
1561
- "learning_rate": 2.469e-05,
1562
- "loss": 0.079,
1563
- "step": 1770
1564
- },
1565
- {
1566
- "epoch": 0.4493814693259278,
1567
- "grad_norm": 0.4370203912258148,
1568
- "learning_rate": 2.4659999999999998e-05,
1569
- "loss": 0.0769,
1570
- "step": 1780
1571
- },
1572
- {
1573
- "epoch": 0.45190608432214086,
1574
- "grad_norm": 1.2641068696975708,
1575
- "learning_rate": 2.463e-05,
1576
- "loss": 0.0649,
1577
- "step": 1790
1578
  },
1579
  {
1580
- "epoch": 0.45443069931835395,
1581
- "grad_norm": 1.262468695640564,
1582
- "learning_rate": 2.4599999999999998e-05,
1583
- "loss": 0.0765,
1584
- "step": 1800
1585
  },
1586
  {
1587
- "epoch": 0.45443069931835395,
1588
  "eval_f1": NaN,
1589
- "eval_loss": 0.04386861249804497,
1590
- "eval_runtime": 1166.6982,
1591
- "eval_samples_per_second": 176.791,
1592
- "eval_steps_per_second": 2.762,
1593
- "step": 1800
1594
- },
1595
- {
1596
- "epoch": 0.45695531431456704,
1597
- "grad_norm": 1.0922938585281372,
1598
- "learning_rate": 2.457e-05,
1599
- "loss": 0.074,
1600
- "step": 1810
1601
- },
1602
- {
1603
- "epoch": 0.45947992931078013,
1604
- "grad_norm": 0.8298421502113342,
1605
- "learning_rate": 2.454e-05,
1606
- "loss": 0.0778,
1607
- "step": 1820
1608
- },
1609
- {
1610
- "epoch": 0.46200454430699317,
1611
- "grad_norm": 1.182712435722351,
1612
- "learning_rate": 2.4509999999999997e-05,
1613
- "loss": 0.0793,
1614
- "step": 1830
1615
- },
1616
- {
1617
- "epoch": 0.46452915930320626,
1618
- "grad_norm": 0.7366443276405334,
1619
- "learning_rate": 2.448e-05,
1620
- "loss": 0.0655,
1621
- "step": 1840
1622
- },
1623
- {
1624
- "epoch": 0.46705377429941936,
1625
- "grad_norm": 0.9185643792152405,
1626
- "learning_rate": 2.4449999999999998e-05,
1627
- "loss": 0.0692,
1628
- "step": 1850
1629
- },
1630
- {
1631
- "epoch": 0.46705377429941936,
1632
- "eval_f1": 0.6057632592224568,
1633
- "eval_loss": 0.04681675508618355,
1634
- "eval_runtime": 1170.8075,
1635
- "eval_samples_per_second": 176.171,
1636
- "eval_steps_per_second": 2.753,
1637
- "step": 1850
1638
- },
1639
- {
1640
- "epoch": 0.4695783892956324,
1641
- "grad_norm": 0.99897301197052,
1642
- "learning_rate": 2.442e-05,
1643
- "loss": 0.0685,
1644
- "step": 1860
1645
- },
1646
- {
1647
- "epoch": 0.4721030042918455,
1648
- "grad_norm": 1.0028034448623657,
1649
- "learning_rate": 2.439e-05,
1650
- "loss": 0.0748,
1651
- "step": 1870
1652
- },
1653
- {
1654
- "epoch": 0.4746276192880586,
1655
- "grad_norm": 2.5226945877075195,
1656
- "learning_rate": 2.4360000000000004e-05,
1657
- "loss": 0.0715,
1658
- "step": 1880
1659
- },
1660
- {
1661
- "epoch": 0.47715223428427167,
1662
- "grad_norm": 0.903256893157959,
1663
- "learning_rate": 2.4330000000000003e-05,
1664
- "loss": 0.0709,
1665
- "step": 1890
1666
- },
1667
- {
1668
- "epoch": 0.4796768492804847,
1669
- "grad_norm": 0.9269793629646301,
1670
- "learning_rate": 2.43e-05,
1671
- "loss": 0.0761,
1672
- "step": 1900
1673
- },
1674
- {
1675
- "epoch": 0.4796768492804847,
1676
- "eval_f1": 0.6124984470120511,
1677
- "eval_loss": 0.05741230770945549,
1678
- "eval_runtime": 1159.3714,
1679
- "eval_samples_per_second": 177.908,
1680
- "eval_steps_per_second": 2.78,
1681
- "step": 1900
1682
- },
1683
- {
1684
- "epoch": 0.4822014642766978,
1685
- "grad_norm": 1.0651170015335083,
1686
- "learning_rate": 2.4270000000000003e-05,
1687
- "loss": 0.0751,
1688
- "step": 1910
1689
- },
1690
- {
1691
- "epoch": 0.4847260792729109,
1692
- "grad_norm": 1.2628437280654907,
1693
- "learning_rate": 2.4240000000000002e-05,
1694
- "loss": 0.0852,
1695
- "step": 1920
1696
- },
1697
- {
1698
- "epoch": 0.487250694269124,
1699
- "grad_norm": 1.3889621496200562,
1700
- "learning_rate": 2.4210000000000004e-05,
1701
- "loss": 0.073,
1702
- "step": 1930
1703
- },
1704
- {
1705
- "epoch": 0.489775309265337,
1706
- "grad_norm": 1.028456687927246,
1707
- "learning_rate": 2.4180000000000002e-05,
1708
- "loss": 0.0644,
1709
- "step": 1940
1710
- },
1711
- {
1712
- "epoch": 0.4922999242615501,
1713
- "grad_norm": 0.6997565627098083,
1714
- "learning_rate": 2.415e-05,
1715
- "loss": 0.0757,
1716
- "step": 1950
1717
- },
1718
- {
1719
- "epoch": 0.4922999242615501,
1720
- "eval_f1": 0.6126181795711549,
1721
- "eval_loss": 0.05692484602332115,
1722
- "eval_runtime": 1161.9825,
1723
- "eval_samples_per_second": 177.509,
1724
- "eval_steps_per_second": 2.774,
1725
- "step": 1950
1726
- },
1727
- {
1728
- "epoch": 0.4948245392577632,
1729
- "grad_norm": 1.384186863899231,
1730
- "learning_rate": 2.4120000000000003e-05,
1731
- "loss": 0.0697,
1732
- "step": 1960
1733
- },
1734
- {
1735
- "epoch": 0.49734915425397624,
1736
- "grad_norm": 0.8674394488334656,
1737
- "learning_rate": 2.409e-05,
1738
- "loss": 0.0739,
1739
- "step": 1970
1740
- },
1741
- {
1742
- "epoch": 0.49987376925018934,
1743
- "grad_norm": 1.826121211051941,
1744
- "learning_rate": 2.4060000000000003e-05,
1745
- "loss": 0.0739,
1746
- "step": 1980
1747
- },
1748
- {
1749
- "epoch": 0.5023983842464024,
1750
- "grad_norm": 0.6903666257858276,
1751
- "learning_rate": 2.4030000000000002e-05,
1752
- "loss": 0.0661,
1753
- "step": 1990
1754
  },
1755
  {
1756
- "epoch": 0.5049229992426155,
1757
- "grad_norm": 0.7339742183685303,
1758
- "learning_rate": 2.4e-05,
1759
- "loss": 0.0654,
1760
- "step": 2000
 
 
1761
  },
1762
  {
1763
- "epoch": 0.5049229992426155,
1764
- "eval_f1": 0.6095153739086423,
1765
- "eval_loss": 0.05489746853709221,
1766
- "eval_runtime": 1168.0449,
1767
- "eval_samples_per_second": 176.587,
1768
- "eval_steps_per_second": 2.759,
1769
  "step": 2000
1770
  },
1771
  {
1772
- "epoch": 0.5074476142388286,
1773
- "grad_norm": 0.7863900065422058,
1774
- "learning_rate": 2.3970000000000003e-05,
1775
- "loss": 0.061,
1776
- "step": 2010
1777
- },
1778
- {
1779
- "epoch": 0.5099722292350417,
1780
- "grad_norm": 1.0800750255584717,
1781
- "learning_rate": 2.394e-05,
1782
- "loss": 0.0781,
1783
- "step": 2020
1784
- },
1785
- {
1786
- "epoch": 0.5124968442312547,
1787
- "grad_norm": 1.0992929935455322,
1788
- "learning_rate": 2.3910000000000003e-05,
1789
- "loss": 0.0694,
1790
- "step": 2030
1791
- },
1792
- {
1793
- "epoch": 0.5150214592274678,
1794
- "grad_norm": 0.703554093837738,
1795
- "learning_rate": 2.3880000000000002e-05,
1796
- "loss": 0.0881,
1797
- "step": 2040
1798
- },
1799
- {
1800
- "epoch": 0.5175460742236809,
1801
- "grad_norm": 1.214089274406433,
1802
- "learning_rate": 2.385e-05,
1803
- "loss": 0.0736,
1804
- "step": 2050
1805
- },
1806
- {
1807
- "epoch": 0.5175460742236809,
1808
- "eval_f1": 0.612187690432663,
1809
- "eval_loss": 0.05384594947099686,
1810
- "eval_runtime": 1155.7771,
1811
- "eval_samples_per_second": 178.462,
1812
- "eval_steps_per_second": 2.789,
1813
- "step": 2050
1814
- },
1815
- {
1816
- "epoch": 0.520070689219894,
1817
- "grad_norm": 0.8359307050704956,
1818
- "learning_rate": 2.3820000000000002e-05,
1819
- "loss": 0.0759,
1820
- "step": 2060
1821
- },
1822
- {
1823
- "epoch": 0.522595304216107,
1824
- "grad_norm": 1.6299511194229126,
1825
- "learning_rate": 2.379e-05,
1826
- "loss": 0.076,
1827
- "step": 2070
1828
- },
1829
- {
1830
- "epoch": 0.5251199192123202,
1831
- "grad_norm": 0.6880617737770081,
1832
- "learning_rate": 2.3760000000000003e-05,
1833
- "loss": 0.0745,
1834
- "step": 2080
1835
- },
1836
- {
1837
- "epoch": 0.5276445342085332,
1838
- "grad_norm": 0.7822777032852173,
1839
- "learning_rate": 2.373e-05,
1840
- "loss": 0.0697,
1841
- "step": 2090
1842
- },
1843
- {
1844
- "epoch": 0.5301691492047462,
1845
- "grad_norm": 0.7941886782646179,
1846
- "learning_rate": 2.37e-05,
1847
- "loss": 0.0685,
1848
- "step": 2100
1849
- },
1850
- {
1851
- "epoch": 0.5301691492047462,
1852
- "eval_f1": 0.6104315862855695,
1853
- "eval_loss": 0.04854836314916611,
1854
- "eval_runtime": 1154.0649,
1855
- "eval_samples_per_second": 178.727,
1856
- "eval_steps_per_second": 2.793,
1857
- "step": 2100
1858
- },
1859
- {
1860
- "epoch": 0.5326937642009594,
1861
- "grad_norm": 0.948130190372467,
1862
- "learning_rate": 2.3670000000000002e-05,
1863
- "loss": 0.0706,
1864
- "step": 2110
1865
- },
1866
- {
1867
- "epoch": 0.5352183791971724,
1868
- "grad_norm": 0.959032416343689,
1869
- "learning_rate": 2.364e-05,
1870
- "loss": 0.0684,
1871
- "step": 2120
1872
- },
1873
- {
1874
- "epoch": 0.5377429941933856,
1875
- "grad_norm": 1.1859666109085083,
1876
- "learning_rate": 2.3610000000000003e-05,
1877
- "loss": 0.0757,
1878
- "step": 2130
1879
- },
1880
- {
1881
- "epoch": 0.5402676091895986,
1882
- "grad_norm": 0.9001142978668213,
1883
- "learning_rate": 2.358e-05,
1884
- "loss": 0.079,
1885
- "step": 2140
1886
- },
1887
- {
1888
- "epoch": 0.5427922241858116,
1889
- "grad_norm": 0.47399717569351196,
1890
- "learning_rate": 2.3550000000000003e-05,
1891
- "loss": 0.0726,
1892
- "step": 2150
1893
- },
1894
- {
1895
- "epoch": 0.5427922241858116,
1896
- "eval_f1": 0.611992731677771,
1897
- "eval_loss": 0.05662121623754501,
1898
- "eval_runtime": 1151.3771,
1899
- "eval_samples_per_second": 179.144,
1900
  "eval_steps_per_second": 2.799,
1901
- "step": 2150
1902
- },
1903
- {
1904
- "epoch": 0.5453168391820248,
1905
- "grad_norm": 0.6292353272438049,
1906
- "learning_rate": 2.3520000000000002e-05,
1907
- "loss": 0.0677,
1908
- "step": 2160
1909
- },
1910
- {
1911
- "epoch": 0.5478414541782378,
1912
- "grad_norm": 0.7090362906455994,
1913
- "learning_rate": 2.349e-05,
1914
- "loss": 0.0703,
1915
- "step": 2170
1916
- },
1917
- {
1918
- "epoch": 0.5503660691744509,
1919
- "grad_norm": 0.6082953810691833,
1920
- "learning_rate": 2.3460000000000002e-05,
1921
- "loss": 0.0672,
1922
- "step": 2180
1923
- },
1924
- {
1925
- "epoch": 0.552890684170664,
1926
- "grad_norm": 0.5937643051147461,
1927
- "learning_rate": 2.343e-05,
1928
- "loss": 0.0686,
1929
- "step": 2190
1930
- },
1931
- {
1932
- "epoch": 0.555415299166877,
1933
- "grad_norm": 0.7394770979881287,
1934
- "learning_rate": 2.3400000000000003e-05,
1935
- "loss": 0.0731,
1936
- "step": 2200
1937
- },
1938
- {
1939
- "epoch": 0.555415299166877,
1940
- "eval_f1": 0.6111780293905084,
1941
- "eval_loss": 0.05852247402071953,
1942
- "eval_runtime": 1153.1003,
1943
- "eval_samples_per_second": 178.876,
1944
- "eval_steps_per_second": 2.795,
1945
- "step": 2200
1946
- },
1947
- {
1948
- "epoch": 0.5579399141630901,
1949
- "grad_norm": 0.7641323804855347,
1950
- "learning_rate": 2.337e-05,
1951
- "loss": 0.0732,
1952
- "step": 2210
1953
- },
1954
- {
1955
- "epoch": 0.5604645291593032,
1956
- "grad_norm": 0.8567935824394226,
1957
- "learning_rate": 2.334e-05,
1958
- "loss": 0.0599,
1959
- "step": 2220
1960
- },
1961
- {
1962
- "epoch": 0.5629891441555163,
1963
- "grad_norm": 0.9106941819190979,
1964
- "learning_rate": 2.3310000000000002e-05,
1965
- "loss": 0.0593,
1966
- "step": 2230
1967
- },
1968
- {
1969
- "epoch": 0.5655137591517294,
1970
- "grad_norm": 1.5944632291793823,
1971
- "learning_rate": 2.328e-05,
1972
- "loss": 0.0669,
1973
- "step": 2240
1974
- },
1975
- {
1976
- "epoch": 0.5680383741479424,
1977
- "grad_norm": 0.9120457768440247,
1978
- "learning_rate": 2.3250000000000003e-05,
1979
- "loss": 0.0722,
1980
- "step": 2250
1981
- },
1982
- {
1983
- "epoch": 0.5680383741479424,
1984
- "eval_f1": 0.6139676730710583,
1985
- "eval_loss": 0.05887339636683464,
1986
- "eval_runtime": 1155.9087,
1987
- "eval_samples_per_second": 178.441,
1988
- "eval_steps_per_second": 2.788,
1989
- "step": 2250
1990
- },
1991
- {
1992
- "epoch": 0.5705629891441555,
1993
- "grad_norm": 0.8505953550338745,
1994
- "learning_rate": 2.322e-05,
1995
- "loss": 0.0863,
1996
- "step": 2260
1997
- },
1998
- {
1999
- "epoch": 0.5730876041403686,
2000
- "grad_norm": 0.9573137164115906,
2001
- "learning_rate": 2.319e-05,
2002
- "loss": 0.0712,
2003
- "step": 2270
2004
- },
2005
- {
2006
- "epoch": 0.5756122191365817,
2007
- "grad_norm": 1.230735182762146,
2008
- "learning_rate": 2.3160000000000002e-05,
2009
- "loss": 0.0677,
2010
- "step": 2280
2011
- },
2012
- {
2013
- "epoch": 0.5781368341327947,
2014
- "grad_norm": 1.203621745109558,
2015
- "learning_rate": 2.313e-05,
2016
- "loss": 0.0634,
2017
- "step": 2290
2018
- },
2019
- {
2020
- "epoch": 0.5806614491290079,
2021
- "grad_norm": 1.3590195178985596,
2022
- "learning_rate": 2.3100000000000002e-05,
2023
- "loss": 0.0819,
2024
- "step": 2300
2025
- },
2026
- {
2027
- "epoch": 0.5806614491290079,
2028
- "eval_f1": 0.6121980676328502,
2029
- "eval_loss": 0.050494007766246796,
2030
- "eval_runtime": 1153.6589,
2031
- "eval_samples_per_second": 178.789,
2032
- "eval_steps_per_second": 2.794,
2033
- "step": 2300
2034
- },
2035
- {
2036
- "epoch": 0.5831860641252209,
2037
- "grad_norm": 0.8538402318954468,
2038
- "learning_rate": 2.307e-05,
2039
- "loss": 0.0674,
2040
- "step": 2310
2041
- },
2042
- {
2043
- "epoch": 0.5857106791214339,
2044
- "grad_norm": 1.1863012313842773,
2045
- "learning_rate": 2.304e-05,
2046
- "loss": 0.0665,
2047
- "step": 2320
2048
- },
2049
- {
2050
- "epoch": 0.5882352941176471,
2051
- "grad_norm": 1.0120714902877808,
2052
- "learning_rate": 2.301e-05,
2053
- "loss": 0.0675,
2054
- "step": 2330
2055
- },
2056
- {
2057
- "epoch": 0.5907599091138601,
2058
- "grad_norm": 0.8394482135772705,
2059
- "learning_rate": 2.298e-05,
2060
- "loss": 0.0812,
2061
- "step": 2340
2062
- },
2063
- {
2064
- "epoch": 0.5932845241100733,
2065
- "grad_norm": 0.8855767250061035,
2066
- "learning_rate": 2.2950000000000002e-05,
2067
- "loss": 0.0694,
2068
- "step": 2350
2069
- },
2070
- {
2071
- "epoch": 0.5932845241100733,
2072
- "eval_f1": 0.6101251634597422,
2073
- "eval_loss": 0.053731031715869904,
2074
- "eval_runtime": 1147.8424,
2075
- "eval_samples_per_second": 179.695,
2076
- "eval_steps_per_second": 2.808,
2077
- "step": 2350
2078
- },
2079
- {
2080
- "epoch": 0.5958091391062863,
2081
- "grad_norm": 1.241045594215393,
2082
- "learning_rate": 2.292e-05,
2083
- "loss": 0.0646,
2084
- "step": 2360
2085
- },
2086
- {
2087
- "epoch": 0.5983337541024993,
2088
- "grad_norm": 2.065401315689087,
2089
- "learning_rate": 2.289e-05,
2090
- "loss": 0.0792,
2091
- "step": 2370
2092
- },
2093
- {
2094
- "epoch": 0.6008583690987125,
2095
- "grad_norm": 1.0024877786636353,
2096
- "learning_rate": 2.286e-05,
2097
- "loss": 0.0751,
2098
- "step": 2380
2099
- },
2100
- {
2101
- "epoch": 0.6033829840949255,
2102
- "grad_norm": 0.4943256080150604,
2103
- "learning_rate": 2.283e-05,
2104
- "loss": 0.076,
2105
- "step": 2390
2106
- },
2107
- {
2108
- "epoch": 0.6059075990911386,
2109
- "grad_norm": 1.0907814502716064,
2110
- "learning_rate": 2.2800000000000002e-05,
2111
- "loss": 0.0705,
2112
- "step": 2400
2113
- },
2114
- {
2115
- "epoch": 0.6059075990911386,
2116
- "eval_f1": 0.6130196664177247,
2117
- "eval_loss": 0.06461644172668457,
2118
- "eval_runtime": 1149.8253,
2119
- "eval_samples_per_second": 179.386,
2120
- "eval_steps_per_second": 2.803,
2121
- "step": 2400
2122
- },
2123
- {
2124
- "epoch": 0.6084322140873517,
2125
- "grad_norm": 1.1304162740707397,
2126
- "learning_rate": 2.277e-05,
2127
- "loss": 0.0548,
2128
- "step": 2410
2129
- },
2130
- {
2131
- "epoch": 0.6109568290835647,
2132
- "grad_norm": 1.3394097089767456,
2133
- "learning_rate": 2.274e-05,
2134
- "loss": 0.0607,
2135
- "step": 2420
2136
- },
2137
- {
2138
- "epoch": 0.6134814440797778,
2139
- "grad_norm": 0.5467960834503174,
2140
- "learning_rate": 2.271e-05,
2141
- "loss": 0.0701,
2142
- "step": 2430
2143
- },
2144
- {
2145
- "epoch": 0.6160060590759909,
2146
- "grad_norm": 0.5510517954826355,
2147
- "learning_rate": 2.268e-05,
2148
- "loss": 0.0725,
2149
- "step": 2440
2150
- },
2151
- {
2152
- "epoch": 0.618530674072204,
2153
- "grad_norm": 0.7682734131813049,
2154
- "learning_rate": 2.265e-05,
2155
- "loss": 0.0702,
2156
- "step": 2450
2157
- },
2158
- {
2159
- "epoch": 0.618530674072204,
2160
- "eval_f1": 0.6124447065762312,
2161
- "eval_loss": 0.046234920620918274,
2162
- "eval_runtime": 1146.4615,
2163
- "eval_samples_per_second": 179.912,
2164
- "eval_steps_per_second": 2.811,
2165
- "step": 2450
2166
- },
2167
- {
2168
- "epoch": 0.6210552890684171,
2169
- "grad_norm": 0.7578818798065186,
2170
- "learning_rate": 2.262e-05,
2171
- "loss": 0.0703,
2172
- "step": 2460
2173
- },
2174
- {
2175
- "epoch": 0.6235799040646302,
2176
- "grad_norm": 0.7244108319282532,
2177
- "learning_rate": 2.2590000000000002e-05,
2178
- "loss": 0.0635,
2179
- "step": 2470
2180
- },
2181
- {
2182
- "epoch": 0.6261045190608432,
2183
- "grad_norm": 1.1047908067703247,
2184
- "learning_rate": 2.256e-05,
2185
- "loss": 0.0614,
2186
- "step": 2480
2187
- },
2188
- {
2189
- "epoch": 0.6286291340570563,
2190
- "grad_norm": 1.0824987888336182,
2191
- "learning_rate": 2.253e-05,
2192
- "loss": 0.081,
2193
- "step": 2490
2194
- },
2195
- {
2196
- "epoch": 0.6311537490532694,
2197
- "grad_norm": 1.9344598054885864,
2198
- "learning_rate": 2.25e-05,
2199
- "loss": 0.0709,
2200
- "step": 2500
2201
  },
2202
  {
2203
- "epoch": 0.6311537490532694,
2204
- "eval_f1": 0.6133951445650848,
2205
- "eval_loss": 0.04044894501566887,
2206
- "eval_runtime": 1148.0724,
2207
- "eval_samples_per_second": 179.659,
2208
- "eval_steps_per_second": 2.807,
2209
  "step": 2500
2210
  },
2211
  {
2212
- "epoch": 0.6336783640494824,
2213
- "grad_norm": 1.2797091007232666,
2214
- "learning_rate": 2.247e-05,
2215
- "loss": 0.072,
2216
- "step": 2510
2217
- },
2218
- {
2219
- "epoch": 0.6362029790456956,
2220
- "grad_norm": 0.7228933572769165,
2221
- "learning_rate": 2.2440000000000002e-05,
2222
- "loss": 0.071,
2223
- "step": 2520
2224
- },
2225
- {
2226
- "epoch": 0.6387275940419086,
2227
- "grad_norm": 0.9655591249465942,
2228
- "learning_rate": 2.241e-05,
2229
- "loss": 0.0611,
2230
- "step": 2530
2231
- },
2232
- {
2233
- "epoch": 0.6412522090381216,
2234
- "grad_norm": 0.9924450516700745,
2235
- "learning_rate": 2.238e-05,
2236
- "loss": 0.0676,
2237
- "step": 2540
2238
- },
2239
- {
2240
- "epoch": 0.6437768240343348,
2241
- "grad_norm": 1.12591552734375,
2242
- "learning_rate": 2.235e-05,
2243
- "loss": 0.0804,
2244
- "step": 2550
2245
- },
2246
- {
2247
- "epoch": 0.6437768240343348,
2248
- "eval_f1": 0.612305676335696,
2249
- "eval_loss": 0.04778852313756943,
2250
- "eval_runtime": 1160.4576,
2251
- "eval_samples_per_second": 177.742,
2252
- "eval_steps_per_second": 2.777,
2253
- "step": 2550
2254
- },
2255
- {
2256
- "epoch": 0.6463014390305478,
2257
- "grad_norm": 0.7478006482124329,
2258
- "learning_rate": 2.232e-05,
2259
- "loss": 0.0638,
2260
- "step": 2560
2261
- },
2262
- {
2263
- "epoch": 0.648826054026761,
2264
- "grad_norm": 0.7661213874816895,
2265
- "learning_rate": 2.2290000000000002e-05,
2266
- "loss": 0.0632,
2267
- "step": 2570
2268
- },
2269
- {
2270
- "epoch": 0.651350669022974,
2271
- "grad_norm": 0.9824168086051941,
2272
- "learning_rate": 2.226e-05,
2273
- "loss": 0.0602,
2274
- "step": 2580
2275
- },
2276
- {
2277
- "epoch": 0.653875284019187,
2278
- "grad_norm": 1.1700901985168457,
2279
- "learning_rate": 2.223e-05,
2280
- "loss": 0.0714,
2281
- "step": 2590
2282
- },
2283
- {
2284
- "epoch": 0.6563998990154002,
2285
- "grad_norm": 0.8846214413642883,
2286
- "learning_rate": 2.22e-05,
2287
- "loss": 0.0666,
2288
- "step": 2600
2289
- },
2290
- {
2291
- "epoch": 0.6563998990154002,
2292
- "eval_f1": 0.6104417670682731,
2293
- "eval_loss": 0.04546576738357544,
2294
- "eval_runtime": 1160.1326,
2295
- "eval_samples_per_second": 177.792,
2296
- "eval_steps_per_second": 2.778,
2297
- "step": 2600
2298
  },
2299
  {
2300
- "epoch": 0.6589245140116132,
2301
- "grad_norm": 0.7641239166259766,
2302
- "learning_rate": 2.217e-05,
2303
- "loss": 0.058,
2304
- "step": 2610
 
 
2305
  },
2306
  {
2307
- "epoch": 0.6614491290078263,
2308
- "grad_norm": 0.5828648209571838,
2309
- "learning_rate": 2.214e-05,
2310
- "loss": 0.0686,
2311
- "step": 2620
2312
- },
2313
- {
2314
- "epoch": 0.6639737440040394,
2315
- "grad_norm": 0.6906914710998535,
2316
- "learning_rate": 2.211e-05,
2317
- "loss": 0.0764,
2318
- "step": 2630
2319
- },
2320
- {
2321
- "epoch": 0.6664983590002524,
2322
- "grad_norm": 1.3137489557266235,
2323
- "learning_rate": 2.208e-05,
2324
- "loss": 0.0768,
2325
- "step": 2640
2326
- },
2327
- {
2328
- "epoch": 0.6690229739964655,
2329
- "grad_norm": 0.863865077495575,
2330
- "learning_rate": 2.205e-05,
2331
- "loss": 0.0749,
2332
- "step": 2650
2333
- },
2334
- {
2335
- "epoch": 0.6690229739964655,
2336
- "eval_f1": 0.6131900703964431,
2337
- "eval_loss": 0.04790908098220825,
2338
- "eval_runtime": 1162.4462,
2339
- "eval_samples_per_second": 177.438,
2340
- "eval_steps_per_second": 2.773,
2341
- "step": 2650
2342
- },
2343
- {
2344
- "epoch": 0.6715475889926786,
2345
- "grad_norm": 0.9182652235031128,
2346
- "learning_rate": 2.202e-05,
2347
- "loss": 0.0625,
2348
- "step": 2660
2349
- },
2350
- {
2351
- "epoch": 0.6740722039888917,
2352
- "grad_norm": 1.4961283206939697,
2353
- "learning_rate": 2.199e-05,
2354
- "loss": 0.0726,
2355
- "step": 2670
2356
- },
2357
- {
2358
- "epoch": 0.6765968189851048,
2359
- "grad_norm": 0.7803681492805481,
2360
- "learning_rate": 2.196e-05,
2361
- "loss": 0.0669,
2362
- "step": 2680
2363
- },
2364
- {
2365
- "epoch": 0.6791214339813179,
2366
- "grad_norm": 1.0371824502944946,
2367
- "learning_rate": 2.193e-05,
2368
- "loss": 0.0566,
2369
- "step": 2690
2370
- },
2371
- {
2372
- "epoch": 0.6816460489775309,
2373
- "grad_norm": 1.1832714080810547,
2374
- "learning_rate": 2.19e-05,
2375
- "loss": 0.067,
2376
- "step": 2700
2377
- },
2378
- {
2379
- "epoch": 0.6816460489775309,
2380
- "eval_f1": 0.6132461161079312,
2381
- "eval_loss": 0.055793602019548416,
2382
- "eval_runtime": 1161.8914,
2383
- "eval_samples_per_second": 177.523,
2384
- "eval_steps_per_second": 2.774,
2385
- "step": 2700
2386
- },
2387
- {
2388
- "epoch": 0.684170663973744,
2389
- "grad_norm": 0.7899573445320129,
2390
- "learning_rate": 2.187e-05,
2391
- "loss": 0.0763,
2392
- "step": 2710
2393
- },
2394
- {
2395
- "epoch": 0.6866952789699571,
2396
- "grad_norm": 1.4638808965682983,
2397
- "learning_rate": 2.184e-05,
2398
- "loss": 0.0768,
2399
- "step": 2720
2400
- },
2401
- {
2402
- "epoch": 0.6892198939661701,
2403
- "grad_norm": 0.7547538876533508,
2404
- "learning_rate": 2.181e-05,
2405
- "loss": 0.0761,
2406
- "step": 2730
2407
- },
2408
- {
2409
- "epoch": 0.6917445089623833,
2410
- "grad_norm": 0.5143932700157166,
2411
- "learning_rate": 2.178e-05,
2412
- "loss": 0.0808,
2413
- "step": 2740
2414
- },
2415
- {
2416
- "epoch": 0.6942691239585963,
2417
- "grad_norm": 1.011730432510376,
2418
- "learning_rate": 2.175e-05,
2419
- "loss": 0.068,
2420
- "step": 2750
2421
  },
2422
  {
2423
- "epoch": 0.6942691239585963,
2424
- "eval_f1": 0.6108202443280978,
2425
- "eval_loss": 0.053855251520872116,
2426
- "eval_runtime": 1160.3338,
2427
- "eval_samples_per_second": 177.761,
2428
- "eval_steps_per_second": 2.778,
2429
- "step": 2750
2430
  },
2431
  {
2432
- "epoch": 0.6942691239585963,
2433
- "step": 2750,
2434
- "total_flos": 1.3639932886745088e+19,
2435
- "train_loss": 0.019194319985129618,
2436
- "train_runtime": 18605.0451,
2437
- "train_samples_per_second": 34.399,
2438
- "train_steps_per_second": 0.537
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2439
  }
2440
  ],
2441
- "logging_steps": 10,
2442
- "max_steps": 10000,
2443
  "num_input_tokens_seen": 0,
2444
- "num_train_epochs": 3,
2445
  "save_steps": 500,
2446
  "stateful_callbacks": {
2447
  "EarlyStoppingCallback": {
2448
  "args": {
2449
- "early_stopping_patience": 3,
2450
  "early_stopping_threshold": 0.0
2451
  },
2452
  "attributes": {
@@ -2459,13 +148,13 @@
2459
  "should_evaluate": false,
2460
  "should_log": false,
2461
  "should_save": true,
2462
- "should_training_stop": false
2463
  },
2464
  "attributes": {}
2465
  }
2466
  },
2467
- "total_flos": 1.3639932886745088e+19,
2468
- "train_batch_size": 64,
2469
  "trial_name": null,
2470
  "trial_params": null
2471
  }
 
1
  {
2
+ "best_metric": 0.38255795104555035,
3
+ "best_model_checkpoint": "./step_test_microsoft_dit/checkpoint-2000",
4
+ "epoch": 2.271580010095911,
5
+ "eval_steps": 500,
6
+ "global_step": 4500,
7
  "is_hyper_param_search": false,
8
  "is_local_process_zero": true,
9
  "is_world_process_zero": true,
10
  "log_history": [
11
  {
12
+ "epoch": 0.2523977788995457,
13
+ "eval_f1": 0.3040393637715986,
14
+ "eval_loss": 0.16232739388942719,
15
+ "eval_runtime": 573.4246,
16
+ "eval_samples_per_second": 359.702,
17
+ "eval_steps_per_second": 2.811,
18
+ "step": 500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
  },
20
  {
21
+ "epoch": 0.5047955577990914,
22
+ "grad_norm": 3.499274253845215,
23
+ "learning_rate": 0.0004986304738420684,
24
+ "loss": 0.1656,
25
+ "step": 1000
26
  },
27
  {
28
+ "epoch": 0.5047955577990914,
29
  "eval_f1": NaN,
30
+ "eval_loss": 0.43072596192359924,
31
+ "eval_runtime": 571.3845,
32
+ "eval_samples_per_second": 360.986,
33
+ "eval_steps_per_second": 2.821,
34
+ "step": 1000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
  },
36
  {
37
+ "epoch": 0.7571933366986371,
38
+ "eval_f1": NaN,
39
+ "eval_loss": 0.14925973117351532,
40
+ "eval_runtime": 574.5622,
41
+ "eval_samples_per_second": 358.99,
42
+ "eval_steps_per_second": 2.806,
43
+ "step": 1500
44
  },
45
  {
46
+ "epoch": 1.0095911155981827,
47
+ "grad_norm": 0.47774481773376465,
48
+ "learning_rate": 0.0004945369001834514,
49
+ "loss": 0.14,
 
 
50
  "step": 2000
51
  },
52
  {
53
+ "epoch": 1.0095911155981827,
54
+ "eval_f1": 0.38255795104555035,
55
+ "eval_loss": 0.18703292310237885,
56
+ "eval_runtime": 575.9325,
57
+ "eval_samples_per_second": 358.136,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  "eval_steps_per_second": 2.799,
59
+ "step": 2000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  },
61
  {
62
+ "epoch": 1.2619888944977284,
63
+ "eval_f1": NaN,
64
+ "eval_loss": 0.10761596262454987,
65
+ "eval_runtime": 570.7076,
66
+ "eval_samples_per_second": 361.414,
67
+ "eval_steps_per_second": 2.825,
68
  "step": 2500
69
  },
70
  {
71
+ "epoch": 1.514386673397274,
72
+ "grad_norm": 0.8171533942222595,
73
+ "learning_rate": 0.0004877641290737884,
74
+ "loss": 0.1285,
75
+ "step": 3000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  },
77
  {
78
+ "epoch": 1.514386673397274,
79
+ "eval_f1": 0.381203007518797,
80
+ "eval_loss": 0.08960258215665817,
81
+ "eval_runtime": 570.2026,
82
+ "eval_samples_per_second": 361.735,
83
+ "eval_steps_per_second": 2.827,
84
+ "step": 3000
85
  },
86
  {
87
+ "epoch": 1.76678445229682,
88
+ "eval_f1": NaN,
89
+ "eval_loss": 0.14047682285308838,
90
+ "eval_runtime": 572.8668,
91
+ "eval_samples_per_second": 360.052,
92
+ "eval_steps_per_second": 2.814,
93
+ "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  },
95
  {
96
+ "epoch": 2.0191822311963654,
97
+ "grad_norm": 0.7625566124916077,
98
+ "learning_rate": 0.0004783863644106502,
99
+ "loss": 0.123,
100
+ "step": 4000
 
 
101
  },
102
  {
103
+ "epoch": 2.0191822311963654,
104
+ "eval_f1": NaN,
105
+ "eval_loss": 0.20614498853683472,
106
+ "eval_runtime": 571.417,
107
+ "eval_samples_per_second": 360.966,
108
+ "eval_steps_per_second": 2.821,
109
+ "step": 4000
110
+ },
111
+ {
112
+ "epoch": 2.271580010095911,
113
+ "eval_f1": 0.23949467373266953,
114
+ "eval_loss": 0.22833816707134247,
115
+ "eval_runtime": 576.5663,
116
+ "eval_samples_per_second": 357.742,
117
+ "eval_steps_per_second": 2.796,
118
+ "step": 4500
119
+ },
120
+ {
121
+ "epoch": 2.271580010095911,
122
+ "step": 4500,
123
+ "total_flos": 4.4623040438986555e+19,
124
+ "train_loss": 0.13670311906602647,
125
+ "train_runtime": 12745.6466,
126
+ "train_samples_per_second": 301.279,
127
+ "train_steps_per_second": 2.354
128
  }
129
  ],
130
+ "logging_steps": 1000,
131
+ "max_steps": 30000,
132
  "num_input_tokens_seen": 0,
133
+ "num_train_epochs": 16,
134
  "save_steps": 500,
135
  "stateful_callbacks": {
136
  "EarlyStoppingCallback": {
137
  "args": {
138
+ "early_stopping_patience": 5,
139
  "early_stopping_threshold": 0.0
140
  },
141
  "attributes": {
 
148
  "should_evaluate": false,
149
  "should_log": false,
150
  "should_save": true,
151
+ "should_training_stop": true
152
  },
153
  "attributes": {}
154
  }
155
  },
156
+ "total_flos": 4.4623040438986555e+19,
157
+ "train_batch_size": 128,
158
  "trial_name": null,
159
  "trial_params": null
160
  }