1NEYRON1 commited on
Commit
f2e0a49
·
1 Parent(s): 36ad183

Delete checkpoint/trainer_state.json

Browse files
Files changed (1) hide show
  1. checkpoint/trainer_state.json +0 -2059
checkpoint/trainer_state.json DELETED
@@ -1,2059 +0,0 @@
1
- {
2
- "best_global_step": 2500,
3
- "best_metric": 0.6559016016048936,
4
- "best_model_checkpoint": "./results/checkpoint-2500",
5
- "epoch": 1.2195121951219512,
6
- "eval_steps": 100,
7
- "global_step": 2500,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.004878048780487805,
14
- "grad_norm": 0.5707780718803406,
15
- "learning_rate": 1.0000000000000002e-06,
16
- "loss": 0.373,
17
- "step": 10
18
- },
19
- {
20
- "epoch": 0.00975609756097561,
21
- "grad_norm": 0.5740946531295776,
22
- "learning_rate": 2.0000000000000003e-06,
23
- "loss": 0.3695,
24
- "step": 20
25
- },
26
- {
27
- "epoch": 0.014634146341463415,
28
- "grad_norm": 0.5766364336013794,
29
- "learning_rate": 3e-06,
30
- "loss": 0.3651,
31
- "step": 30
32
- },
33
- {
34
- "epoch": 0.01951219512195122,
35
- "grad_norm": 0.5667976140975952,
36
- "learning_rate": 4.000000000000001e-06,
37
- "loss": 0.3589,
38
- "step": 40
39
- },
40
- {
41
- "epoch": 0.024390243902439025,
42
- "grad_norm": 0.5701761841773987,
43
- "learning_rate": 5e-06,
44
- "loss": 0.3521,
45
- "step": 50
46
- },
47
- {
48
- "epoch": 0.02926829268292683,
49
- "grad_norm": 0.562853991985321,
50
- "learning_rate": 6e-06,
51
- "loss": 0.3424,
52
- "step": 60
53
- },
54
- {
55
- "epoch": 0.03414634146341464,
56
- "grad_norm": 0.558131754398346,
57
- "learning_rate": 7.000000000000001e-06,
58
- "loss": 0.3304,
59
- "step": 70
60
- },
61
- {
62
- "epoch": 0.03902439024390244,
63
- "grad_norm": 0.5538951754570007,
64
- "learning_rate": 8.000000000000001e-06,
65
- "loss": 0.315,
66
- "step": 80
67
- },
68
- {
69
- "epoch": 0.04390243902439024,
70
- "grad_norm": 0.5401511788368225,
71
- "learning_rate": 9e-06,
72
- "loss": 0.2998,
73
- "step": 90
74
- },
75
- {
76
- "epoch": 0.04878048780487805,
77
- "grad_norm": 0.5289145708084106,
78
- "learning_rate": 1e-05,
79
- "loss": 0.2813,
80
- "step": 100
81
- },
82
- {
83
- "epoch": 0.04878048780487805,
84
- "eval_f1": 0.02288934762189008,
85
- "eval_loss": 0.2639255225658417,
86
- "eval_precision": 0.011585886400544311,
87
- "eval_recall": 0.938986325802616,
88
- "eval_runtime": 132.4738,
89
- "eval_samples_per_second": 61.899,
90
- "eval_steps_per_second": 0.974,
91
- "step": 100
92
- },
93
- {
94
- "epoch": 0.05365853658536585,
95
- "grad_norm": 0.5152837038040161,
96
- "learning_rate": 1.1000000000000001e-05,
97
- "loss": 0.2626,
98
- "step": 110
99
- },
100
- {
101
- "epoch": 0.05853658536585366,
102
- "grad_norm": 0.48544010519981384,
103
- "learning_rate": 1.2e-05,
104
- "loss": 0.2423,
105
- "step": 120
106
- },
107
- {
108
- "epoch": 0.06341463414634146,
109
- "grad_norm": 0.47114691138267517,
110
- "learning_rate": 1.3000000000000001e-05,
111
- "loss": 0.2222,
112
- "step": 130
113
- },
114
- {
115
- "epoch": 0.06829268292682927,
116
- "grad_norm": 0.44177231192588806,
117
- "learning_rate": 1.4000000000000001e-05,
118
- "loss": 0.2014,
119
- "step": 140
120
- },
121
- {
122
- "epoch": 0.07317073170731707,
123
- "grad_norm": 0.4138778746128082,
124
- "learning_rate": 1.5e-05,
125
- "loss": 0.1792,
126
- "step": 150
127
- },
128
- {
129
- "epoch": 0.07804878048780488,
130
- "grad_norm": 0.3736928403377533,
131
- "learning_rate": 1.6000000000000003e-05,
132
- "loss": 0.1606,
133
- "step": 160
134
- },
135
- {
136
- "epoch": 0.08292682926829269,
137
- "grad_norm": 0.3438684046268463,
138
- "learning_rate": 1.7000000000000003e-05,
139
- "loss": 0.1425,
140
- "step": 170
141
- },
142
- {
143
- "epoch": 0.08780487804878048,
144
- "grad_norm": 0.29839885234832764,
145
- "learning_rate": 1.8e-05,
146
- "loss": 0.1254,
147
- "step": 180
148
- },
149
- {
150
- "epoch": 0.09268292682926829,
151
- "grad_norm": 0.27134254574775696,
152
- "learning_rate": 1.9e-05,
153
- "loss": 0.1091,
154
- "step": 190
155
- },
156
- {
157
- "epoch": 0.0975609756097561,
158
- "grad_norm": 0.24228985607624054,
159
- "learning_rate": 2e-05,
160
- "loss": 0.0955,
161
- "step": 200
162
- },
163
- {
164
- "epoch": 0.0975609756097561,
165
- "eval_f1": 0.40798082528988794,
166
- "eval_loss": 0.08500728011131287,
167
- "eval_precision": 0.2875361436615431,
168
- "eval_recall": 0.7020659928656362,
169
- "eval_runtime": 133.9225,
170
- "eval_samples_per_second": 61.229,
171
- "eval_steps_per_second": 0.963,
172
- "step": 200
173
- },
174
- {
175
- "epoch": 0.1024390243902439,
176
- "grad_norm": 0.21252021193504333,
177
- "learning_rate": 2.1e-05,
178
- "loss": 0.0858,
179
- "step": 210
180
- },
181
- {
182
- "epoch": 0.1073170731707317,
183
- "grad_norm": 0.20333224534988403,
184
- "learning_rate": 2.2000000000000003e-05,
185
- "loss": 0.0761,
186
- "step": 220
187
- },
188
- {
189
- "epoch": 0.11219512195121951,
190
- "grad_norm": 0.1808163970708847,
191
- "learning_rate": 2.3000000000000003e-05,
192
- "loss": 0.0707,
193
- "step": 230
194
- },
195
- {
196
- "epoch": 0.11707317073170732,
197
- "grad_norm": 0.14729662239551544,
198
- "learning_rate": 2.4e-05,
199
- "loss": 0.0631,
200
- "step": 240
201
- },
202
- {
203
- "epoch": 0.12195121951219512,
204
- "grad_norm": 0.16881898045539856,
205
- "learning_rate": 2.5e-05,
206
- "loss": 0.0588,
207
- "step": 250
208
- },
209
- {
210
- "epoch": 0.12682926829268293,
211
- "grad_norm": 0.17096662521362305,
212
- "learning_rate": 2.6000000000000002e-05,
213
- "loss": 0.0535,
214
- "step": 260
215
- },
216
- {
217
- "epoch": 0.13170731707317074,
218
- "grad_norm": 0.15806329250335693,
219
- "learning_rate": 2.7000000000000002e-05,
220
- "loss": 0.0507,
221
- "step": 270
222
- },
223
- {
224
- "epoch": 0.13658536585365855,
225
- "grad_norm": 0.10550623387098312,
226
- "learning_rate": 2.8000000000000003e-05,
227
- "loss": 0.0529,
228
- "step": 280
229
- },
230
- {
231
- "epoch": 0.14146341463414633,
232
- "grad_norm": 0.1570628583431244,
233
- "learning_rate": 2.9e-05,
234
- "loss": 0.0464,
235
- "step": 290
236
- },
237
- {
238
- "epoch": 0.14634146341463414,
239
- "grad_norm": 0.11487462371587753,
240
- "learning_rate": 3e-05,
241
- "loss": 0.0455,
242
- "step": 300
243
- },
244
- {
245
- "epoch": 0.14634146341463414,
246
- "eval_f1": 0.40842269111034246,
247
- "eval_loss": 0.04193972423672676,
248
- "eval_precision": 0.28798780487804876,
249
- "eval_recall": 0.7019916765755053,
250
- "eval_runtime": 131.8065,
251
- "eval_samples_per_second": 62.212,
252
- "eval_steps_per_second": 0.979,
253
- "step": 300
254
- },
255
- {
256
- "epoch": 0.15121951219512195,
257
- "grad_norm": 0.10432987660169601,
258
- "learning_rate": 3.1e-05,
259
- "loss": 0.0414,
260
- "step": 310
261
- },
262
- {
263
- "epoch": 0.15609756097560976,
264
- "grad_norm": 0.12195131927728653,
265
- "learning_rate": 3.2000000000000005e-05,
266
- "loss": 0.0415,
267
- "step": 320
268
- },
269
- {
270
- "epoch": 0.16097560975609757,
271
- "grad_norm": 0.11948198080062866,
272
- "learning_rate": 3.3e-05,
273
- "loss": 0.0415,
274
- "step": 330
275
- },
276
- {
277
- "epoch": 0.16585365853658537,
278
- "grad_norm": 0.11321187764406204,
279
- "learning_rate": 3.4000000000000007e-05,
280
- "loss": 0.0402,
281
- "step": 340
282
- },
283
- {
284
- "epoch": 0.17073170731707318,
285
- "grad_norm": 0.13302814960479736,
286
- "learning_rate": 3.5e-05,
287
- "loss": 0.0385,
288
- "step": 350
289
- },
290
- {
291
- "epoch": 0.17560975609756097,
292
- "grad_norm": 0.18504567444324493,
293
- "learning_rate": 3.6e-05,
294
- "loss": 0.0393,
295
- "step": 360
296
- },
297
- {
298
- "epoch": 0.18048780487804877,
299
- "grad_norm": 0.11665136367082596,
300
- "learning_rate": 3.7e-05,
301
- "loss": 0.0361,
302
- "step": 370
303
- },
304
- {
305
- "epoch": 0.18536585365853658,
306
- "grad_norm": 0.09814433008432388,
307
- "learning_rate": 3.8e-05,
308
- "loss": 0.0373,
309
- "step": 380
310
- },
311
- {
312
- "epoch": 0.1902439024390244,
313
- "grad_norm": 0.10006968677043915,
314
- "learning_rate": 3.9000000000000006e-05,
315
- "loss": 0.0336,
316
- "step": 390
317
- },
318
- {
319
- "epoch": 0.1951219512195122,
320
- "grad_norm": 0.14240923523902893,
321
- "learning_rate": 4e-05,
322
- "loss": 0.0363,
323
- "step": 400
324
- },
325
- {
326
- "epoch": 0.1951219512195122,
327
- "eval_f1": 0.4903333065186496,
328
- "eval_loss": 0.03265204280614853,
329
- "eval_precision": 0.383563367873474,
330
- "eval_recall": 0.6794738406658739,
331
- "eval_runtime": 131.7664,
332
- "eval_samples_per_second": 62.231,
333
- "eval_steps_per_second": 0.979,
334
- "step": 400
335
- },
336
- {
337
- "epoch": 0.2,
338
- "grad_norm": 0.09334200620651245,
339
- "learning_rate": 4.1e-05,
340
- "loss": 0.0332,
341
- "step": 410
342
- },
343
- {
344
- "epoch": 0.2048780487804878,
345
- "grad_norm": 0.0977184921503067,
346
- "learning_rate": 4.2e-05,
347
- "loss": 0.0326,
348
- "step": 420
349
- },
350
- {
351
- "epoch": 0.2097560975609756,
352
- "grad_norm": 0.12227483093738556,
353
- "learning_rate": 4.3e-05,
354
- "loss": 0.0321,
355
- "step": 430
356
- },
357
- {
358
- "epoch": 0.2146341463414634,
359
- "grad_norm": 0.07999078929424286,
360
- "learning_rate": 4.4000000000000006e-05,
361
- "loss": 0.0304,
362
- "step": 440
363
- },
364
- {
365
- "epoch": 0.21951219512195122,
366
- "grad_norm": 0.13925114274024963,
367
- "learning_rate": 4.5e-05,
368
- "loss": 0.0296,
369
- "step": 450
370
- },
371
- {
372
- "epoch": 0.22439024390243903,
373
- "grad_norm": 0.09850054234266281,
374
- "learning_rate": 4.600000000000001e-05,
375
- "loss": 0.0286,
376
- "step": 460
377
- },
378
- {
379
- "epoch": 0.22926829268292684,
380
- "grad_norm": 0.09087326377630234,
381
- "learning_rate": 4.7e-05,
382
- "loss": 0.0293,
383
- "step": 470
384
- },
385
- {
386
- "epoch": 0.23414634146341465,
387
- "grad_norm": 0.11158157885074615,
388
- "learning_rate": 4.8e-05,
389
- "loss": 0.0329,
390
- "step": 480
391
- },
392
- {
393
- "epoch": 0.23902439024390243,
394
- "grad_norm": 0.0853157714009285,
395
- "learning_rate": 4.9e-05,
396
- "loss": 0.0302,
397
- "step": 490
398
- },
399
- {
400
- "epoch": 0.24390243902439024,
401
- "grad_norm": 0.1297878473997116,
402
- "learning_rate": 5e-05,
403
- "loss": 0.0321,
404
- "step": 500
405
- },
406
- {
407
- "epoch": 0.24390243902439024,
408
- "eval_f1": 0.5804941656357618,
409
- "eval_loss": 0.02912677265703678,
410
- "eval_precision": 0.5276241414939524,
411
- "eval_recall": 0.645139714625446,
412
- "eval_runtime": 132.6761,
413
- "eval_samples_per_second": 61.805,
414
- "eval_steps_per_second": 0.972,
415
- "step": 500
416
- },
417
- {
418
- "epoch": 0.24878048780487805,
419
- "grad_norm": 0.1282796561717987,
420
- "learning_rate": 4.9911504424778765e-05,
421
- "loss": 0.0316,
422
- "step": 510
423
- },
424
- {
425
- "epoch": 0.25365853658536586,
426
- "grad_norm": 0.12388455867767334,
427
- "learning_rate": 4.982300884955752e-05,
428
- "loss": 0.0302,
429
- "step": 520
430
- },
431
- {
432
- "epoch": 0.25853658536585367,
433
- "grad_norm": 0.11067409813404083,
434
- "learning_rate": 4.9734513274336284e-05,
435
- "loss": 0.0299,
436
- "step": 530
437
- },
438
- {
439
- "epoch": 0.2634146341463415,
440
- "grad_norm": 0.16229726374149323,
441
- "learning_rate": 4.964601769911505e-05,
442
- "loss": 0.0269,
443
- "step": 540
444
- },
445
- {
446
- "epoch": 0.2682926829268293,
447
- "grad_norm": 0.12282289564609528,
448
- "learning_rate": 4.955752212389381e-05,
449
- "loss": 0.0303,
450
- "step": 550
451
- },
452
- {
453
- "epoch": 0.2731707317073171,
454
- "grad_norm": 0.11870788037776947,
455
- "learning_rate": 4.946902654867257e-05,
456
- "loss": 0.031,
457
- "step": 560
458
- },
459
- {
460
- "epoch": 0.2780487804878049,
461
- "grad_norm": 0.17082759737968445,
462
- "learning_rate": 4.938053097345133e-05,
463
- "loss": 0.0318,
464
- "step": 570
465
- },
466
- {
467
- "epoch": 0.28292682926829266,
468
- "grad_norm": 0.10085482150316238,
469
- "learning_rate": 4.929203539823009e-05,
470
- "loss": 0.0269,
471
- "step": 580
472
- },
473
- {
474
- "epoch": 0.28780487804878047,
475
- "grad_norm": 0.15493524074554443,
476
- "learning_rate": 4.9203539823008854e-05,
477
- "loss": 0.0308,
478
- "step": 590
479
- },
480
- {
481
- "epoch": 0.2926829268292683,
482
- "grad_norm": 0.106038898229599,
483
- "learning_rate": 4.911504424778761e-05,
484
- "loss": 0.0284,
485
- "step": 600
486
- },
487
- {
488
- "epoch": 0.2926829268292683,
489
- "eval_f1": 0.5525415591430115,
490
- "eval_loss": 0.027496203780174255,
491
- "eval_precision": 0.4633618520382486,
492
- "eval_recall": 0.684230083234245,
493
- "eval_runtime": 132.4968,
494
- "eval_samples_per_second": 61.888,
495
- "eval_steps_per_second": 0.974,
496
- "step": 600
497
- },
498
- {
499
- "epoch": 0.2975609756097561,
500
- "grad_norm": 0.0896712988615036,
501
- "learning_rate": 4.902654867256637e-05,
502
- "loss": 0.0279,
503
- "step": 610
504
- },
505
- {
506
- "epoch": 0.3024390243902439,
507
- "grad_norm": 0.0858699232339859,
508
- "learning_rate": 4.893805309734513e-05,
509
- "loss": 0.0287,
510
- "step": 620
511
- },
512
- {
513
- "epoch": 0.3073170731707317,
514
- "grad_norm": 0.09710809588432312,
515
- "learning_rate": 4.88495575221239e-05,
516
- "loss": 0.0253,
517
- "step": 630
518
- },
519
- {
520
- "epoch": 0.3121951219512195,
521
- "grad_norm": 0.09191035479307175,
522
- "learning_rate": 4.876106194690266e-05,
523
- "loss": 0.0251,
524
- "step": 640
525
- },
526
- {
527
- "epoch": 0.3170731707317073,
528
- "grad_norm": 0.08466064929962158,
529
- "learning_rate": 4.867256637168142e-05,
530
- "loss": 0.0241,
531
- "step": 650
532
- },
533
- {
534
- "epoch": 0.32195121951219513,
535
- "grad_norm": 0.10560336709022522,
536
- "learning_rate": 4.858407079646018e-05,
537
- "loss": 0.0263,
538
- "step": 660
539
- },
540
- {
541
- "epoch": 0.32682926829268294,
542
- "grad_norm": 0.1608184576034546,
543
- "learning_rate": 4.849557522123894e-05,
544
- "loss": 0.027,
545
- "step": 670
546
- },
547
- {
548
- "epoch": 0.33170731707317075,
549
- "grad_norm": 0.10734377801418304,
550
- "learning_rate": 4.84070796460177e-05,
551
- "loss": 0.0281,
552
- "step": 680
553
- },
554
- {
555
- "epoch": 0.33658536585365856,
556
- "grad_norm": 0.07804067432880402,
557
- "learning_rate": 4.831858407079646e-05,
558
- "loss": 0.0274,
559
- "step": 690
560
- },
561
- {
562
- "epoch": 0.34146341463414637,
563
- "grad_norm": 0.08093168586492538,
564
- "learning_rate": 4.823008849557522e-05,
565
- "loss": 0.029,
566
- "step": 700
567
- },
568
- {
569
- "epoch": 0.34146341463414637,
570
- "eval_f1": 0.626304952497923,
571
- "eval_loss": 0.026616454124450684,
572
- "eval_precision": 0.6093343642370141,
573
- "eval_recall": 0.6442479191438764,
574
- "eval_runtime": 132.4172,
575
- "eval_samples_per_second": 61.925,
576
- "eval_steps_per_second": 0.974,
577
- "step": 700
578
- },
579
- {
580
- "epoch": 0.3463414634146341,
581
- "grad_norm": 0.12165658921003342,
582
- "learning_rate": 4.814159292035398e-05,
583
- "loss": 0.0303,
584
- "step": 710
585
- },
586
- {
587
- "epoch": 0.35121951219512193,
588
- "grad_norm": 0.11557145416736603,
589
- "learning_rate": 4.805309734513275e-05,
590
- "loss": 0.0288,
591
- "step": 720
592
- },
593
- {
594
- "epoch": 0.35609756097560974,
595
- "grad_norm": 0.120982825756073,
596
- "learning_rate": 4.7964601769911506e-05,
597
- "loss": 0.0251,
598
- "step": 730
599
- },
600
- {
601
- "epoch": 0.36097560975609755,
602
- "grad_norm": 0.09892363101243973,
603
- "learning_rate": 4.787610619469027e-05,
604
- "loss": 0.0292,
605
- "step": 740
606
- },
607
- {
608
- "epoch": 0.36585365853658536,
609
- "grad_norm": 0.10615638643503189,
610
- "learning_rate": 4.778761061946903e-05,
611
- "loss": 0.0258,
612
- "step": 750
613
- },
614
- {
615
- "epoch": 0.37073170731707317,
616
- "grad_norm": 0.1078324243426323,
617
- "learning_rate": 4.769911504424779e-05,
618
- "loss": 0.0249,
619
- "step": 760
620
- },
621
- {
622
- "epoch": 0.375609756097561,
623
- "grad_norm": 0.131972536444664,
624
- "learning_rate": 4.761061946902655e-05,
625
- "loss": 0.0291,
626
- "step": 770
627
- },
628
- {
629
- "epoch": 0.3804878048780488,
630
- "grad_norm": 0.08731543272733688,
631
- "learning_rate": 4.752212389380531e-05,
632
- "loss": 0.0294,
633
- "step": 780
634
- },
635
- {
636
- "epoch": 0.3853658536585366,
637
- "grad_norm": 0.14496171474456787,
638
- "learning_rate": 4.743362831858407e-05,
639
- "loss": 0.0235,
640
- "step": 790
641
- },
642
- {
643
- "epoch": 0.3902439024390244,
644
- "grad_norm": 0.10081491619348526,
645
- "learning_rate": 4.734513274336283e-05,
646
- "loss": 0.0304,
647
- "step": 800
648
- },
649
- {
650
- "epoch": 0.3902439024390244,
651
- "eval_f1": 0.5844212089339851,
652
- "eval_loss": 0.02550012618303299,
653
- "eval_precision": 0.5005829358770535,
654
- "eval_recall": 0.7019916765755053,
655
- "eval_runtime": 132.3532,
656
- "eval_samples_per_second": 61.955,
657
- "eval_steps_per_second": 0.975,
658
- "step": 800
659
- },
660
- {
661
- "epoch": 0.3951219512195122,
662
- "grad_norm": 0.07158678770065308,
663
- "learning_rate": 4.7256637168141595e-05,
664
- "loss": 0.025,
665
- "step": 810
666
- },
667
- {
668
- "epoch": 0.4,
669
- "grad_norm": 0.1092309057712555,
670
- "learning_rate": 4.716814159292036e-05,
671
- "loss": 0.0257,
672
- "step": 820
673
- },
674
- {
675
- "epoch": 0.40487804878048783,
676
- "grad_norm": 0.10517269372940063,
677
- "learning_rate": 4.707964601769912e-05,
678
- "loss": 0.0235,
679
- "step": 830
680
- },
681
- {
682
- "epoch": 0.4097560975609756,
683
- "grad_norm": 0.0983092337846756,
684
- "learning_rate": 4.699115044247788e-05,
685
- "loss": 0.0257,
686
- "step": 840
687
- },
688
- {
689
- "epoch": 0.4146341463414634,
690
- "grad_norm": 0.11815937608480453,
691
- "learning_rate": 4.690265486725664e-05,
692
- "loss": 0.0273,
693
- "step": 850
694
- },
695
- {
696
- "epoch": 0.4195121951219512,
697
- "grad_norm": 0.13173235952854156,
698
- "learning_rate": 4.6814159292035396e-05,
699
- "loss": 0.0266,
700
- "step": 860
701
- },
702
- {
703
- "epoch": 0.424390243902439,
704
- "grad_norm": 0.08506595343351364,
705
- "learning_rate": 4.672566371681416e-05,
706
- "loss": 0.0243,
707
- "step": 870
708
- },
709
- {
710
- "epoch": 0.4292682926829268,
711
- "grad_norm": 0.07343988120555878,
712
- "learning_rate": 4.663716814159292e-05,
713
- "loss": 0.026,
714
- "step": 880
715
- },
716
- {
717
- "epoch": 0.43414634146341463,
718
- "grad_norm": 0.08313016593456268,
719
- "learning_rate": 4.6548672566371684e-05,
720
- "loss": 0.0283,
721
- "step": 890
722
- },
723
- {
724
- "epoch": 0.43902439024390244,
725
- "grad_norm": 0.07794070988893509,
726
- "learning_rate": 4.646017699115045e-05,
727
- "loss": 0.0267,
728
- "step": 900
729
- },
730
- {
731
- "epoch": 0.43902439024390244,
732
- "eval_f1": 0.5883609508987152,
733
- "eval_loss": 0.024745287373661995,
734
- "eval_precision": 0.4991456531869725,
735
- "eval_recall": 0.7164090368608799,
736
- "eval_runtime": 132.4255,
737
- "eval_samples_per_second": 61.922,
738
- "eval_steps_per_second": 0.974,
739
- "step": 900
740
- },
741
- {
742
- "epoch": 0.44390243902439025,
743
- "grad_norm": 0.1075003445148468,
744
- "learning_rate": 4.637168141592921e-05,
745
- "loss": 0.0285,
746
- "step": 910
747
- },
748
- {
749
- "epoch": 0.44878048780487806,
750
- "grad_norm": 0.11088255792856216,
751
- "learning_rate": 4.6283185840707966e-05,
752
- "loss": 0.0229,
753
- "step": 920
754
- },
755
- {
756
- "epoch": 0.45365853658536587,
757
- "grad_norm": 0.10665366053581238,
758
- "learning_rate": 4.619469026548673e-05,
759
- "loss": 0.027,
760
- "step": 930
761
- },
762
- {
763
- "epoch": 0.4585365853658537,
764
- "grad_norm": 0.11324100941419601,
765
- "learning_rate": 4.6106194690265485e-05,
766
- "loss": 0.0265,
767
- "step": 940
768
- },
769
- {
770
- "epoch": 0.4634146341463415,
771
- "grad_norm": 0.1377974897623062,
772
- "learning_rate": 4.601769911504425e-05,
773
- "loss": 0.0249,
774
- "step": 950
775
- },
776
- {
777
- "epoch": 0.4682926829268293,
778
- "grad_norm": 0.08704473823308945,
779
- "learning_rate": 4.592920353982301e-05,
780
- "loss": 0.0275,
781
- "step": 960
782
- },
783
- {
784
- "epoch": 0.47317073170731705,
785
- "grad_norm": 0.07551635056734085,
786
- "learning_rate": 4.584070796460177e-05,
787
- "loss": 0.0267,
788
- "step": 970
789
- },
790
- {
791
- "epoch": 0.47804878048780486,
792
- "grad_norm": 0.06435199081897736,
793
- "learning_rate": 4.5752212389380536e-05,
794
- "loss": 0.0263,
795
- "step": 980
796
- },
797
- {
798
- "epoch": 0.48292682926829267,
799
- "grad_norm": 0.1029893159866333,
800
- "learning_rate": 4.56637168141593e-05,
801
- "loss": 0.0215,
802
- "step": 990
803
- },
804
- {
805
- "epoch": 0.4878048780487805,
806
- "grad_norm": 0.09537643939256668,
807
- "learning_rate": 4.5575221238938055e-05,
808
- "loss": 0.024,
809
- "step": 1000
810
- },
811
- {
812
- "epoch": 0.4878048780487805,
813
- "eval_f1": 0.6293077812654881,
814
- "eval_loss": 0.023848505690693855,
815
- "eval_precision": 0.5665417335950979,
816
- "eval_recall": 0.7077140309155767,
817
- "eval_runtime": 131.8557,
818
- "eval_samples_per_second": 62.189,
819
- "eval_steps_per_second": 0.978,
820
- "step": 1000
821
- },
822
- {
823
- "epoch": 0.4926829268292683,
824
- "grad_norm": 0.11213461309671402,
825
- "learning_rate": 4.548672566371682e-05,
826
- "loss": 0.0258,
827
- "step": 1010
828
- },
829
- {
830
- "epoch": 0.4975609756097561,
831
- "grad_norm": 0.09799027442932129,
832
- "learning_rate": 4.5398230088495574e-05,
833
- "loss": 0.0244,
834
- "step": 1020
835
- },
836
- {
837
- "epoch": 0.5024390243902439,
838
- "grad_norm": 0.0755227655172348,
839
- "learning_rate": 4.5309734513274336e-05,
840
- "loss": 0.0269,
841
- "step": 1030
842
- },
843
- {
844
- "epoch": 0.5073170731707317,
845
- "grad_norm": 0.10655403882265091,
846
- "learning_rate": 4.52212389380531e-05,
847
- "loss": 0.027,
848
- "step": 1040
849
- },
850
- {
851
- "epoch": 0.5121951219512195,
852
- "grad_norm": 0.11206043511629105,
853
- "learning_rate": 4.5132743362831855e-05,
854
- "loss": 0.0268,
855
- "step": 1050
856
- },
857
- {
858
- "epoch": 0.5170731707317073,
859
- "grad_norm": 0.08644779026508331,
860
- "learning_rate": 4.5044247787610625e-05,
861
- "loss": 0.0258,
862
- "step": 1060
863
- },
864
- {
865
- "epoch": 0.5219512195121951,
866
- "grad_norm": 0.07745319604873657,
867
- "learning_rate": 4.495575221238939e-05,
868
- "loss": 0.0232,
869
- "step": 1070
870
- },
871
- {
872
- "epoch": 0.526829268292683,
873
- "grad_norm": 0.10960444808006287,
874
- "learning_rate": 4.4867256637168144e-05,
875
- "loss": 0.0271,
876
- "step": 1080
877
- },
878
- {
879
- "epoch": 0.5317073170731708,
880
- "grad_norm": 0.09823193401098251,
881
- "learning_rate": 4.4778761061946906e-05,
882
- "loss": 0.0279,
883
- "step": 1090
884
- },
885
- {
886
- "epoch": 0.5365853658536586,
887
- "grad_norm": 0.10037508606910706,
888
- "learning_rate": 4.469026548672566e-05,
889
- "loss": 0.0226,
890
- "step": 1100
891
- },
892
- {
893
- "epoch": 0.5365853658536586,
894
- "eval_f1": 0.6288948069241012,
895
- "eval_loss": 0.023997528478503227,
896
- "eval_precision": 0.5695851423058369,
897
- "eval_recall": 0.7019916765755053,
898
- "eval_runtime": 131.8239,
899
- "eval_samples_per_second": 62.204,
900
- "eval_steps_per_second": 0.979,
901
- "step": 1100
902
- },
903
- {
904
- "epoch": 0.5414634146341464,
905
- "grad_norm": 0.086235910654068,
906
- "learning_rate": 4.4601769911504425e-05,
907
- "loss": 0.0243,
908
- "step": 1110
909
- },
910
- {
911
- "epoch": 0.5463414634146342,
912
- "grad_norm": 0.09304425120353699,
913
- "learning_rate": 4.451327433628319e-05,
914
- "loss": 0.0236,
915
- "step": 1120
916
- },
917
- {
918
- "epoch": 0.551219512195122,
919
- "grad_norm": 0.09922584891319275,
920
- "learning_rate": 4.4424778761061944e-05,
921
- "loss": 0.0222,
922
- "step": 1130
923
- },
924
- {
925
- "epoch": 0.5560975609756098,
926
- "grad_norm": 0.07448361814022064,
927
- "learning_rate": 4.433628318584071e-05,
928
- "loss": 0.0246,
929
- "step": 1140
930
- },
931
- {
932
- "epoch": 0.5609756097560976,
933
- "grad_norm": 0.08881635963916779,
934
- "learning_rate": 4.4247787610619477e-05,
935
- "loss": 0.023,
936
- "step": 1150
937
- },
938
- {
939
- "epoch": 0.5658536585365853,
940
- "grad_norm": 0.11040020734071732,
941
- "learning_rate": 4.415929203539823e-05,
942
- "loss": 0.0255,
943
- "step": 1160
944
- },
945
- {
946
- "epoch": 0.5707317073170731,
947
- "grad_norm": 0.10073444992303848,
948
- "learning_rate": 4.4070796460176995e-05,
949
- "loss": 0.0234,
950
- "step": 1170
951
- },
952
- {
953
- "epoch": 0.5756097560975609,
954
- "grad_norm": 0.1330658495426178,
955
- "learning_rate": 4.398230088495575e-05,
956
- "loss": 0.0253,
957
- "step": 1180
958
- },
959
- {
960
- "epoch": 0.5804878048780487,
961
- "grad_norm": 0.11121776700019836,
962
- "learning_rate": 4.3893805309734514e-05,
963
- "loss": 0.0253,
964
- "step": 1190
965
- },
966
- {
967
- "epoch": 0.5853658536585366,
968
- "grad_norm": 0.07611318677663803,
969
- "learning_rate": 4.380530973451328e-05,
970
- "loss": 0.021,
971
- "step": 1200
972
- },
973
- {
974
- "epoch": 0.5853658536585366,
975
- "eval_f1": 0.6257888792931642,
976
- "eval_loss": 0.024241872131824493,
977
- "eval_precision": 0.5657657657657658,
978
- "eval_recall": 0.7000594530321046,
979
- "eval_runtime": 132.4797,
980
- "eval_samples_per_second": 61.896,
981
- "eval_steps_per_second": 0.974,
982
- "step": 1200
983
- },
984
- {
985
- "epoch": 0.5902439024390244,
986
- "grad_norm": 0.06641782820224762,
987
- "learning_rate": 4.371681415929203e-05,
988
- "loss": 0.0252,
989
- "step": 1210
990
- },
991
- {
992
- "epoch": 0.5951219512195122,
993
- "grad_norm": 0.10350623726844788,
994
- "learning_rate": 4.3628318584070796e-05,
995
- "loss": 0.0261,
996
- "step": 1220
997
- },
998
- {
999
- "epoch": 0.6,
1000
- "grad_norm": 0.09171286225318909,
1001
- "learning_rate": 4.353982300884956e-05,
1002
- "loss": 0.0222,
1003
- "step": 1230
1004
- },
1005
- {
1006
- "epoch": 0.6048780487804878,
1007
- "grad_norm": 0.09831242263317108,
1008
- "learning_rate": 4.345132743362832e-05,
1009
- "loss": 0.0196,
1010
- "step": 1240
1011
- },
1012
- {
1013
- "epoch": 0.6097560975609756,
1014
- "grad_norm": 0.06655886769294739,
1015
- "learning_rate": 4.3362831858407084e-05,
1016
- "loss": 0.0214,
1017
- "step": 1250
1018
- },
1019
- {
1020
- "epoch": 0.6146341463414634,
1021
- "grad_norm": 0.10220635682344437,
1022
- "learning_rate": 4.327433628318584e-05,
1023
- "loss": 0.0225,
1024
- "step": 1260
1025
- },
1026
- {
1027
- "epoch": 0.6195121951219512,
1028
- "grad_norm": 0.09263930469751358,
1029
- "learning_rate": 4.31858407079646e-05,
1030
- "loss": 0.0237,
1031
- "step": 1270
1032
- },
1033
- {
1034
- "epoch": 0.624390243902439,
1035
- "grad_norm": 0.12217256426811218,
1036
- "learning_rate": 4.3097345132743366e-05,
1037
- "loss": 0.0238,
1038
- "step": 1280
1039
- },
1040
- {
1041
- "epoch": 0.6292682926829268,
1042
- "grad_norm": 0.08786381781101227,
1043
- "learning_rate": 4.300884955752212e-05,
1044
- "loss": 0.0217,
1045
- "step": 1290
1046
- },
1047
- {
1048
- "epoch": 0.6341463414634146,
1049
- "grad_norm": 0.06754878163337708,
1050
- "learning_rate": 4.2920353982300885e-05,
1051
- "loss": 0.0209,
1052
- "step": 1300
1053
- },
1054
- {
1055
- "epoch": 0.6341463414634146,
1056
- "eval_f1": 0.635122838944495,
1057
- "eval_loss": 0.022672206163406372,
1058
- "eval_precision": 0.5643335643335643,
1059
- "eval_recall": 0.726218787158145,
1060
- "eval_runtime": 132.3145,
1061
- "eval_samples_per_second": 61.974,
1062
- "eval_steps_per_second": 0.975,
1063
- "step": 1300
1064
- },
1065
- {
1066
- "epoch": 0.6390243902439025,
1067
- "grad_norm": 0.07975753396749496,
1068
- "learning_rate": 4.283185840707965e-05,
1069
- "loss": 0.0208,
1070
- "step": 1310
1071
- },
1072
- {
1073
- "epoch": 0.6439024390243903,
1074
- "grad_norm": 0.10725169628858566,
1075
- "learning_rate": 4.274336283185841e-05,
1076
- "loss": 0.0243,
1077
- "step": 1320
1078
- },
1079
- {
1080
- "epoch": 0.6487804878048781,
1081
- "grad_norm": 0.07709804177284241,
1082
- "learning_rate": 4.265486725663717e-05,
1083
- "loss": 0.0241,
1084
- "step": 1330
1085
- },
1086
- {
1087
- "epoch": 0.6536585365853659,
1088
- "grad_norm": 0.07307865470647812,
1089
- "learning_rate": 4.256637168141593e-05,
1090
- "loss": 0.0221,
1091
- "step": 1340
1092
- },
1093
- {
1094
- "epoch": 0.6585365853658537,
1095
- "grad_norm": 0.07658623158931732,
1096
- "learning_rate": 4.247787610619469e-05,
1097
- "loss": 0.0216,
1098
- "step": 1350
1099
- },
1100
- {
1101
- "epoch": 0.6634146341463415,
1102
- "grad_norm": 0.12642726302146912,
1103
- "learning_rate": 4.2389380530973455e-05,
1104
- "loss": 0.0247,
1105
- "step": 1360
1106
- },
1107
- {
1108
- "epoch": 0.6682926829268293,
1109
- "grad_norm": 0.09556315094232559,
1110
- "learning_rate": 4.230088495575221e-05,
1111
- "loss": 0.0217,
1112
- "step": 1370
1113
- },
1114
- {
1115
- "epoch": 0.6731707317073171,
1116
- "grad_norm": 0.06885875016450882,
1117
- "learning_rate": 4.2212389380530974e-05,
1118
- "loss": 0.02,
1119
- "step": 1380
1120
- },
1121
- {
1122
- "epoch": 0.6780487804878049,
1123
- "grad_norm": 0.0719994455575943,
1124
- "learning_rate": 4.2123893805309737e-05,
1125
- "loss": 0.0214,
1126
- "step": 1390
1127
- },
1128
- {
1129
- "epoch": 0.6829268292682927,
1130
- "grad_norm": 0.06444905698299408,
1131
- "learning_rate": 4.20353982300885e-05,
1132
- "loss": 0.0207,
1133
- "step": 1400
1134
- },
1135
- {
1136
- "epoch": 0.6829268292682927,
1137
- "eval_f1": 0.6471813478465364,
1138
- "eval_loss": 0.02256700210273266,
1139
- "eval_precision": 0.590557939914163,
1140
- "eval_recall": 0.7158145065398336,
1141
- "eval_runtime": 132.1018,
1142
- "eval_samples_per_second": 62.073,
1143
- "eval_steps_per_second": 0.977,
1144
- "step": 1400
1145
- },
1146
- {
1147
- "epoch": 0.6878048780487804,
1148
- "grad_norm": 0.07349838316440582,
1149
- "learning_rate": 4.194690265486726e-05,
1150
- "loss": 0.0225,
1151
- "step": 1410
1152
- },
1153
- {
1154
- "epoch": 0.6926829268292682,
1155
- "grad_norm": 0.07198076695203781,
1156
- "learning_rate": 4.185840707964602e-05,
1157
- "loss": 0.021,
1158
- "step": 1420
1159
- },
1160
- {
1161
- "epoch": 0.697560975609756,
1162
- "grad_norm": 0.0936046615242958,
1163
- "learning_rate": 4.176991150442478e-05,
1164
- "loss": 0.0252,
1165
- "step": 1430
1166
- },
1167
- {
1168
- "epoch": 0.7024390243902439,
1169
- "grad_norm": 0.08654190599918365,
1170
- "learning_rate": 4.1681415929203544e-05,
1171
- "loss": 0.0218,
1172
- "step": 1440
1173
- },
1174
- {
1175
- "epoch": 0.7073170731707317,
1176
- "grad_norm": 0.08444487303495407,
1177
- "learning_rate": 4.15929203539823e-05,
1178
- "loss": 0.0267,
1179
- "step": 1450
1180
- },
1181
- {
1182
- "epoch": 0.7121951219512195,
1183
- "grad_norm": 0.12378791719675064,
1184
- "learning_rate": 4.150442477876106e-05,
1185
- "loss": 0.0233,
1186
- "step": 1460
1187
- },
1188
- {
1189
- "epoch": 0.7170731707317073,
1190
- "grad_norm": 0.09955397993326187,
1191
- "learning_rate": 4.1415929203539825e-05,
1192
- "loss": 0.0253,
1193
- "step": 1470
1194
- },
1195
- {
1196
- "epoch": 0.7219512195121951,
1197
- "grad_norm": 0.08549737185239792,
1198
- "learning_rate": 4.132743362831858e-05,
1199
- "loss": 0.0252,
1200
- "step": 1480
1201
- },
1202
- {
1203
- "epoch": 0.7268292682926829,
1204
- "grad_norm": 0.06966210901737213,
1205
- "learning_rate": 4.123893805309735e-05,
1206
- "loss": 0.0242,
1207
- "step": 1490
1208
- },
1209
- {
1210
- "epoch": 0.7317073170731707,
1211
- "grad_norm": 0.08285216242074966,
1212
- "learning_rate": 4.115044247787611e-05,
1213
- "loss": 0.0219,
1214
- "step": 1500
1215
- },
1216
- {
1217
- "epoch": 0.7317073170731707,
1218
- "eval_f1": 0.6405168738316225,
1219
- "eval_loss": 0.022300876677036285,
1220
- "eval_precision": 0.5732315820369827,
1221
- "eval_recall": 0.7256985731272295,
1222
- "eval_runtime": 132.0099,
1223
- "eval_samples_per_second": 62.117,
1224
- "eval_steps_per_second": 0.977,
1225
- "step": 1500
1226
- },
1227
- {
1228
- "epoch": 0.7365853658536585,
1229
- "grad_norm": 0.10072668641805649,
1230
- "learning_rate": 4.106194690265487e-05,
1231
- "loss": 0.0227,
1232
- "step": 1510
1233
- },
1234
- {
1235
- "epoch": 0.7414634146341463,
1236
- "grad_norm": 0.10805150866508484,
1237
- "learning_rate": 4.097345132743363e-05,
1238
- "loss": 0.0238,
1239
- "step": 1520
1240
- },
1241
- {
1242
- "epoch": 0.7463414634146341,
1243
- "grad_norm": 0.06195740029215813,
1244
- "learning_rate": 4.088495575221239e-05,
1245
- "loss": 0.0216,
1246
- "step": 1530
1247
- },
1248
- {
1249
- "epoch": 0.751219512195122,
1250
- "grad_norm": 0.06914755702018738,
1251
- "learning_rate": 4.079646017699115e-05,
1252
- "loss": 0.0215,
1253
- "step": 1540
1254
- },
1255
- {
1256
- "epoch": 0.7560975609756098,
1257
- "grad_norm": 0.09708551317453384,
1258
- "learning_rate": 4.0707964601769914e-05,
1259
- "loss": 0.0205,
1260
- "step": 1550
1261
- },
1262
- {
1263
- "epoch": 0.7609756097560976,
1264
- "grad_norm": 0.07369375228881836,
1265
- "learning_rate": 4.061946902654867e-05,
1266
- "loss": 0.0238,
1267
- "step": 1560
1268
- },
1269
- {
1270
- "epoch": 0.7658536585365854,
1271
- "grad_norm": 0.07211218029260635,
1272
- "learning_rate": 4.053097345132743e-05,
1273
- "loss": 0.02,
1274
- "step": 1570
1275
- },
1276
- {
1277
- "epoch": 0.7707317073170732,
1278
- "grad_norm": 0.09262284636497498,
1279
- "learning_rate": 4.0442477876106196e-05,
1280
- "loss": 0.0227,
1281
- "step": 1580
1282
- },
1283
- {
1284
- "epoch": 0.775609756097561,
1285
- "grad_norm": 0.09008630365133286,
1286
- "learning_rate": 4.035398230088496e-05,
1287
- "loss": 0.0228,
1288
- "step": 1590
1289
- },
1290
- {
1291
- "epoch": 0.7804878048780488,
1292
- "grad_norm": 0.1014399379491806,
1293
- "learning_rate": 4.026548672566372e-05,
1294
- "loss": 0.0215,
1295
- "step": 1600
1296
- },
1297
- {
1298
- "epoch": 0.7804878048780488,
1299
- "eval_f1": 0.637167016339246,
1300
- "eval_loss": 0.02189124934375286,
1301
- "eval_precision": 0.556715920453283,
1302
- "eval_recall": 0.7447978596908442,
1303
- "eval_runtime": 132.2243,
1304
- "eval_samples_per_second": 62.016,
1305
- "eval_steps_per_second": 0.976,
1306
- "step": 1600
1307
- },
1308
- {
1309
- "epoch": 0.7853658536585366,
1310
- "grad_norm": 0.1170215755701065,
1311
- "learning_rate": 4.017699115044248e-05,
1312
- "loss": 0.025,
1313
- "step": 1610
1314
- },
1315
- {
1316
- "epoch": 0.7902439024390244,
1317
- "grad_norm": 0.07999496906995773,
1318
- "learning_rate": 4.008849557522124e-05,
1319
- "loss": 0.0237,
1320
- "step": 1620
1321
- },
1322
- {
1323
- "epoch": 0.7951219512195122,
1324
- "grad_norm": 0.08684638142585754,
1325
- "learning_rate": 4e-05,
1326
- "loss": 0.0261,
1327
- "step": 1630
1328
- },
1329
- {
1330
- "epoch": 0.8,
1331
- "grad_norm": 0.06762222945690155,
1332
- "learning_rate": 3.991150442477876e-05,
1333
- "loss": 0.0232,
1334
- "step": 1640
1335
- },
1336
- {
1337
- "epoch": 0.8048780487804879,
1338
- "grad_norm": 0.08543413132429123,
1339
- "learning_rate": 3.982300884955752e-05,
1340
- "loss": 0.0225,
1341
- "step": 1650
1342
- },
1343
- {
1344
- "epoch": 0.8097560975609757,
1345
- "grad_norm": 0.09491296857595444,
1346
- "learning_rate": 3.9734513274336285e-05,
1347
- "loss": 0.0221,
1348
- "step": 1660
1349
- },
1350
- {
1351
- "epoch": 0.8146341463414634,
1352
- "grad_norm": 0.06884954869747162,
1353
- "learning_rate": 3.964601769911505e-05,
1354
- "loss": 0.0221,
1355
- "step": 1670
1356
- },
1357
- {
1358
- "epoch": 0.8195121951219512,
1359
- "grad_norm": 0.10209941118955612,
1360
- "learning_rate": 3.955752212389381e-05,
1361
- "loss": 0.025,
1362
- "step": 1680
1363
- },
1364
- {
1365
- "epoch": 0.824390243902439,
1366
- "grad_norm": 0.06364341080188751,
1367
- "learning_rate": 3.9469026548672567e-05,
1368
- "loss": 0.0197,
1369
- "step": 1690
1370
- },
1371
- {
1372
- "epoch": 0.8292682926829268,
1373
- "grad_norm": 0.10673107206821442,
1374
- "learning_rate": 3.938053097345133e-05,
1375
- "loss": 0.0199,
1376
- "step": 1700
1377
- },
1378
- {
1379
- "epoch": 0.8292682926829268,
1380
- "eval_f1": 0.6390703962403836,
1381
- "eval_loss": 0.021866334602236748,
1382
- "eval_precision": 0.5636817897904719,
1383
- "eval_recall": 0.7377378121284186,
1384
- "eval_runtime": 132.1953,
1385
- "eval_samples_per_second": 62.029,
1386
- "eval_steps_per_second": 0.976,
1387
- "step": 1700
1388
- },
1389
- {
1390
- "epoch": 0.8341463414634146,
1391
- "grad_norm": 0.08173991739749908,
1392
- "learning_rate": 3.929203539823009e-05,
1393
- "loss": 0.0217,
1394
- "step": 1710
1395
- },
1396
- {
1397
- "epoch": 0.8390243902439024,
1398
- "grad_norm": 0.09340299665927887,
1399
- "learning_rate": 3.920353982300885e-05,
1400
- "loss": 0.0218,
1401
- "step": 1720
1402
- },
1403
- {
1404
- "epoch": 0.8439024390243902,
1405
- "grad_norm": 0.08972273021936417,
1406
- "learning_rate": 3.911504424778761e-05,
1407
- "loss": 0.0178,
1408
- "step": 1730
1409
- },
1410
- {
1411
- "epoch": 0.848780487804878,
1412
- "grad_norm": 0.11011021584272385,
1413
- "learning_rate": 3.9026548672566374e-05,
1414
- "loss": 0.0282,
1415
- "step": 1740
1416
- },
1417
- {
1418
- "epoch": 0.8536585365853658,
1419
- "grad_norm": 0.1096154972910881,
1420
- "learning_rate": 3.893805309734514e-05,
1421
- "loss": 0.0244,
1422
- "step": 1750
1423
- },
1424
- {
1425
- "epoch": 0.8585365853658536,
1426
- "grad_norm": 0.08531954139471054,
1427
- "learning_rate": 3.88495575221239e-05,
1428
- "loss": 0.0217,
1429
- "step": 1760
1430
- },
1431
- {
1432
- "epoch": 0.8634146341463415,
1433
- "grad_norm": 0.1026742234826088,
1434
- "learning_rate": 3.8761061946902655e-05,
1435
- "loss": 0.0199,
1436
- "step": 1770
1437
- },
1438
- {
1439
- "epoch": 0.8682926829268293,
1440
- "grad_norm": 0.06563183665275574,
1441
- "learning_rate": 3.867256637168142e-05,
1442
- "loss": 0.0185,
1443
- "step": 1780
1444
- },
1445
- {
1446
- "epoch": 0.8731707317073171,
1447
- "grad_norm": 0.07869338989257812,
1448
- "learning_rate": 3.858407079646018e-05,
1449
- "loss": 0.0235,
1450
- "step": 1790
1451
- },
1452
- {
1453
- "epoch": 0.8780487804878049,
1454
- "grad_norm": 0.08610737323760986,
1455
- "learning_rate": 3.849557522123894e-05,
1456
- "loss": 0.0193,
1457
- "step": 1800
1458
- },
1459
- {
1460
- "epoch": 0.8780487804878049,
1461
- "eval_f1": 0.6328935570641732,
1462
- "eval_loss": 0.022152401506900787,
1463
- "eval_precision": 0.5560432140445645,
1464
- "eval_recall": 0.7343935790725327,
1465
- "eval_runtime": 132.0266,
1466
- "eval_samples_per_second": 62.109,
1467
- "eval_steps_per_second": 0.977,
1468
- "step": 1800
1469
- },
1470
- {
1471
- "epoch": 0.8829268292682927,
1472
- "grad_norm": 0.09815912693738937,
1473
- "learning_rate": 3.84070796460177e-05,
1474
- "loss": 0.0179,
1475
- "step": 1810
1476
- },
1477
- {
1478
- "epoch": 0.8878048780487805,
1479
- "grad_norm": 0.06768873333930969,
1480
- "learning_rate": 3.831858407079646e-05,
1481
- "loss": 0.0219,
1482
- "step": 1820
1483
- },
1484
- {
1485
- "epoch": 0.8926829268292683,
1486
- "grad_norm": 0.09629742056131363,
1487
- "learning_rate": 3.8230088495575226e-05,
1488
- "loss": 0.0212,
1489
- "step": 1830
1490
- },
1491
- {
1492
- "epoch": 0.8975609756097561,
1493
- "grad_norm": 0.0719202533364296,
1494
- "learning_rate": 3.814159292035399e-05,
1495
- "loss": 0.0221,
1496
- "step": 1840
1497
- },
1498
- {
1499
- "epoch": 0.9024390243902439,
1500
- "grad_norm": 0.09463170170783997,
1501
- "learning_rate": 3.8053097345132744e-05,
1502
- "loss": 0.0233,
1503
- "step": 1850
1504
- },
1505
- {
1506
- "epoch": 0.9073170731707317,
1507
- "grad_norm": 0.08261518180370331,
1508
- "learning_rate": 3.796460176991151e-05,
1509
- "loss": 0.0205,
1510
- "step": 1860
1511
- },
1512
- {
1513
- "epoch": 0.9121951219512195,
1514
- "grad_norm": 0.09188443422317505,
1515
- "learning_rate": 3.787610619469027e-05,
1516
- "loss": 0.0186,
1517
- "step": 1870
1518
- },
1519
- {
1520
- "epoch": 0.9170731707317074,
1521
- "grad_norm": 0.0914570763707161,
1522
- "learning_rate": 3.7787610619469026e-05,
1523
- "loss": 0.0207,
1524
- "step": 1880
1525
- },
1526
- {
1527
- "epoch": 0.9219512195121952,
1528
- "grad_norm": 0.0840875431895256,
1529
- "learning_rate": 3.769911504424779e-05,
1530
- "loss": 0.0225,
1531
- "step": 1890
1532
- },
1533
- {
1534
- "epoch": 0.926829268292683,
1535
- "grad_norm": 0.0681457445025444,
1536
- "learning_rate": 3.7610619469026545e-05,
1537
- "loss": 0.0216,
1538
- "step": 1900
1539
- },
1540
- {
1541
- "epoch": 0.926829268292683,
1542
- "eval_f1": 0.6448713774014979,
1543
- "eval_loss": 0.021633492782711983,
1544
- "eval_precision": 0.5738959081952011,
1545
- "eval_recall": 0.7358799048751486,
1546
- "eval_runtime": 132.1734,
1547
- "eval_samples_per_second": 62.04,
1548
- "eval_steps_per_second": 0.976,
1549
- "step": 1900
1550
- },
1551
- {
1552
- "epoch": 0.9317073170731708,
1553
- "grad_norm": 0.07665237784385681,
1554
- "learning_rate": 3.752212389380531e-05,
1555
- "loss": 0.0231,
1556
- "step": 1910
1557
- },
1558
- {
1559
- "epoch": 0.9365853658536586,
1560
- "grad_norm": 0.12333638221025467,
1561
- "learning_rate": 3.743362831858408e-05,
1562
- "loss": 0.0244,
1563
- "step": 1920
1564
- },
1565
- {
1566
- "epoch": 0.9414634146341463,
1567
- "grad_norm": 0.05435947701334953,
1568
- "learning_rate": 3.734513274336283e-05,
1569
- "loss": 0.0209,
1570
- "step": 1930
1571
- },
1572
- {
1573
- "epoch": 0.9463414634146341,
1574
- "grad_norm": 0.09085798263549805,
1575
- "learning_rate": 3.7256637168141596e-05,
1576
- "loss": 0.0212,
1577
- "step": 1940
1578
- },
1579
- {
1580
- "epoch": 0.9512195121951219,
1581
- "grad_norm": 0.11483143270015717,
1582
- "learning_rate": 3.716814159292036e-05,
1583
- "loss": 0.0213,
1584
- "step": 1950
1585
- },
1586
- {
1587
- "epoch": 0.9560975609756097,
1588
- "grad_norm": 0.06926431506872177,
1589
- "learning_rate": 3.7079646017699115e-05,
1590
- "loss": 0.0203,
1591
- "step": 1960
1592
- },
1593
- {
1594
- "epoch": 0.9609756097560975,
1595
- "grad_norm": 0.1061626598238945,
1596
- "learning_rate": 3.699115044247788e-05,
1597
- "loss": 0.0227,
1598
- "step": 1970
1599
- },
1600
- {
1601
- "epoch": 0.9658536585365853,
1602
- "grad_norm": 0.09048457443714142,
1603
- "learning_rate": 3.6902654867256634e-05,
1604
- "loss": 0.0229,
1605
- "step": 1980
1606
- },
1607
- {
1608
- "epoch": 0.9707317073170731,
1609
- "grad_norm": 0.09511193633079529,
1610
- "learning_rate": 3.68141592920354e-05,
1611
- "loss": 0.0216,
1612
- "step": 1990
1613
- },
1614
- {
1615
- "epoch": 0.975609756097561,
1616
- "grad_norm": 0.091468945145607,
1617
- "learning_rate": 3.672566371681416e-05,
1618
- "loss": 0.0168,
1619
- "step": 2000
1620
- },
1621
- {
1622
- "epoch": 0.975609756097561,
1623
- "eval_f1": 0.6518885040409038,
1624
- "eval_loss": 0.021417897194623947,
1625
- "eval_precision": 0.5860964470016016,
1626
- "eval_recall": 0.7343192627824019,
1627
- "eval_runtime": 131.7709,
1628
- "eval_samples_per_second": 62.229,
1629
- "eval_steps_per_second": 0.979,
1630
- "step": 2000
1631
- },
1632
- {
1633
- "epoch": 0.9804878048780488,
1634
- "grad_norm": 0.09001079201698303,
1635
- "learning_rate": 3.663716814159292e-05,
1636
- "loss": 0.0192,
1637
- "step": 2010
1638
- },
1639
- {
1640
- "epoch": 0.9853658536585366,
1641
- "grad_norm": 0.10538368672132492,
1642
- "learning_rate": 3.6548672566371685e-05,
1643
- "loss": 0.0228,
1644
- "step": 2020
1645
- },
1646
- {
1647
- "epoch": 0.9902439024390244,
1648
- "grad_norm": 0.08615951985120773,
1649
- "learning_rate": 3.646017699115045e-05,
1650
- "loss": 0.0238,
1651
- "step": 2030
1652
- },
1653
- {
1654
- "epoch": 0.9951219512195122,
1655
- "grad_norm": 0.0890466719865799,
1656
- "learning_rate": 3.6371681415929204e-05,
1657
- "loss": 0.0218,
1658
- "step": 2040
1659
- },
1660
- {
1661
- "epoch": 1.0,
1662
- "grad_norm": 0.08089441806077957,
1663
- "learning_rate": 3.628318584070797e-05,
1664
- "loss": 0.0204,
1665
- "step": 2050
1666
- },
1667
- {
1668
- "epoch": 1.0048780487804878,
1669
- "grad_norm": 0.09391944855451584,
1670
- "learning_rate": 3.619469026548672e-05,
1671
- "loss": 0.0213,
1672
- "step": 2060
1673
- },
1674
- {
1675
- "epoch": 1.0097560975609756,
1676
- "grad_norm": 0.059187982231378555,
1677
- "learning_rate": 3.6106194690265486e-05,
1678
- "loss": 0.0197,
1679
- "step": 2070
1680
- },
1681
- {
1682
- "epoch": 1.0146341463414634,
1683
- "grad_norm": 0.07538473606109619,
1684
- "learning_rate": 3.601769911504425e-05,
1685
- "loss": 0.0227,
1686
- "step": 2080
1687
- },
1688
- {
1689
- "epoch": 1.0195121951219512,
1690
- "grad_norm": 0.0840989351272583,
1691
- "learning_rate": 3.592920353982301e-05,
1692
- "loss": 0.0205,
1693
- "step": 2090
1694
- },
1695
- {
1696
- "epoch": 1.024390243902439,
1697
- "grad_norm": 0.11819695681333542,
1698
- "learning_rate": 3.5840707964601774e-05,
1699
- "loss": 0.0214,
1700
- "step": 2100
1701
- },
1702
- {
1703
- "epoch": 1.024390243902439,
1704
- "eval_f1": 0.6467788814717813,
1705
- "eval_loss": 0.020904576405882835,
1706
- "eval_precision": 0.5671315745252898,
1707
- "eval_recall": 0.7524524375743162,
1708
- "eval_runtime": 132.2697,
1709
- "eval_samples_per_second": 61.995,
1710
- "eval_steps_per_second": 0.975,
1711
- "step": 2100
1712
- },
1713
- {
1714
- "epoch": 1.0292682926829269,
1715
- "grad_norm": 0.121103934943676,
1716
- "learning_rate": 3.575221238938054e-05,
1717
- "loss": 0.0196,
1718
- "step": 2110
1719
- },
1720
- {
1721
- "epoch": 1.0341463414634147,
1722
- "grad_norm": 0.0714835524559021,
1723
- "learning_rate": 3.566371681415929e-05,
1724
- "loss": 0.0193,
1725
- "step": 2120
1726
- },
1727
- {
1728
- "epoch": 1.0390243902439025,
1729
- "grad_norm": 0.07826493680477142,
1730
- "learning_rate": 3.5575221238938056e-05,
1731
- "loss": 0.0202,
1732
- "step": 2130
1733
- },
1734
- {
1735
- "epoch": 1.0439024390243903,
1736
- "grad_norm": 0.0855259820818901,
1737
- "learning_rate": 3.548672566371681e-05,
1738
- "loss": 0.0217,
1739
- "step": 2140
1740
- },
1741
- {
1742
- "epoch": 1.048780487804878,
1743
- "grad_norm": 0.07726403325796127,
1744
- "learning_rate": 3.5398230088495574e-05,
1745
- "loss": 0.0213,
1746
- "step": 2150
1747
- },
1748
- {
1749
- "epoch": 1.053658536585366,
1750
- "grad_norm": 0.09908368438482285,
1751
- "learning_rate": 3.530973451327434e-05,
1752
- "loss": 0.0186,
1753
- "step": 2160
1754
- },
1755
- {
1756
- "epoch": 1.0585365853658537,
1757
- "grad_norm": 0.08559077978134155,
1758
- "learning_rate": 3.52212389380531e-05,
1759
- "loss": 0.0205,
1760
- "step": 2170
1761
- },
1762
- {
1763
- "epoch": 1.0634146341463415,
1764
- "grad_norm": 0.09128253161907196,
1765
- "learning_rate": 3.513274336283186e-05,
1766
- "loss": 0.0247,
1767
- "step": 2180
1768
- },
1769
- {
1770
- "epoch": 1.0682926829268293,
1771
- "grad_norm": 0.08086485415697098,
1772
- "learning_rate": 3.5044247787610626e-05,
1773
- "loss": 0.0209,
1774
- "step": 2190
1775
- },
1776
- {
1777
- "epoch": 1.0731707317073171,
1778
- "grad_norm": 0.08016868680715561,
1779
- "learning_rate": 3.495575221238938e-05,
1780
- "loss": 0.0205,
1781
- "step": 2200
1782
- },
1783
- {
1784
- "epoch": 1.0731707317073171,
1785
- "eval_f1": 0.633068352720911,
1786
- "eval_loss": 0.020948156714439392,
1787
- "eval_precision": 0.5392418300653595,
1788
- "eval_recall": 0.766423900118906,
1789
- "eval_runtime": 132.144,
1790
- "eval_samples_per_second": 62.054,
1791
- "eval_steps_per_second": 0.976,
1792
- "step": 2200
1793
- },
1794
- {
1795
- "epoch": 1.078048780487805,
1796
- "grad_norm": 0.08863110840320587,
1797
- "learning_rate": 3.4867256637168145e-05,
1798
- "loss": 0.021,
1799
- "step": 2210
1800
- },
1801
- {
1802
- "epoch": 1.0829268292682928,
1803
- "grad_norm": 0.07697419077157974,
1804
- "learning_rate": 3.47787610619469e-05,
1805
- "loss": 0.0222,
1806
- "step": 2220
1807
- },
1808
- {
1809
- "epoch": 1.0878048780487806,
1810
- "grad_norm": 0.07876092195510864,
1811
- "learning_rate": 3.469026548672566e-05,
1812
- "loss": 0.0176,
1813
- "step": 2230
1814
- },
1815
- {
1816
- "epoch": 1.0926829268292684,
1817
- "grad_norm": 0.08885340392589569,
1818
- "learning_rate": 3.4601769911504426e-05,
1819
- "loss": 0.0183,
1820
- "step": 2240
1821
- },
1822
- {
1823
- "epoch": 1.0975609756097562,
1824
- "grad_norm": 0.08264743536710739,
1825
- "learning_rate": 3.451327433628319e-05,
1826
- "loss": 0.0212,
1827
- "step": 2250
1828
- },
1829
- {
1830
- "epoch": 1.102439024390244,
1831
- "grad_norm": 0.06534498184919357,
1832
- "learning_rate": 3.442477876106195e-05,
1833
- "loss": 0.018,
1834
- "step": 2260
1835
- },
1836
- {
1837
- "epoch": 1.1073170731707318,
1838
- "grad_norm": 0.08914489299058914,
1839
- "learning_rate": 3.4336283185840715e-05,
1840
- "loss": 0.0201,
1841
- "step": 2270
1842
- },
1843
- {
1844
- "epoch": 1.1121951219512196,
1845
- "grad_norm": 0.07213272899389267,
1846
- "learning_rate": 3.424778761061947e-05,
1847
- "loss": 0.0145,
1848
- "step": 2280
1849
- },
1850
- {
1851
- "epoch": 1.1170731707317074,
1852
- "grad_norm": 0.07594022899866104,
1853
- "learning_rate": 3.4159292035398233e-05,
1854
- "loss": 0.0201,
1855
- "step": 2290
1856
- },
1857
- {
1858
- "epoch": 1.1219512195121952,
1859
- "grad_norm": 0.06026766449213028,
1860
- "learning_rate": 3.407079646017699e-05,
1861
- "loss": 0.019,
1862
- "step": 2300
1863
- },
1864
- {
1865
- "epoch": 1.1219512195121952,
1866
- "eval_f1": 0.6488888888888888,
1867
- "eval_loss": 0.020637808367609978,
1868
- "eval_precision": 0.569448341657781,
1869
- "eval_recall": 0.7540873959571938,
1870
- "eval_runtime": 132.2303,
1871
- "eval_samples_per_second": 62.013,
1872
- "eval_steps_per_second": 0.976,
1873
- "step": 2300
1874
- },
1875
- {
1876
- "epoch": 1.126829268292683,
1877
- "grad_norm": 0.08499179780483246,
1878
- "learning_rate": 3.398230088495575e-05,
1879
- "loss": 0.0197,
1880
- "step": 2310
1881
- },
1882
- {
1883
- "epoch": 1.1317073170731708,
1884
- "grad_norm": 0.09881128370761871,
1885
- "learning_rate": 3.3893805309734515e-05,
1886
- "loss": 0.0194,
1887
- "step": 2320
1888
- },
1889
- {
1890
- "epoch": 1.1365853658536587,
1891
- "grad_norm": 0.06719642132520676,
1892
- "learning_rate": 3.380530973451327e-05,
1893
- "loss": 0.0202,
1894
- "step": 2330
1895
- },
1896
- {
1897
- "epoch": 1.1414634146341462,
1898
- "grad_norm": 0.10720915347337723,
1899
- "learning_rate": 3.3716814159292034e-05,
1900
- "loss": 0.0165,
1901
- "step": 2340
1902
- },
1903
- {
1904
- "epoch": 1.146341463414634,
1905
- "grad_norm": 0.06894739717245102,
1906
- "learning_rate": 3.3628318584070804e-05,
1907
- "loss": 0.0164,
1908
- "step": 2350
1909
- },
1910
- {
1911
- "epoch": 1.1512195121951219,
1912
- "grad_norm": 0.0639248788356781,
1913
- "learning_rate": 3.353982300884956e-05,
1914
- "loss": 0.0217,
1915
- "step": 2360
1916
- },
1917
- {
1918
- "epoch": 1.1560975609756097,
1919
- "grad_norm": 0.044793836772441864,
1920
- "learning_rate": 3.345132743362832e-05,
1921
- "loss": 0.0197,
1922
- "step": 2370
1923
- },
1924
- {
1925
- "epoch": 1.1609756097560975,
1926
- "grad_norm": 0.0624634325504303,
1927
- "learning_rate": 3.336283185840708e-05,
1928
- "loss": 0.0202,
1929
- "step": 2380
1930
- },
1931
- {
1932
- "epoch": 1.1658536585365853,
1933
- "grad_norm": 0.12909162044525146,
1934
- "learning_rate": 3.327433628318584e-05,
1935
- "loss": 0.0195,
1936
- "step": 2390
1937
- },
1938
- {
1939
- "epoch": 1.170731707317073,
1940
- "grad_norm": 0.0766359269618988,
1941
- "learning_rate": 3.3185840707964604e-05,
1942
- "loss": 0.0208,
1943
- "step": 2400
1944
- },
1945
- {
1946
- "epoch": 1.170731707317073,
1947
- "eval_f1": 0.6382413782457007,
1948
- "eval_loss": 0.02069213055074215,
1949
- "eval_precision": 0.5459251043152168,
1950
- "eval_recall": 0.7681331747919143,
1951
- "eval_runtime": 131.9061,
1952
- "eval_samples_per_second": 62.165,
1953
- "eval_steps_per_second": 0.978,
1954
- "step": 2400
1955
- },
1956
- {
1957
- "epoch": 1.175609756097561,
1958
- "grad_norm": 0.1002466231584549,
1959
- "learning_rate": 3.309734513274336e-05,
1960
- "loss": 0.0198,
1961
- "step": 2410
1962
- },
1963
- {
1964
- "epoch": 1.1804878048780487,
1965
- "grad_norm": 0.09123210608959198,
1966
- "learning_rate": 3.300884955752212e-05,
1967
- "loss": 0.0167,
1968
- "step": 2420
1969
- },
1970
- {
1971
- "epoch": 1.1853658536585365,
1972
- "grad_norm": 0.08641325682401657,
1973
- "learning_rate": 3.2920353982300886e-05,
1974
- "loss": 0.0225,
1975
- "step": 2430
1976
- },
1977
- {
1978
- "epoch": 1.1902439024390243,
1979
- "grad_norm": 0.06566398590803146,
1980
- "learning_rate": 3.283185840707965e-05,
1981
- "loss": 0.0177,
1982
- "step": 2440
1983
- },
1984
- {
1985
- "epoch": 1.1951219512195121,
1986
- "grad_norm": 0.06867921352386475,
1987
- "learning_rate": 3.274336283185841e-05,
1988
- "loss": 0.02,
1989
- "step": 2450
1990
- },
1991
- {
1992
- "epoch": 1.2,
1993
- "grad_norm": 0.0619225949048996,
1994
- "learning_rate": 3.265486725663717e-05,
1995
- "loss": 0.0203,
1996
- "step": 2460
1997
- },
1998
- {
1999
- "epoch": 1.2048780487804878,
2000
- "grad_norm": 0.07883109152317047,
2001
- "learning_rate": 3.256637168141593e-05,
2002
- "loss": 0.0199,
2003
- "step": 2470
2004
- },
2005
- {
2006
- "epoch": 1.2097560975609756,
2007
- "grad_norm": 0.09298081696033478,
2008
- "learning_rate": 3.247787610619469e-05,
2009
- "loss": 0.0191,
2010
- "step": 2480
2011
- },
2012
- {
2013
- "epoch": 1.2146341463414634,
2014
- "grad_norm": 0.06301239132881165,
2015
- "learning_rate": 3.238938053097345e-05,
2016
- "loss": 0.0212,
2017
- "step": 2490
2018
- },
2019
- {
2020
- "epoch": 1.2195121951219512,
2021
- "grad_norm": 0.06936347484588623,
2022
- "learning_rate": 3.230088495575221e-05,
2023
- "loss": 0.0203,
2024
- "step": 2500
2025
- },
2026
- {
2027
- "epoch": 1.2195121951219512,
2028
- "eval_f1": 0.6559016016048936,
2029
- "eval_loss": 0.020873118191957474,
2030
- "eval_precision": 0.5882838770574007,
2031
- "eval_recall": 0.7410820451843044,
2032
- "eval_runtime": 132.0933,
2033
- "eval_samples_per_second": 62.077,
2034
- "eval_steps_per_second": 0.977,
2035
- "step": 2500
2036
- }
2037
- ],
2038
- "logging_steps": 10,
2039
- "max_steps": 6150,
2040
- "num_input_tokens_seen": 0,
2041
- "num_train_epochs": 3,
2042
- "save_steps": 100,
2043
- "stateful_callbacks": {
2044
- "TrainerControl": {
2045
- "args": {
2046
- "should_epoch_stop": false,
2047
- "should_evaluate": false,
2048
- "should_log": false,
2049
- "should_save": true,
2050
- "should_training_stop": false
2051
- },
2052
- "attributes": {}
2053
- }
2054
- },
2055
- "total_flos": 5311547228160000.0,
2056
- "train_batch_size": 16,
2057
- "trial_name": null,
2058
- "trial_params": null
2059
- }