xzuyn committed (verified)
Commit: ae95232
Parent: 9f6e902

Upload Step 80/9427

adapter_config.json CHANGED
@@ -6,7 +6,7 @@
   "eva_config": null,
   "exclude_modules": null,
   "fan_in_fan_out": null,
- "inference_mode": false,
+ "inference_mode": true,
   "init_lora_weights": true,
   "layer_replication": null,
   "layers_pattern": null,
adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d58e8b70cae140c27869d506ce1fd1f11fba30e54691e644d41e79743eee8cff
+ size 1907432232
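The adapter weights themselves live in Git LFS, so the diff only adds a pointer file carrying the blob's SHA-256 and byte size. A small self-contained check of a local download against that pointer, assuming the file was saved as `adapter_model.safetensors` (hypothetical path):

```python
# Verify a downloaded adapter_model.safetensors against the LFS pointer above.
# The local filename is an assumption; adjust it to wherever the file was saved.
import hashlib

EXPECTED_OID = "d58e8b70cae140c27869d506ce1fd1f11fba30e54691e644d41e79743eee8cff"
EXPECTED_SIZE = 1907432232  # bytes, roughly 1.9 GB

sha = hashlib.sha256()
size = 0
with open("adapter_model.safetensors", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):  # read in 1 MiB chunks
        sha.update(chunk)
        size += len(chunk)

assert size == EXPECTED_SIZE, f"size mismatch: {size} != {EXPECTED_SIZE}"
assert sha.hexdigest() == EXPECTED_OID, "sha256 mismatch"
print("download matches the git-lfs pointer")
```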
tokenizer_config.json CHANGED
@@ -51325,7 +51325,6 @@
   },
   "boi_token": "<start_of_image>",
   "bos_token": "<bos>",
- "chat_template": "{{- bos_token }}{% for message in messages %}{{ '<start_of_turn>' + ('model' if message['role'] == 'assistant' else message['role']) + '\n' + message['content'] | trim + '<end_of_turn>' }}{% if not loop.last %}{{ '\n' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '\n<start_of_turn>model\n' }}{% endif %}",
   "clean_up_tokenization_spaces": false,
   "eoi_token": "<end_of_image>",
   "eos_token": "<end_of_turn>",
trainer_state.json ADDED
@@ -0,0 +1,666 @@
+ {
+   "best_global_step": null,
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 0.008485812781755503,
+   "eval_steps": 10,
+   "global_step": 80,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {"epoch": 0.00010607265977194378, "grad_norm": 1.647079586982727, "learning_rate": 4.99994960800331e-07, "loss": 2.3252, "step": 1},
+     {"epoch": 0.00010607265977194378, "eval_loss": 2.388094663619995, "eval_runtime": 67.0533, "eval_samples_per_second": 1.909, "eval_steps_per_second": 0.477, "step": 1},
+     {"epoch": 0.00021214531954388756, "grad_norm": 1.7374595403671265, "learning_rate": 4.999899206382888e-07, "loss": 2.2091, "step": 2},
+     {"epoch": 0.00031821797931583137, "grad_norm": 1.4541484117507935, "learning_rate": 4.999848795135978e-07, "loss": 2.4903, "step": 3},
+     {"epoch": 0.0004242906390877751, "grad_norm": 1.3729519844055176, "learning_rate": 4.999798374259822e-07, "loss": 2.3478, "step": 4},
+     {"epoch": 0.0005303632988597189, "grad_norm": 2.059835433959961, "learning_rate": 4.999747943751658e-07, "loss": 2.2294, "step": 5},
+     {"epoch": 0.0006364359586316627, "grad_norm": 1.4303361177444458, "learning_rate": 4.999697503608729e-07, "loss": 2.3702, "step": 6},
+     {"epoch": 0.0007425086184036064, "grad_norm": 1.506787657737732, "learning_rate": 4.999647053828272e-07, "loss": 2.2794, "step": 7},
+     {"epoch": 0.0008485812781755502, "grad_norm": 1.1569323539733887, "learning_rate": 4.999596594407525e-07, "loss": 2.2048, "step": 8},
+     {"epoch": 0.0009546539379474941, "grad_norm": 1.0063825845718384, "learning_rate": 4.999546125343724e-07, "loss": 2.0392, "step": 9},
+     {"epoch": 0.0010607265977194379, "grad_norm": 1.2020900249481201, "learning_rate": 4.999495646634105e-07, "loss": 2.1371, "step": 10},
+     {"epoch": 0.0010607265977194379, "eval_loss": 1.957827091217041, "eval_runtime": 70.5284, "eval_samples_per_second": 1.815, "eval_steps_per_second": 0.454, "step": 10},
+     {"epoch": 0.0011667992574913816, "grad_norm": 0.9959272742271423, "learning_rate": 4.999445158275902e-07, "loss": 2.0418, "step": 11},
+     {"epoch": 0.0012728719172633255, "grad_norm": 0.8702138662338257, "learning_rate": 4.999394660266349e-07, "loss": 1.97, "step": 12},
+     {"epoch": 0.0013789445770352692, "grad_norm": 0.6772508025169373, "learning_rate": 4.999344152602678e-07, "loss": 1.6033, "step": 13},
+     {"epoch": 0.0014850172368072129, "grad_norm": 1.0128448009490967, "learning_rate": 4.99929363528212e-07, "loss": 2.1025, "step": 14},
+     {"epoch": 0.0015910898965791568, "grad_norm": 0.9335213899612427, "learning_rate": 4.999243108301906e-07, "loss": 1.9303, "step": 15},
+     {"epoch": 0.0016971625563511005, "grad_norm": 0.9497168660163879, "learning_rate": 4.999192571659265e-07, "loss": 1.8389, "step": 16},
+     {"epoch": 0.0018032352161230442, "grad_norm": 0.8370901346206665, "learning_rate": 4.999142025351424e-07, "loss": 1.8423, "step": 17},
+     {"epoch": 0.0019093078758949881, "grad_norm": 1.1051160097122192, "learning_rate": 4.999091469375611e-07, "loss": 1.9193, "step": 18},
+     {"epoch": 0.002015380535666932, "grad_norm": 1.2687193155288696, "learning_rate": 4.999040903729051e-07, "loss": 2.1474, "step": 19},
+     {"epoch": 0.0021214531954388757, "grad_norm": 0.9469927549362183, "learning_rate": 4.99899032840897e-07, "loss": 1.6744, "step": 20},
+     {"epoch": 0.0021214531954388757, "eval_loss": 1.7858046293258667, "eval_runtime": 68.1877, "eval_samples_per_second": 1.877, "eval_steps_per_second": 0.469, "step": 20},
+     {"epoch": 0.0022275258552108194, "grad_norm": 0.9844200015068054, "learning_rate": 4.998939743412591e-07, "loss": 1.9173, "step": 21},
+     {"epoch": 0.002333598514982763, "grad_norm": 1.1347674131393433, "learning_rate": 4.998889148737137e-07, "loss": 1.8158, "step": 22},
+     {"epoch": 0.002439671174754707, "grad_norm": 1.0652062892913818, "learning_rate": 4.99883854437983e-07, "loss": 1.9189, "step": 23},
+     {"epoch": 0.002545743834526651, "grad_norm": 1.247801661491394, "learning_rate": 4.998787930337891e-07, "loss": 1.6044, "step": 24},
+     {"epoch": 0.0026518164942985947, "grad_norm": 0.856572151184082, "learning_rate": 4.998737306608538e-07, "loss": 1.9368, "step": 25},
+     {"epoch": 0.0027578891540705384, "grad_norm": 0.8902915716171265, "learning_rate": 4.998686673188991e-07, "loss": 1.7691, "step": 26},
+     {"epoch": 0.002863961813842482, "grad_norm": 0.9449676871299744, "learning_rate": 4.998636030076468e-07, "loss": 1.7605, "step": 27},
+     {"epoch": 0.0029700344736144258, "grad_norm": 0.7392516136169434, "learning_rate": 4.998585377268183e-07, "loss": 1.7255, "step": 28},
+     {"epoch": 0.0030761071333863695, "grad_norm": 0.8038751482963562, "learning_rate": 4.998534714761353e-07, "loss": 1.7689, "step": 29},
+     {"epoch": 0.0031821797931583136, "grad_norm": 0.8447410464286804, "learning_rate": 4.998484042553191e-07, "loss": 2.0482, "step": 30},
+     {"epoch": 0.0031821797931583136, "eval_loss": 1.6817430257797241, "eval_runtime": 67.9443, "eval_samples_per_second": 1.884, "eval_steps_per_second": 0.471, "step": 30},
+     {"epoch": 0.0032882524529302573, "grad_norm": 0.8365817070007324, "learning_rate": 4.998433360640912e-07, "loss": 1.8075, "step": 31},
+     {"epoch": 0.003394325112702201, "grad_norm": 0.7525676488876343, "learning_rate": 4.998382669021727e-07, "loss": 1.6026, "step": 32},
+     {"epoch": 0.0035003977724741447, "grad_norm": 0.9553101658821106, "learning_rate": 4.998331967692847e-07, "loss": 1.743, "step": 33},
+     {"epoch": 0.0036064704322460884, "grad_norm": 0.966307520866394, "learning_rate": 4.998281256651483e-07, "loss": 1.7075, "step": 34},
+     {"epoch": 0.0037125430920180325, "grad_norm": 0.8701184988021851, "learning_rate": 4.998230535894843e-07, "loss": 1.6257, "step": 35},
+     {"epoch": 0.0038186157517899762, "grad_norm": 0.8450726866722107, "learning_rate": 4.998179805420135e-07, "loss": 1.8127, "step": 36},
+     {"epoch": 0.00392468841156192, "grad_norm": 0.8065881133079529, "learning_rate": 4.998129065224565e-07, "loss": 1.7574, "step": 37},
+     {"epoch": 0.004030761071333864, "grad_norm": 0.7674804925918579, "learning_rate": 4.99807831530534e-07, "loss": 1.6796, "step": 38},
+     {"epoch": 0.004136833731105807, "grad_norm": 0.8442147970199585, "learning_rate": 4.998027555659665e-07, "loss": 1.5551, "step": 39},
+     {"epoch": 0.0042429063908777515, "grad_norm": 0.7327367663383484, "learning_rate": 4.99797678628474e-07, "loss": 1.075, "step": 40},
+     {"epoch": 0.0042429063908777515, "eval_loss": 1.6100257635116577, "eval_runtime": 67.9157, "eval_samples_per_second": 1.885, "eval_steps_per_second": 0.471, "step": 40},
+     {"epoch": 0.004348979050649695, "grad_norm": 0.8727586269378662, "learning_rate": 4.997926007177772e-07, "loss": 1.6814, "step": 41},
+     {"epoch": 0.004455051710421639, "grad_norm": 1.0420920848846436, "learning_rate": 4.99787521833596e-07, "loss": 1.6025, "step": 42},
+     {"epoch": 0.004561124370193583, "grad_norm": 0.757056713104248, "learning_rate": 4.997824419756506e-07, "loss": 1.7756, "step": 43},
+     {"epoch": 0.004667197029965526, "grad_norm": 0.9350019693374634, "learning_rate": 4.997773611436606e-07, "loss": 1.6165, "step": 44},
+     {"epoch": 0.00477326968973747, "grad_norm": 0.7474361062049866, "learning_rate": 4.997722793373462e-07, "loss": 1.8263, "step": 45},
+     {"epoch": 0.004879342349509414, "grad_norm": 0.6356221437454224, "learning_rate": 4.997671965564268e-07, "loss": 1.7313, "step": 46},
+     {"epoch": 0.004985415009281358, "grad_norm": 0.7225522398948669, "learning_rate": 4.997621128006223e-07, "loss": 1.8336, "step": 47},
+     {"epoch": 0.005091487669053302, "grad_norm": 0.7901801466941833, "learning_rate": 4.997570280696519e-07, "loss": 1.5573, "step": 48},
+     {"epoch": 0.005197560328825245, "grad_norm": 0.7901318073272705, "learning_rate": 4.997519423632353e-07, "loss": 1.5356, "step": 49},
+     {"epoch": 0.005303632988597189, "grad_norm": 0.7905575633049011, "learning_rate": 4.997468556810914e-07, "loss": 1.5592, "step": 50},
+     {"epoch": 0.005303632988597189, "eval_loss": 1.565526008605957, "eval_runtime": 68.1883, "eval_samples_per_second": 1.877, "eval_steps_per_second": 0.469, "step": 50},
+     {"epoch": 0.005409705648369133, "grad_norm": 0.7927577495574951, "learning_rate": 4.997417680229397e-07, "loss": 1.8136, "step": 51},
+     {"epoch": 0.005515778308141077, "grad_norm": 0.6780912280082703, "learning_rate": 4.997366793884992e-07, "loss": 1.635, "step": 52},
+     {"epoch": 0.005621850967913021, "grad_norm": 0.6629224419593811, "learning_rate": 4.997315897774888e-07, "loss": 1.6814, "step": 53},
+     {"epoch": 0.005727923627684964, "grad_norm": 0.6568951606750488, "learning_rate": 4.997264991896272e-07, "loss": 1.7656, "step": 54},
+     {"epoch": 0.005833996287456908, "grad_norm": 0.9639095664024353, "learning_rate": 4.997214076246334e-07, "loss": 1.6255, "step": 55},
+     {"epoch": 0.0059400689472288515, "grad_norm": 1.7226521968841553, "learning_rate": 4.99716315082226e-07, "loss": 1.6617, "step": 56},
+     {"epoch": 0.006046141607000796, "grad_norm": 0.7537712454795837, "learning_rate": 4.997112215621234e-07, "loss": 1.7756, "step": 57},
+     {"epoch": 0.006152214266772739, "grad_norm": 0.5914387702941895, "learning_rate": 4.99706127064044e-07, "loss": 1.4576, "step": 58},
+     {"epoch": 0.006258286926544683, "grad_norm": 0.5612177848815918, "learning_rate": 4.997010315877063e-07, "loss": 1.6828, "step": 59},
+     {"epoch": 0.006364359586316627, "grad_norm": 0.6786366701126099, "learning_rate": 4.996959351328284e-07, "loss": 1.7288, "step": 60},
+     {"epoch": 0.006364359586316627, "eval_loss": 1.537359356880188, "eval_runtime": 67.7518, "eval_samples_per_second": 1.889, "eval_steps_per_second": 0.472, "step": 60},
+     {"epoch": 0.0064704322460885704, "grad_norm": 0.678156316280365, "learning_rate": 4.996908376991283e-07, "loss": 1.5268, "step": 61},
+     {"epoch": 0.006576504905860515, "grad_norm": 0.6791099905967712, "learning_rate": 4.99685739286324e-07, "loss": 1.5498, "step": 62},
+     {"epoch": 0.006682577565632458, "grad_norm": 0.702700674533844, "learning_rate": 4.996806398941335e-07, "loss": 1.6741, "step": 63},
+     {"epoch": 0.006788650225404402, "grad_norm": 0.8344963788986206, "learning_rate": 4.996755395222746e-07, "loss": 1.6074, "step": 64},
+     {"epoch": 0.006894722885176346, "grad_norm": 3.9447405338287354, "learning_rate": 4.996704381704648e-07, "loss": 1.5762, "step": 65},
+     {"epoch": 0.007000795544948289, "grad_norm": 0.870587170124054, "learning_rate": 4.996653358384218e-07, "loss": 1.6515, "step": 66},
+     {"epoch": 0.0071068682047202335, "grad_norm": 0.5930059552192688, "learning_rate": 4.996602325258629e-07, "loss": 1.7334, "step": 67},
+     {"epoch": 0.007212940864492177, "grad_norm": 1.2227728366851807, "learning_rate": 4.996551282325055e-07, "loss": 1.3723, "step": 68},
+     {"epoch": 0.007319013524264121, "grad_norm": 1.1175763607025146, "learning_rate": 4.996500229580668e-07, "loss": 1.4476, "step": 69},
+     {"epoch": 0.007425086184036065, "grad_norm": 0.591778039932251, "learning_rate": 4.99644916702264e-07, "loss": 1.2612, "step": 70},
+     {"epoch": 0.007425086184036065, "eval_loss": 1.5133857727050781, "eval_runtime": 68.2036, "eval_samples_per_second": 1.877, "eval_steps_per_second": 0.469, "step": 70},
+     {"epoch": 0.007531158843808008, "grad_norm": 0.62836754322052, "learning_rate": 4.99639809464814e-07, "loss": 1.4586, "step": 71},
+     {"epoch": 0.0076372315035799524, "grad_norm": 0.779591977596283, "learning_rate": 4.996347012454338e-07, "loss": 1.4937, "step": 72},
+     {"epoch": 0.007743304163351896, "grad_norm": 0.7092106342315674, "learning_rate": 4.9962959204384e-07, "loss": 1.74, "step": 73},
+     {"epoch": 0.00784937682312384, "grad_norm": 0.5990781188011169, "learning_rate": 4.996244818597496e-07, "loss": 1.3733, "step": 74},
+     {"epoch": 0.007955449482895784, "grad_norm": 0.7790846824645996, "learning_rate": 4.996193706928789e-07, "loss": 1.5198, "step": 75},
+     {"epoch": 0.008061522142667728, "grad_norm": 0.746094286441803, "learning_rate": 4.996142585429444e-07, "loss": 1.3825, "step": 76},
+     {"epoch": 0.00816759480243967, "grad_norm": 1.385066032409668, "learning_rate": 4.996091454096626e-07, "loss": 1.473, "step": 77},
+     {"epoch": 0.008273667462211615, "grad_norm": 0.6657389402389526, "learning_rate": 4.996040312927497e-07, "loss": 1.6375, "step": 78},
+     {"epoch": 0.008379740121983559, "grad_norm": 1.1722540855407715, "learning_rate": 4.995989161919216e-07, "loss": 1.5355, "step": 79},
+     {"epoch": 0.008485812781755503, "grad_norm": 0.7716183066368103, "learning_rate": 4.995938001068947e-07, "loss": 1.6588, "step": 80},
+     {"epoch": 0.008485812781755503, "eval_loss": 1.4981476068496704, "eval_runtime": 67.8094, "eval_samples_per_second": 1.888, "eval_steps_per_second": 0.472, "step": 80}
+   ],
+   "logging_steps": 1,
+   "max_steps": 9427,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 1,
+   "save_steps": 10,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": true,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 1.2900468007108608e+17,
+   "train_batch_size": 4,
+   "trial_name": null,
+   "trial_params": null
+ }
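The added trainer_state.json is the Hugging Face Trainer checkpoint state for this upload: 80 of 9427 planned steps, logging every step, with evaluation and saving every 10 steps; train loss falls from about 2.33 to 1.66 and eval loss from 2.39 to 1.50 over these 80 steps. A short sketch (assuming the file is available locally at a hypothetical path) that separates the train and eval records in `log_history`:

```python
# Summarise the training curve stored in trainer_state.json.
# "trainer_state.json" is assumed to sit in the current directory (hypothetical path).
import json

with open("trainer_state.json") as f:
    state = json.load(f)

train = [e for e in state["log_history"] if "loss" in e]        # per-step training logs
evals = [e for e in state["log_history"] if "eval_loss" in e]   # every eval_steps=10

print(f"global_step={state['global_step']} / max_steps={state['max_steps']}")
print(f"first/last train loss: {train[0]['loss']:.4f} -> {train[-1]['loss']:.4f}")
for e in evals:
    print(f"step {e['step']:>3}: eval_loss={e['eval_loss']:.4f}")
```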