LegrandFrederic commited on
Commit
6cd16a6
·
verified ·
1 Parent(s): 8b85d3c

Upload trainer_state.json with huggingface_hub

Browse files
Files changed (1) hide show
  1. trainer_state.json +792 -0
trainer_state.json ADDED
@@ -0,0 +1,792 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 10.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1070,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.09345794392523364,
14
+ "grad_norm": 2.701171636581421,
15
+ "learning_rate": 1.6666666666666667e-05,
16
+ "loss": 0.7223,
17
+ "step": 10
18
+ },
19
+ {
20
+ "epoch": 0.18691588785046728,
21
+ "grad_norm": 1.4309473037719727,
22
+ "learning_rate": 3.518518518518519e-05,
23
+ "loss": 0.313,
24
+ "step": 20
25
+ },
26
+ {
27
+ "epoch": 0.2803738317757009,
28
+ "grad_norm": 1.99528968334198,
29
+ "learning_rate": 5.370370370370371e-05,
30
+ "loss": 0.2547,
31
+ "step": 30
32
+ },
33
+ {
34
+ "epoch": 0.37383177570093457,
35
+ "grad_norm": 1.5343451499938965,
36
+ "learning_rate": 7.222222222222222e-05,
37
+ "loss": 0.2301,
38
+ "step": 40
39
+ },
40
+ {
41
+ "epoch": 0.4672897196261682,
42
+ "grad_norm": 1.0875062942504883,
43
+ "learning_rate": 9.074074074074075e-05,
44
+ "loss": 0.1949,
45
+ "step": 50
46
+ },
47
+ {
48
+ "epoch": 0.5607476635514018,
49
+ "grad_norm": 1.1911485195159912,
50
+ "learning_rate": 9.999402437003975e-05,
51
+ "loss": 0.1649,
52
+ "step": 60
53
+ },
54
+ {
55
+ "epoch": 0.6542056074766355,
56
+ "grad_norm": 0.9195408821105957,
57
+ "learning_rate": 9.99462278999732e-05,
58
+ "loss": 0.1468,
59
+ "step": 70
60
+ },
61
+ {
62
+ "epoch": 0.7476635514018691,
63
+ "grad_norm": 1.2133328914642334,
64
+ "learning_rate": 9.985068065535225e-05,
65
+ "loss": 0.1285,
66
+ "step": 80
67
+ },
68
+ {
69
+ "epoch": 0.8411214953271028,
70
+ "grad_norm": 0.8248424530029297,
71
+ "learning_rate": 9.970747398351445e-05,
72
+ "loss": 0.109,
73
+ "step": 90
74
+ },
75
+ {
76
+ "epoch": 0.9345794392523364,
77
+ "grad_norm": 0.8372947573661804,
78
+ "learning_rate": 9.951674479629056e-05,
79
+ "loss": 0.1036,
80
+ "step": 100
81
+ },
82
+ {
83
+ "epoch": 1.02803738317757,
84
+ "grad_norm": 1.1082693338394165,
85
+ "learning_rate": 9.927867543911091e-05,
86
+ "loss": 0.0895,
87
+ "step": 110
88
+ },
89
+ {
90
+ "epoch": 1.1214953271028036,
91
+ "grad_norm": 0.9761236310005188,
92
+ "learning_rate": 9.899349351667522e-05,
93
+ "loss": 0.0844,
94
+ "step": 120
95
+ },
96
+ {
97
+ "epoch": 1.2149532710280373,
98
+ "grad_norm": 0.6534666419029236,
99
+ "learning_rate": 9.866147167535254e-05,
100
+ "loss": 0.0749,
101
+ "step": 130
102
+ },
103
+ {
104
+ "epoch": 1.308411214953271,
105
+ "grad_norm": 0.946822464466095,
106
+ "learning_rate": 9.828292734251944e-05,
107
+ "loss": 0.0713,
108
+ "step": 140
109
+ },
110
+ {
111
+ "epoch": 1.4018691588785046,
112
+ "grad_norm": 0.8314803838729858,
113
+ "learning_rate": 9.785822242308562e-05,
114
+ "loss": 0.0674,
115
+ "step": 150
116
+ },
117
+ {
118
+ "epoch": 1.4953271028037383,
119
+ "grad_norm": 0.6225787997245789,
120
+ "learning_rate": 9.738776295349687e-05,
121
+ "loss": 0.0654,
122
+ "step": 160
123
+ },
124
+ {
125
+ "epoch": 1.588785046728972,
126
+ "grad_norm": 0.6679856777191162,
127
+ "learning_rate": 9.687199871354669e-05,
128
+ "loss": 0.0666,
129
+ "step": 170
130
+ },
131
+ {
132
+ "epoch": 1.6822429906542056,
133
+ "grad_norm": 0.4407249391078949,
134
+ "learning_rate": 9.631142279636706e-05,
135
+ "loss": 0.056,
136
+ "step": 180
137
+ },
138
+ {
139
+ "epoch": 1.7757009345794392,
140
+ "grad_norm": 0.5206877589225769,
141
+ "learning_rate": 9.570657113700985e-05,
142
+ "loss": 0.0595,
143
+ "step": 190
144
+ },
145
+ {
146
+ "epoch": 1.8691588785046729,
147
+ "grad_norm": 0.7172874212265015,
148
+ "learning_rate": 9.50580220000696e-05,
149
+ "loss": 0.0611,
150
+ "step": 200
151
+ },
152
+ {
153
+ "epoch": 1.9626168224299065,
154
+ "grad_norm": 0.5936960577964783,
155
+ "learning_rate": 9.436639542683727e-05,
156
+ "loss": 0.0538,
157
+ "step": 210
158
+ },
159
+ {
160
+ "epoch": 2.05607476635514,
161
+ "grad_norm": 0.4505106806755066,
162
+ "learning_rate": 9.363235264251369e-05,
163
+ "loss": 0.0586,
164
+ "step": 220
165
+ },
166
+ {
167
+ "epoch": 2.149532710280374,
168
+ "grad_norm": 0.45083609223365784,
169
+ "learning_rate": 9.285659542404941e-05,
170
+ "loss": 0.0571,
171
+ "step": 230
172
+ },
173
+ {
174
+ "epoch": 2.2429906542056073,
175
+ "grad_norm": 0.5857895612716675,
176
+ "learning_rate": 9.203986542921532e-05,
177
+ "loss": 0.0496,
178
+ "step": 240
179
+ },
180
+ {
181
+ "epoch": 2.336448598130841,
182
+ "grad_norm": 0.600140392780304,
183
+ "learning_rate": 9.11829434875454e-05,
184
+ "loss": 0.056,
185
+ "step": 250
186
+ },
187
+ {
188
+ "epoch": 2.4299065420560746,
189
+ "grad_norm": 0.46349039673805237,
190
+ "learning_rate": 9.02866488538296e-05,
191
+ "loss": 0.0482,
192
+ "step": 260
193
+ },
194
+ {
195
+ "epoch": 2.5233644859813085,
196
+ "grad_norm": 0.597632884979248,
197
+ "learning_rate": 8.93518384248705e-05,
198
+ "loss": 0.0498,
199
+ "step": 270
200
+ },
201
+ {
202
+ "epoch": 2.616822429906542,
203
+ "grad_norm": 0.6381852030754089,
204
+ "learning_rate": 8.837940592025257e-05,
205
+ "loss": 0.0521,
206
+ "step": 280
207
+ },
208
+ {
209
+ "epoch": 2.710280373831776,
210
+ "grad_norm": 0.5228458642959595,
211
+ "learning_rate": 8.737028102790723e-05,
212
+ "loss": 0.0523,
213
+ "step": 290
214
+ },
215
+ {
216
+ "epoch": 2.803738317757009,
217
+ "grad_norm": 0.6592262983322144,
218
+ "learning_rate": 8.632542851529051e-05,
219
+ "loss": 0.0481,
220
+ "step": 300
221
+ },
222
+ {
223
+ "epoch": 2.897196261682243,
224
+ "grad_norm": 0.4157472848892212,
225
+ "learning_rate": 8.524584730702339e-05,
226
+ "loss": 0.0459,
227
+ "step": 310
228
+ },
229
+ {
230
+ "epoch": 2.9906542056074765,
231
+ "grad_norm": 0.2829182744026184,
232
+ "learning_rate": 8.413256952987611e-05,
233
+ "loss": 0.0479,
234
+ "step": 320
235
+ },
236
+ {
237
+ "epoch": 3.0841121495327104,
238
+ "grad_norm": 0.6042788624763489,
239
+ "learning_rate": 8.298665952600999e-05,
240
+ "loss": 0.0401,
241
+ "step": 330
242
+ },
243
+ {
244
+ "epoch": 3.177570093457944,
245
+ "grad_norm": 0.31885308027267456,
246
+ "learning_rate": 8.180921283541986e-05,
247
+ "loss": 0.0441,
248
+ "step": 340
249
+ },
250
+ {
251
+ "epoch": 3.2710280373831777,
252
+ "grad_norm": 0.3092094659805298,
253
+ "learning_rate": 8.060135514854994e-05,
254
+ "loss": 0.041,
255
+ "step": 350
256
+ },
257
+ {
258
+ "epoch": 3.364485981308411,
259
+ "grad_norm": 0.39221957325935364,
260
+ "learning_rate": 7.936424123008464e-05,
261
+ "loss": 0.0378,
262
+ "step": 360
263
+ },
264
+ {
265
+ "epoch": 3.457943925233645,
266
+ "grad_norm": 0.5164565443992615,
267
+ "learning_rate": 7.809905381494316e-05,
268
+ "loss": 0.0436,
269
+ "step": 370
270
+ },
271
+ {
272
+ "epoch": 3.5514018691588785,
273
+ "grad_norm": 0.4761320650577545,
274
+ "learning_rate": 7.68070024775332e-05,
275
+ "loss": 0.0393,
276
+ "step": 380
277
+ },
278
+ {
279
+ "epoch": 3.6448598130841123,
280
+ "grad_norm": 0.24083180725574493,
281
+ "learning_rate": 7.548932247534506e-05,
282
+ "loss": 0.0432,
283
+ "step": 390
284
+ },
285
+ {
286
+ "epoch": 3.7383177570093458,
287
+ "grad_norm": 0.32576093077659607,
288
+ "learning_rate": 7.414727356799154e-05,
289
+ "loss": 0.04,
290
+ "step": 400
291
+ },
292
+ {
293
+ "epoch": 3.831775700934579,
294
+ "grad_norm": 0.4475277066230774,
295
+ "learning_rate": 7.27821388128227e-05,
296
+ "loss": 0.0423,
297
+ "step": 410
298
+ },
299
+ {
300
+ "epoch": 3.925233644859813,
301
+ "grad_norm": 0.47092771530151367,
302
+ "learning_rate": 7.139522333826707e-05,
303
+ "loss": 0.0408,
304
+ "step": 420
305
+ },
306
+ {
307
+ "epoch": 4.018691588785047,
308
+ "grad_norm": 0.37364593148231506,
309
+ "learning_rate": 6.99878530960719e-05,
310
+ "loss": 0.0361,
311
+ "step": 430
312
+ },
313
+ {
314
+ "epoch": 4.11214953271028,
315
+ "grad_norm": 0.3749428391456604,
316
+ "learning_rate": 6.856137359363533e-05,
317
+ "loss": 0.0347,
318
+ "step": 440
319
+ },
320
+ {
321
+ "epoch": 4.205607476635514,
322
+ "grad_norm": 0.28443631529808044,
323
+ "learning_rate": 6.711714860764266e-05,
324
+ "loss": 0.0378,
325
+ "step": 450
326
+ },
327
+ {
328
+ "epoch": 4.299065420560748,
329
+ "grad_norm": 0.31837236881256104,
330
+ "learning_rate": 6.565655888023618e-05,
331
+ "loss": 0.0363,
332
+ "step": 460
333
+ },
334
+ {
335
+ "epoch": 4.392523364485982,
336
+ "grad_norm": 0.3948926329612732,
337
+ "learning_rate": 6.418100079896556e-05,
338
+ "loss": 0.0388,
339
+ "step": 470
340
+ },
341
+ {
342
+ "epoch": 4.485981308411215,
343
+ "grad_norm": 0.5089420080184937,
344
+ "learning_rate": 6.269188506178019e-05,
345
+ "loss": 0.0377,
346
+ "step": 480
347
+ },
348
+ {
349
+ "epoch": 4.579439252336448,
350
+ "grad_norm": 0.266495943069458,
351
+ "learning_rate": 6.11906353283405e-05,
352
+ "loss": 0.0367,
353
+ "step": 490
354
+ },
355
+ {
356
+ "epoch": 4.672897196261682,
357
+ "grad_norm": 0.33666715025901794,
358
+ "learning_rate": 5.967868685893715e-05,
359
+ "loss": 0.0372,
360
+ "step": 500
361
+ },
362
+ {
363
+ "epoch": 4.766355140186916,
364
+ "grad_norm": 0.37420037388801575,
365
+ "learning_rate": 5.815748514231944e-05,
366
+ "loss": 0.032,
367
+ "step": 510
368
+ },
369
+ {
370
+ "epoch": 4.859813084112149,
371
+ "grad_norm": 0.33954936265945435,
372
+ "learning_rate": 5.6628484513745e-05,
373
+ "loss": 0.0299,
374
+ "step": 520
375
+ },
376
+ {
377
+ "epoch": 4.953271028037383,
378
+ "grad_norm": 0.2828126847743988,
379
+ "learning_rate": 5.5093146764571866e-05,
380
+ "loss": 0.0343,
381
+ "step": 530
382
+ },
383
+ {
384
+ "epoch": 5.046728971962617,
385
+ "grad_norm": 0.48174425959587097,
386
+ "learning_rate": 5.355293974472197e-05,
387
+ "loss": 0.0326,
388
+ "step": 540
389
+ },
390
+ {
391
+ "epoch": 5.140186915887851,
392
+ "grad_norm": 0.42725300788879395,
393
+ "learning_rate": 5.2009335959352666e-05,
394
+ "loss": 0.0338,
395
+ "step": 550
396
+ },
397
+ {
398
+ "epoch": 5.233644859813084,
399
+ "grad_norm": 0.34893345832824707,
400
+ "learning_rate": 5.046381116107742e-05,
401
+ "loss": 0.0322,
402
+ "step": 560
403
+ },
404
+ {
405
+ "epoch": 5.327102803738318,
406
+ "grad_norm": 0.3411373198032379,
407
+ "learning_rate": 4.891784293908192e-05,
408
+ "loss": 0.0312,
409
+ "step": 570
410
+ },
411
+ {
412
+ "epoch": 5.420560747663552,
413
+ "grad_norm": 0.35624727606773376,
414
+ "learning_rate": 4.7372909306484276e-05,
415
+ "loss": 0.0353,
416
+ "step": 580
417
+ },
418
+ {
419
+ "epoch": 5.5140186915887845,
420
+ "grad_norm": 0.3242332935333252,
421
+ "learning_rate": 4.5830487287289966e-05,
422
+ "loss": 0.0288,
423
+ "step": 590
424
+ },
425
+ {
426
+ "epoch": 5.607476635514018,
427
+ "grad_norm": 0.4070276618003845,
428
+ "learning_rate": 4.429205150429241e-05,
429
+ "loss": 0.0322,
430
+ "step": 600
431
+ },
432
+ {
433
+ "epoch": 5.700934579439252,
434
+ "grad_norm": 0.2987990975379944,
435
+ "learning_rate": 4.275907276926918e-05,
436
+ "loss": 0.0293,
437
+ "step": 610
438
+ },
439
+ {
440
+ "epoch": 5.794392523364486,
441
+ "grad_norm": 0.36411428451538086,
442
+ "learning_rate": 4.123301667682171e-05,
443
+ "loss": 0.0296,
444
+ "step": 620
445
+ },
446
+ {
447
+ "epoch": 5.88785046728972,
448
+ "grad_norm": 0.30948546528816223,
449
+ "learning_rate": 3.971534220320291e-05,
450
+ "loss": 0.0274,
451
+ "step": 630
452
+ },
453
+ {
454
+ "epoch": 5.981308411214953,
455
+ "grad_norm": 0.4233947694301605,
456
+ "learning_rate": 3.820750031147211e-05,
457
+ "loss": 0.0316,
458
+ "step": 640
459
+ },
460
+ {
461
+ "epoch": 6.074766355140187,
462
+ "grad_norm": 0.47963348031044006,
463
+ "learning_rate": 3.67109325643111e-05,
464
+ "loss": 0.029,
465
+ "step": 650
466
+ },
467
+ {
468
+ "epoch": 6.168224299065421,
469
+ "grad_norm": 0.33454999327659607,
470
+ "learning_rate": 3.522706974582717e-05,
471
+ "loss": 0.0301,
472
+ "step": 660
473
+ },
474
+ {
475
+ "epoch": 6.261682242990654,
476
+ "grad_norm": 0.22388812899589539,
477
+ "learning_rate": 3.375733049366115e-05,
478
+ "loss": 0.0273,
479
+ "step": 670
480
+ },
481
+ {
482
+ "epoch": 6.355140186915888,
483
+ "grad_norm": 0.3955516219139099,
484
+ "learning_rate": 3.2303119942707796e-05,
485
+ "loss": 0.0259,
486
+ "step": 680
487
+ },
488
+ {
489
+ "epoch": 6.4485981308411215,
490
+ "grad_norm": 0.33848804235458374,
491
+ "learning_rate": 3.086582838174551e-05,
492
+ "loss": 0.029,
493
+ "step": 690
494
+ },
495
+ {
496
+ "epoch": 6.542056074766355,
497
+ "grad_norm": 0.6161491274833679,
498
+ "learning_rate": 2.944682992425959e-05,
499
+ "loss": 0.0273,
500
+ "step": 700
501
+ },
502
+ {
503
+ "epoch": 6.635514018691588,
504
+ "grad_norm": 0.3765656352043152,
505
+ "learning_rate": 2.804748119472969e-05,
506
+ "loss": 0.027,
507
+ "step": 710
508
+ },
509
+ {
510
+ "epoch": 6.728971962616822,
511
+ "grad_norm": 0.22724078595638275,
512
+ "learning_rate": 2.6669120031637663e-05,
513
+ "loss": 0.0259,
514
+ "step": 720
515
+ },
516
+ {
517
+ "epoch": 6.822429906542056,
518
+ "grad_norm": 0.39581674337387085,
519
+ "learning_rate": 2.5313064208435423e-05,
520
+ "loss": 0.0282,
521
+ "step": 730
522
+ },
523
+ {
524
+ "epoch": 6.91588785046729,
525
+ "grad_norm": 0.4597732126712799,
526
+ "learning_rate": 2.3980610173696255e-05,
527
+ "loss": 0.0269,
528
+ "step": 740
529
+ },
530
+ {
531
+ "epoch": 7.009345794392523,
532
+ "grad_norm": 0.39131224155426025,
533
+ "learning_rate": 2.2673031811653034e-05,
534
+ "loss": 0.0309,
535
+ "step": 750
536
+ },
537
+ {
538
+ "epoch": 7.102803738317757,
539
+ "grad_norm": 0.33914846181869507,
540
+ "learning_rate": 2.139157922430956e-05,
541
+ "loss": 0.0249,
542
+ "step": 760
543
+ },
544
+ {
545
+ "epoch": 7.196261682242991,
546
+ "grad_norm": 0.40823277831077576,
547
+ "learning_rate": 2.01374775362883e-05,
548
+ "loss": 0.026,
549
+ "step": 770
550
+ },
551
+ {
552
+ "epoch": 7.289719626168225,
553
+ "grad_norm": 0.20548699796199799,
554
+ "learning_rate": 1.8911925723557806e-05,
555
+ "loss": 0.0214,
556
+ "step": 780
557
+ },
558
+ {
559
+ "epoch": 7.383177570093458,
560
+ "grad_norm": 0.28237590193748474,
561
+ "learning_rate": 1.7716095467159393e-05,
562
+ "loss": 0.0262,
563
+ "step": 790
564
+ },
565
+ {
566
+ "epoch": 7.4766355140186915,
567
+ "grad_norm": 0.29693859815597534,
568
+ "learning_rate": 1.6551130033028827e-05,
569
+ "loss": 0.0233,
570
+ "step": 800
571
+ },
572
+ {
573
+ "epoch": 7.570093457943925,
574
+ "grad_norm": 0.3963673710823059,
575
+ "learning_rate": 1.541814317898425e-05,
576
+ "loss": 0.0247,
577
+ "step": 810
578
+ },
579
+ {
580
+ "epoch": 7.663551401869158,
581
+ "grad_norm": 0.22003985941410065,
582
+ "learning_rate": 1.4318218089924962e-05,
583
+ "loss": 0.0249,
584
+ "step": 820
585
+ },
586
+ {
587
+ "epoch": 7.757009345794392,
588
+ "grad_norm": 0.2355249971151352,
589
+ "learning_rate": 1.3252406342259527e-05,
590
+ "loss": 0.0245,
591
+ "step": 830
592
+ },
593
+ {
594
+ "epoch": 7.850467289719626,
595
+ "grad_norm": 0.45212939381599426,
596
+ "learning_rate": 1.2221726898552665e-05,
597
+ "loss": 0.0228,
598
+ "step": 840
599
+ },
600
+ {
601
+ "epoch": 7.94392523364486,
602
+ "grad_norm": 0.4631796181201935,
603
+ "learning_rate": 1.122716513335262e-05,
604
+ "loss": 0.0215,
605
+ "step": 850
606
+ },
607
+ {
608
+ "epoch": 8.037383177570094,
609
+ "grad_norm": 0.24740639328956604,
610
+ "learning_rate": 1.0269671891130123e-05,
611
+ "loss": 0.0267,
612
+ "step": 860
613
+ },
614
+ {
615
+ "epoch": 8.130841121495328,
616
+ "grad_norm": 0.21961306035518646,
617
+ "learning_rate": 9.350162577229432e-06,
618
+ "loss": 0.0222,
619
+ "step": 870
620
+ },
621
+ {
622
+ "epoch": 8.22429906542056,
623
+ "grad_norm": 0.2656007409095764,
624
+ "learning_rate": 8.46951628270098e-06,
625
+ "loss": 0.0234,
626
+ "step": 880
627
+ },
628
+ {
629
+ "epoch": 8.317757009345794,
630
+ "grad_norm": 0.1633329689502716,
631
+ "learning_rate": 7.628574943851852e-06,
632
+ "loss": 0.0189,
633
+ "step": 890
634
+ },
635
+ {
636
+ "epoch": 8.411214953271028,
637
+ "grad_norm": 0.15491314232349396,
638
+ "learning_rate": 6.82814253731801e-06,
639
+ "loss": 0.0199,
640
+ "step": 900
641
+ },
642
+ {
643
+ "epoch": 8.504672897196262,
644
+ "grad_norm": 0.464375376701355,
645
+ "learning_rate": 6.06898431142745e-06,
646
+ "loss": 0.0216,
647
+ "step": 910
648
+ },
649
+ {
650
+ "epoch": 8.598130841121495,
651
+ "grad_norm": 0.21681295335292816,
652
+ "learning_rate": 5.351826054589393e-06,
653
+ "loss": 0.0203,
654
+ "step": 920
655
+ },
656
+ {
657
+ "epoch": 8.69158878504673,
658
+ "grad_norm": 0.18899723887443542,
659
+ "learning_rate": 4.677353401408974e-06,
660
+ "loss": 0.0184,
661
+ "step": 930
662
+ },
663
+ {
664
+ "epoch": 8.785046728971963,
665
+ "grad_norm": 0.28756183385849,
666
+ "learning_rate": 4.04621117719049e-06,
667
+ "loss": 0.0257,
668
+ "step": 940
669
+ },
670
+ {
671
+ "epoch": 8.878504672897197,
672
+ "grad_norm": 0.27610504627227783,
673
+ "learning_rate": 3.459002781456344e-06,
674
+ "loss": 0.0208,
675
+ "step": 950
676
+ },
677
+ {
678
+ "epoch": 8.97196261682243,
679
+ "grad_norm": 0.21105413138866425,
680
+ "learning_rate": 2.9162896110707163e-06,
681
+ "loss": 0.0219,
682
+ "step": 960
683
+ },
684
+ {
685
+ "epoch": 9.065420560747663,
686
+ "grad_norm": 0.23299843072891235,
687
+ "learning_rate": 2.418590523519687e-06,
688
+ "loss": 0.0217,
689
+ "step": 970
690
+ },
691
+ {
692
+ "epoch": 9.158878504672897,
693
+ "grad_norm": 0.3126954436302185,
694
+ "learning_rate": 1.9663813408607845e-06,
695
+ "loss": 0.0188,
696
+ "step": 980
697
+ },
698
+ {
699
+ "epoch": 9.25233644859813,
700
+ "grad_norm": 0.32796964049339294,
701
+ "learning_rate": 1.5600943948163527e-06,
702
+ "loss": 0.0227,
703
+ "step": 990
704
+ },
705
+ {
706
+ "epoch": 9.345794392523365,
707
+ "grad_norm": 0.2622697949409485,
708
+ "learning_rate": 1.2001181134455475e-06,
709
+ "loss": 0.0205,
710
+ "step": 1000
711
+ },
712
+ {
713
+ "epoch": 9.439252336448599,
714
+ "grad_norm": 0.23692955076694489,
715
+ "learning_rate": 8.867966497901282e-07,
716
+ "loss": 0.0216,
717
+ "step": 1010
718
+ },
719
+ {
720
+ "epoch": 9.532710280373832,
721
+ "grad_norm": 0.23177599906921387,
722
+ "learning_rate": 6.204295528491555e-07,
723
+ "loss": 0.0197,
724
+ "step": 1020
725
+ },
726
+ {
727
+ "epoch": 9.626168224299064,
728
+ "grad_norm": 0.18032234907150269,
729
+ "learning_rate": 4.012714811970464e-07,
730
+ "loss": 0.0223,
731
+ "step": 1030
732
+ },
733
+ {
734
+ "epoch": 9.719626168224298,
735
+ "grad_norm": 0.4851064682006836,
736
+ "learning_rate": 2.295319595188805e-07,
737
+ "loss": 0.0209,
738
+ "step": 1040
739
+ },
740
+ {
741
+ "epoch": 9.813084112149532,
742
+ "grad_norm": 0.2691369950771332,
743
+ "learning_rate": 1.0537517829562472e-07,
744
+ "loss": 0.0195,
745
+ "step": 1050
746
+ },
747
+ {
748
+ "epoch": 9.906542056074766,
749
+ "grad_norm": 0.19837787747383118,
750
+ "learning_rate": 2.8919836830887392e-08,
751
+ "loss": 0.0186,
752
+ "step": 1060
753
+ },
754
+ {
755
+ "epoch": 10.0,
756
+ "grad_norm": 0.2632281482219696,
757
+ "learning_rate": 2.3902976920009423e-10,
758
+ "loss": 0.0186,
759
+ "step": 1070
760
+ },
761
+ {
762
+ "epoch": 10.0,
763
+ "step": 1070,
764
+ "total_flos": 0.0,
765
+ "train_loss": 0.0546069160939377,
766
+ "train_runtime": 1371.0334,
767
+ "train_samples_per_second": 77.708,
768
+ "train_steps_per_second": 0.78
769
+ }
770
+ ],
771
+ "logging_steps": 10,
772
+ "max_steps": 1070,
773
+ "num_input_tokens_seen": 0,
774
+ "num_train_epochs": 10,
775
+ "save_steps": 10000,
776
+ "stateful_callbacks": {
777
+ "TrainerControl": {
778
+ "args": {
779
+ "should_epoch_stop": false,
780
+ "should_evaluate": false,
781
+ "should_log": false,
782
+ "should_save": true,
783
+ "should_training_stop": true
784
+ },
785
+ "attributes": {}
786
+ }
787
+ },
788
+ "total_flos": 0.0,
789
+ "train_batch_size": 100,
790
+ "trial_name": null,
791
+ "trial_params": null
792
+ }