agentlans commited on
Commit
08a1d7d
·
verified ·
1 Parent(s): 9175776

Upload 11 files

Browse files
all_results.json CHANGED
@@ -1,16 +1,16 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.9015177065767285,
4
- "eval_loss": 0.2585154175758362,
5
- "eval_runtime": 1.5991,
6
  "eval_samples": 2965,
7
- "eval_samples_per_second": 1854.212,
8
- "eval_steps_per_second": 232.011,
9
- "num_input_tokens_seen": 15175680,
10
- "total_flos": 1952467720519680.0,
11
- "train_loss": 0.1272777229185529,
12
- "train_runtime": 445.3252,
13
  "train_samples": 11856,
14
- "train_samples_per_second": 266.232,
15
- "train_steps_per_second": 33.279
16
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.92141652613828,
4
+ "eval_loss": 0.21923576295375824,
5
+ "eval_runtime": 1.7241,
6
  "eval_samples": 2965,
7
+ "eval_samples_per_second": 1719.735,
8
+ "eval_steps_per_second": 215.184,
9
+ "num_input_tokens_seen": 4552704,
10
+ "total_flos": 585740316155904.0,
11
+ "train_loss": 0.17994267544765705,
12
+ "train_runtime": 134.4715,
13
  "train_samples": 11856,
14
+ "train_samples_per_second": 264.502,
15
+ "train_steps_per_second": 33.063
16
  }
eval_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 10.0,
3
- "eval_accuracy": 0.9015177065767285,
4
- "eval_loss": 0.2585154175758362,
5
- "eval_runtime": 1.5991,
6
  "eval_samples": 2965,
7
- "eval_samples_per_second": 1854.212,
8
- "eval_steps_per_second": 232.011,
9
- "num_input_tokens_seen": 15175680
10
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "eval_accuracy": 0.92141652613828,
4
+ "eval_loss": 0.21923576295375824,
5
+ "eval_runtime": 1.7241,
6
  "eval_samples": 2965,
7
+ "eval_samples_per_second": 1719.735,
8
+ "eval_steps_per_second": 215.184,
9
+ "num_input_tokens_seen": 4552704
10
  }
model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:4058cab6a49e456434b0ed21f33d32bde158795e7e8c36347ffb94bd3c7c06dd
3
  size 470641664
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a6255fcbeca3c18b03cc8d754bfb51fd5a1e00252a433f6637be8e3d7d8199b8
3
  size 470641664
tokenizer_config.json CHANGED
@@ -47,17 +47,10 @@
47
  "eos_token": "</s>",
48
  "extra_special_tokens": {},
49
  "mask_token": "<mask>",
50
- "max_length": 512,
51
  "model_max_length": 512,
52
- "pad_to_multiple_of": null,
53
  "pad_token": "<pad>",
54
- "pad_token_type_id": 0,
55
- "padding_side": "right",
56
  "sep_token": "</s>",
57
  "sp_model_kwargs": {},
58
- "stride": 0,
59
  "tokenizer_class": "XLMRobertaTokenizer",
60
- "truncation_side": "right",
61
- "truncation_strategy": "longest_first",
62
  "unk_token": "<unk>"
63
  }
 
47
  "eos_token": "</s>",
48
  "extra_special_tokens": {},
49
  "mask_token": "<mask>",
 
50
  "model_max_length": 512,
 
51
  "pad_token": "<pad>",
 
 
52
  "sep_token": "</s>",
53
  "sp_model_kwargs": {},
 
54
  "tokenizer_class": "XLMRobertaTokenizer",
 
 
55
  "unk_token": "<unk>"
56
  }
train_results.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "epoch": 10.0,
3
- "num_input_tokens_seen": 15175680,
4
- "total_flos": 1952467720519680.0,
5
- "train_loss": 0.1272777229185529,
6
- "train_runtime": 445.3252,
7
  "train_samples": 11856,
8
- "train_samples_per_second": 266.232,
9
- "train_steps_per_second": 33.279
10
  }
 
1
  {
2
+ "epoch": 3.0,
3
+ "num_input_tokens_seen": 4552704,
4
+ "total_flos": 585740316155904.0,
5
+ "train_loss": 0.17994267544765705,
6
+ "train_runtime": 134.4715,
7
  "train_samples": 11856,
8
+ "train_samples_per_second": 264.502,
9
+ "train_steps_per_second": 33.063
10
  }
trainer_state.json CHANGED
@@ -1,419 +1,139 @@
1
  {
2
- "best_global_step": 1482,
3
- "best_metric": 0.2585154175758362,
4
- "best_model_checkpoint": "multilingual-e5-small-aligned-v2-pii-detector/checkpoint-1482",
5
- "epoch": 10.0,
6
  "eval_steps": 500,
7
- "global_step": 14820,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.33738191632928477,
14
- "grad_norm": 2.627490997314453,
15
- "learning_rate": 4.8316464237516873e-05,
16
- "loss": 0.3028,
17
  "num_input_tokens_seen": 512000,
18
  "step": 500,
19
- "train_runtime": 13.9553,
20
- "train_tokens_per_second": 36688.687
21
  },
22
  {
23
  "epoch": 0.6747638326585695,
24
- "grad_norm": 0.09764809906482697,
25
- "learning_rate": 4.662955465587045e-05,
26
- "loss": 0.238,
27
  "num_input_tokens_seen": 1024000,
28
  "step": 1000,
29
- "train_runtime": 27.6858,
30
- "train_tokens_per_second": 36986.469
31
  },
32
  {
33
  "epoch": 1.0,
34
- "eval_accuracy": 0.9015177065767285,
35
- "eval_loss": 0.2585154175758362,
36
- "eval_runtime": 1.648,
37
- "eval_samples_per_second": 1799.16,
38
- "eval_steps_per_second": 225.122,
39
  "num_input_tokens_seen": 1517568,
40
  "step": 1482
41
  },
42
  {
43
  "epoch": 1.0121457489878543,
44
- "grad_norm": 2.041290521621704,
45
- "learning_rate": 4.494264507422402e-05,
46
- "loss": 0.2022,
47
  "num_input_tokens_seen": 1536000,
48
  "step": 1500,
49
- "train_runtime": 43.9883,
50
- "train_tokens_per_second": 34918.384
51
  },
52
  {
53
  "epoch": 1.349527665317139,
54
- "grad_norm": 0.10939698666334152,
55
- "learning_rate": 4.32557354925776e-05,
56
- "loss": 0.1781,
57
  "num_input_tokens_seen": 2048000,
58
  "step": 2000,
59
- "train_runtime": 57.4658,
60
- "train_tokens_per_second": 35638.593
61
  },
62
  {
63
  "epoch": 1.686909581646424,
64
- "grad_norm": 5.614178657531738,
65
- "learning_rate": 4.156882591093118e-05,
66
- "loss": 0.1922,
67
  "num_input_tokens_seen": 2560000,
68
  "step": 2500,
69
- "train_runtime": 71.0513,
70
- "train_tokens_per_second": 36030.282
71
  },
72
  {
73
  "epoch": 2.0,
74
- "eval_accuracy": 0.9173693086003373,
75
- "eval_loss": 0.3090151250362396,
76
- "eval_runtime": 1.6796,
77
- "eval_samples_per_second": 1765.251,
78
- "eval_steps_per_second": 220.88,
79
  "num_input_tokens_seen": 3035136,
80
  "step": 2964
81
  },
82
  {
83
  "epoch": 2.0242914979757085,
84
- "grad_norm": 0.06700567156076431,
85
- "learning_rate": 3.9881916329284755e-05,
86
- "loss": 0.1585,
87
  "num_input_tokens_seen": 3072000,
88
  "step": 3000,
89
- "train_runtime": 87.6915,
90
- "train_tokens_per_second": 35031.905
91
  },
92
  {
93
  "epoch": 2.361673414304993,
94
- "grad_norm": 0.5916054844856262,
95
- "learning_rate": 3.8195006747638326e-05,
96
- "loss": 0.1549,
97
  "num_input_tokens_seen": 3584000,
98
  "step": 3500,
99
- "train_runtime": 101.7739,
100
- "train_tokens_per_second": 35215.309
101
  },
102
  {
103
  "epoch": 2.699055330634278,
104
- "grad_norm": 0.6893392205238342,
105
- "learning_rate": 3.6508097165991904e-05,
106
- "loss": 0.1517,
107
  "num_input_tokens_seen": 4096000,
108
  "step": 4000,
109
- "train_runtime": 116.1299,
110
- "train_tokens_per_second": 35270.842
111
  },
112
  {
113
  "epoch": 3.0,
114
- "eval_accuracy": 0.9079258010118044,
115
- "eval_loss": 0.28507259488105774,
116
- "eval_runtime": 1.6356,
117
- "eval_samples_per_second": 1812.841,
118
- "eval_steps_per_second": 226.834,
119
  "num_input_tokens_seen": 4552704,
120
  "step": 4446
121
  },
122
  {
123
- "epoch": 3.0364372469635628,
124
- "grad_norm": 6.807932376861572,
125
- "learning_rate": 3.482118758434548e-05,
126
- "loss": 0.156,
127
- "num_input_tokens_seen": 4608000,
128
- "step": 4500,
129
- "train_runtime": 133.056,
130
- "train_tokens_per_second": 34632.022
131
- },
132
- {
133
- "epoch": 3.3738191632928474,
134
- "grad_norm": 52.58000946044922,
135
- "learning_rate": 3.313427800269906e-05,
136
- "loss": 0.1233,
137
- "num_input_tokens_seen": 5120000,
138
- "step": 5000,
139
- "train_runtime": 147.2098,
140
- "train_tokens_per_second": 34780.299
141
- },
142
- {
143
- "epoch": 3.7112010796221324,
144
- "grad_norm": 0.09153091162443161,
145
- "learning_rate": 3.144736842105263e-05,
146
- "loss": 0.1412,
147
- "num_input_tokens_seen": 5632000,
148
- "step": 5500,
149
- "train_runtime": 161.9414,
150
- "train_tokens_per_second": 34778.016
151
- },
152
- {
153
- "epoch": 4.0,
154
- "eval_accuracy": 0.9059021922428331,
155
- "eval_loss": 0.31255096197128296,
156
- "eval_runtime": 1.8022,
157
- "eval_samples_per_second": 1645.244,
158
- "eval_steps_per_second": 205.864,
159
- "num_input_tokens_seen": 6070272,
160
- "step": 5928
161
- },
162
- {
163
- "epoch": 4.048582995951417,
164
- "grad_norm": 0.014544670470058918,
165
- "learning_rate": 2.9760458839406208e-05,
166
- "loss": 0.1248,
167
- "num_input_tokens_seen": 6144000,
168
- "step": 6000,
169
- "train_runtime": 179.8529,
170
- "train_tokens_per_second": 34161.242
171
- },
172
- {
173
- "epoch": 4.385964912280702,
174
- "grad_norm": 0.022761313244700432,
175
- "learning_rate": 2.8073549257759785e-05,
176
- "loss": 0.0975,
177
- "num_input_tokens_seen": 6656000,
178
- "step": 6500,
179
- "train_runtime": 194.2334,
180
- "train_tokens_per_second": 34268.042
181
- },
182
- {
183
- "epoch": 4.723346828609986,
184
- "grad_norm": 0.27537310123443604,
185
- "learning_rate": 2.6386639676113363e-05,
186
- "loss": 0.1271,
187
- "num_input_tokens_seen": 7168000,
188
- "step": 7000,
189
- "train_runtime": 207.8817,
190
- "train_tokens_per_second": 34481.154
191
- },
192
- {
193
- "epoch": 5.0,
194
- "eval_accuracy": 0.9018549747048904,
195
- "eval_loss": 0.34343627095222473,
196
- "eval_runtime": 1.8728,
197
- "eval_samples_per_second": 1583.159,
198
- "eval_steps_per_second": 198.095,
199
- "num_input_tokens_seen": 7587840,
200
- "step": 7410
201
- },
202
- {
203
- "epoch": 5.060728744939271,
204
- "grad_norm": 0.3377642333507538,
205
- "learning_rate": 2.4699730094466937e-05,
206
- "loss": 0.1278,
207
- "num_input_tokens_seen": 7680000,
208
- "step": 7500,
209
- "train_runtime": 224.822,
210
- "train_tokens_per_second": 34160.358
211
- },
212
- {
213
- "epoch": 5.398110661268556,
214
- "grad_norm": 0.011958185583353043,
215
- "learning_rate": 2.301282051282051e-05,
216
- "loss": 0.0988,
217
- "num_input_tokens_seen": 8192000,
218
- "step": 8000,
219
- "train_runtime": 238.5719,
220
- "train_tokens_per_second": 34337.661
221
- },
222
- {
223
- "epoch": 5.7354925775978405,
224
- "grad_norm": 0.01739046536386013,
225
- "learning_rate": 2.132591093117409e-05,
226
- "loss": 0.1018,
227
- "num_input_tokens_seen": 8704000,
228
- "step": 8500,
229
- "train_runtime": 253.495,
230
- "train_tokens_per_second": 34335.983
231
- },
232
- {
233
- "epoch": 6.0,
234
- "eval_accuracy": 0.8930860033726813,
235
- "eval_loss": 0.38996028900146484,
236
- "eval_runtime": 1.6539,
237
- "eval_samples_per_second": 1792.721,
238
- "eval_steps_per_second": 224.317,
239
- "num_input_tokens_seen": 9105408,
240
- "step": 8892
241
- },
242
- {
243
- "epoch": 6.0728744939271255,
244
- "grad_norm": 0.3100438714027405,
245
- "learning_rate": 1.9639001349527667e-05,
246
- "loss": 0.1168,
247
- "num_input_tokens_seen": 9216000,
248
- "step": 9000,
249
- "train_runtime": 271.7013,
250
- "train_tokens_per_second": 33919.602
251
- },
252
- {
253
- "epoch": 6.410256410256411,
254
- "grad_norm": 0.43619751930236816,
255
- "learning_rate": 1.7952091767881245e-05,
256
- "loss": 0.0869,
257
- "num_input_tokens_seen": 9728000,
258
- "step": 9500,
259
- "train_runtime": 285.8275,
260
- "train_tokens_per_second": 34034.516
261
- },
262
- {
263
- "epoch": 6.747638326585695,
264
- "grad_norm": 0.006818657275289297,
265
- "learning_rate": 1.626518218623482e-05,
266
- "loss": 0.1037,
267
- "num_input_tokens_seen": 10240000,
268
- "step": 10000,
269
- "train_runtime": 299.932,
270
- "train_tokens_per_second": 34141.069
271
- },
272
- {
273
- "epoch": 7.0,
274
- "eval_accuracy": 0.8944350758853289,
275
- "eval_loss": 0.39529648423194885,
276
- "eval_runtime": 1.7201,
277
- "eval_samples_per_second": 1723.784,
278
- "eval_steps_per_second": 215.691,
279
- "num_input_tokens_seen": 10622976,
280
- "step": 10374
281
- },
282
- {
283
- "epoch": 7.08502024291498,
284
- "grad_norm": 0.07480119913816452,
285
- "learning_rate": 1.4578272604588395e-05,
286
- "loss": 0.1063,
287
- "num_input_tokens_seen": 10752000,
288
- "step": 10500,
289
- "train_runtime": 316.9384,
290
- "train_tokens_per_second": 33924.575
291
- },
292
- {
293
- "epoch": 7.422402159244265,
294
- "grad_norm": 0.008870264515280724,
295
- "learning_rate": 1.289136302294197e-05,
296
- "loss": 0.0908,
297
- "num_input_tokens_seen": 11264000,
298
- "step": 11000,
299
- "train_runtime": 331.4588,
300
- "train_tokens_per_second": 33983.108
301
- },
302
- {
303
- "epoch": 7.759784075573549,
304
- "grad_norm": 0.33139288425445557,
305
- "learning_rate": 1.1204453441295547e-05,
306
- "loss": 0.0902,
307
- "num_input_tokens_seen": 11776000,
308
- "step": 11500,
309
- "train_runtime": 345.9573,
310
- "train_tokens_per_second": 34038.879
311
- },
312
- {
313
- "epoch": 8.0,
314
- "eval_accuracy": 0.894097807757167,
315
- "eval_loss": 0.5339534282684326,
316
- "eval_runtime": 1.8369,
317
- "eval_samples_per_second": 1614.124,
318
- "eval_steps_per_second": 201.97,
319
- "num_input_tokens_seen": 12140544,
320
- "step": 11856
321
- },
322
- {
323
- "epoch": 8.097165991902834,
324
- "grad_norm": 1.103989839553833,
325
- "learning_rate": 9.517543859649124e-06,
326
- "loss": 0.0792,
327
- "num_input_tokens_seen": 12288000,
328
- "step": 12000,
329
- "train_runtime": 363.0247,
330
- "train_tokens_per_second": 33848.936
331
- },
332
- {
333
- "epoch": 8.434547908232119,
334
- "grad_norm": 0.19695305824279785,
335
- "learning_rate": 7.830634278002699e-06,
336
- "loss": 0.0771,
337
- "num_input_tokens_seen": 12800000,
338
- "step": 12500,
339
- "train_runtime": 376.8777,
340
- "train_tokens_per_second": 33963.276
341
- },
342
- {
343
- "epoch": 8.771929824561404,
344
- "grad_norm": 3.2966551780700684,
345
- "learning_rate": 6.1437246963562756e-06,
346
- "loss": 0.077,
347
- "num_input_tokens_seen": 13312000,
348
- "step": 13000,
349
- "train_runtime": 390.4455,
350
- "train_tokens_per_second": 34094.384
351
- },
352
- {
353
- "epoch": 9.0,
354
- "eval_accuracy": 0.8900505902192243,
355
- "eval_loss": 0.6329491138458252,
356
- "eval_runtime": 1.6301,
357
- "eval_samples_per_second": 1818.909,
358
- "eval_steps_per_second": 227.594,
359
- "num_input_tokens_seen": 13658112,
360
- "step": 13338
361
- },
362
- {
363
- "epoch": 9.109311740890687,
364
- "grad_norm": 2.050884485244751,
365
- "learning_rate": 4.4568151147098515e-06,
366
- "loss": 0.0783,
367
- "num_input_tokens_seen": 13824000,
368
- "step": 13500,
369
- "train_runtime": 406.9294,
370
- "train_tokens_per_second": 33971.492
371
- },
372
- {
373
- "epoch": 9.446693657219972,
374
- "grad_norm": 0.005987819749861956,
375
- "learning_rate": 2.769905533063428e-06,
376
- "loss": 0.0717,
377
- "num_input_tokens_seen": 14336000,
378
- "step": 14000,
379
- "train_runtime": 420.4707,
380
- "train_tokens_per_second": 34095.126
381
- },
382
- {
383
- "epoch": 9.784075573549257,
384
- "grad_norm": 0.0041707539930939674,
385
- "learning_rate": 1.0829959514170041e-06,
386
- "loss": 0.0685,
387
- "num_input_tokens_seen": 14848000,
388
- "step": 14500,
389
- "train_runtime": 433.8559,
390
- "train_tokens_per_second": 34223.346
391
- },
392
- {
393
- "epoch": 10.0,
394
- "eval_accuracy": 0.8920741989881956,
395
- "eval_loss": 0.6290408968925476,
396
- "eval_runtime": 1.6333,
397
- "eval_samples_per_second": 1815.288,
398
- "eval_steps_per_second": 227.141,
399
- "num_input_tokens_seen": 15175680,
400
- "step": 14820
401
- },
402
- {
403
- "epoch": 10.0,
404
- "num_input_tokens_seen": 15175680,
405
- "step": 14820,
406
- "total_flos": 1952467720519680.0,
407
- "train_loss": 0.1272777229185529,
408
- "train_runtime": 445.3252,
409
- "train_samples_per_second": 266.232,
410
- "train_steps_per_second": 33.279
411
  }
412
  ],
413
  "logging_steps": 500,
414
- "max_steps": 14820,
415
- "num_input_tokens_seen": 15175680,
416
- "num_train_epochs": 10,
417
  "save_steps": 500,
418
  "stateful_callbacks": {
419
  "TrainerControl": {
@@ -427,7 +147,7 @@
427
  "attributes": {}
428
  }
429
  },
430
- "total_flos": 1952467720519680.0,
431
  "train_batch_size": 8,
432
  "trial_name": null,
433
  "trial_params": null
 
1
  {
2
+ "best_global_step": 2964,
3
+ "best_metric": 0.21923576295375824,
4
+ "best_model_checkpoint": "multilingual-e5-small-pii-detector/checkpoint-2964",
5
+ "epoch": 3.0,
6
  "eval_steps": 500,
7
+ "global_step": 4446,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
11
  "log_history": [
12
  {
13
  "epoch": 0.33738191632928477,
14
+ "grad_norm": 5.012300491333008,
15
+ "learning_rate": 4.438821412505623e-05,
16
+ "loss": 0.3135,
17
  "num_input_tokens_seen": 512000,
18
  "step": 500,
19
+ "train_runtime": 14.3457,
20
+ "train_tokens_per_second": 35690.033
21
  },
22
  {
23
  "epoch": 0.6747638326585695,
24
+ "grad_norm": 0.07379312068223953,
25
+ "learning_rate": 3.876518218623482e-05,
26
+ "loss": 0.2333,
27
  "num_input_tokens_seen": 1024000,
28
  "step": 1000,
29
+ "train_runtime": 28.4978,
30
+ "train_tokens_per_second": 35932.607
31
  },
32
  {
33
  "epoch": 1.0,
34
+ "eval_accuracy": 0.9038785834738617,
35
+ "eval_loss": 0.25766971707344055,
36
+ "eval_runtime": 1.7391,
37
+ "eval_samples_per_second": 1704.926,
38
+ "eval_steps_per_second": 213.331,
39
  "num_input_tokens_seen": 1517568,
40
  "step": 1482
41
  },
42
  {
43
  "epoch": 1.0121457489878543,
44
+ "grad_norm": 1.9765552282333374,
45
+ "learning_rate": 3.3142150247413403e-05,
46
+ "loss": 0.2038,
47
  "num_input_tokens_seen": 1536000,
48
  "step": 1500,
49
+ "train_runtime": 45.5628,
50
+ "train_tokens_per_second": 33711.681
51
  },
52
  {
53
  "epoch": 1.349527665317139,
54
+ "grad_norm": 0.05636508762836456,
55
+ "learning_rate": 2.7519118308591997e-05,
56
+ "loss": 0.1658,
57
  "num_input_tokens_seen": 2048000,
58
  "step": 2000,
59
+ "train_runtime": 59.6595,
60
+ "train_tokens_per_second": 34328.161
61
  },
62
  {
63
  "epoch": 1.686909581646424,
64
+ "grad_norm": 1.788957118988037,
65
+ "learning_rate": 2.1896086369770583e-05,
66
+ "loss": 0.1719,
67
  "num_input_tokens_seen": 2560000,
68
  "step": 2500,
69
+ "train_runtime": 73.8011,
70
+ "train_tokens_per_second": 34687.829
71
  },
72
  {
73
  "epoch": 2.0,
74
+ "eval_accuracy": 0.92141652613828,
75
+ "eval_loss": 0.21923576295375824,
76
+ "eval_runtime": 1.7041,
77
+ "eval_samples_per_second": 1739.889,
78
+ "eval_steps_per_second": 217.706,
79
  "num_input_tokens_seen": 3035136,
80
  "step": 2964
81
  },
82
  {
83
  "epoch": 2.0242914979757085,
84
+ "grad_norm": 0.0168524831533432,
85
+ "learning_rate": 1.627305443094917e-05,
86
+ "loss": 0.1458,
87
  "num_input_tokens_seen": 3072000,
88
  "step": 3000,
89
+ "train_runtime": 90.7129,
90
+ "train_tokens_per_second": 33865.09
91
  },
92
  {
93
  "epoch": 2.361673414304993,
94
+ "grad_norm": 0.3309631049633026,
95
+ "learning_rate": 1.0650022492127757e-05,
96
+ "loss": 0.1314,
97
  "num_input_tokens_seen": 3584000,
98
  "step": 3500,
99
+ "train_runtime": 104.7568,
100
+ "train_tokens_per_second": 34212.57
101
  },
102
  {
103
  "epoch": 2.699055330634278,
104
+ "grad_norm": 3.9781689643859863,
105
+ "learning_rate": 5.026990553306343e-06,
106
+ "loss": 0.1224,
107
  "num_input_tokens_seen": 4096000,
108
  "step": 4000,
109
+ "train_runtime": 118.8877,
110
+ "train_tokens_per_second": 34452.667
111
  },
112
  {
113
  "epoch": 3.0,
114
+ "eval_accuracy": 0.9133220910623946,
115
+ "eval_loss": 0.27373236417770386,
116
+ "eval_runtime": 1.7591,
117
+ "eval_samples_per_second": 1685.527,
118
+ "eval_steps_per_second": 210.904,
119
  "num_input_tokens_seen": 4552704,
120
  "step": 4446
121
  },
122
  {
123
+ "epoch": 3.0,
124
+ "num_input_tokens_seen": 4552704,
125
+ "step": 4446,
126
+ "total_flos": 585740316155904.0,
127
+ "train_loss": 0.17994267544765705,
128
+ "train_runtime": 134.4715,
129
+ "train_samples_per_second": 264.502,
130
+ "train_steps_per_second": 33.063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  }
132
  ],
133
  "logging_steps": 500,
134
+ "max_steps": 4446,
135
+ "num_input_tokens_seen": 4552704,
136
+ "num_train_epochs": 3,
137
  "save_steps": 500,
138
  "stateful_callbacks": {
139
  "TrainerControl": {
 
147
  "attributes": {}
148
  }
149
  },
150
+ "total_flos": 585740316155904.0,
151
  "train_batch_size": 8,
152
  "trial_name": null,
153
  "trial_params": null
training_args.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:32ce1e246e8da9cb1f7bf84495bb07f5e6a549f44f69a69266d1dec339ecb772
3
  size 5201
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9f947f8eec68e36ee3cd7c2830c856229639500785b9261f50ee9178a9ee2a3
3
  size 5201