gsaltintas committed
Commit 15e3414 · verified · 1 Parent(s): 836de3f

Upload folder using huggingface_hub

Files changed (5)
  1. README.md +5 -1
  2. special_tokens_map.json +0 -4
  3. tokenizer.json +29 -47
  4. tokenizer_config.json +0 -20
  5. vocab.json +0 -0
README.md CHANGED
@@ -66,4 +66,8 @@ tokens = tokenizer.encode("Hello, world!")
  ## Sample Encoding
  | Text | Tokens | Token IDs |
  |------|--------|-----------|
- | `yirmi iki+dokuz=otuz bir\ntwenty two+nine=thirty one` | `yirmi, Ġ, iki, +, dokuz, =, otuz, Ġ, bir, \, n, tw, ent, y, Ġ, tw, o, +, n, in` | `1091, 223, 727, 3, 722, 4, 1088, 223, 848, 62, 80, 636, 260, 91, 223, 636, 81, 3, 80, 268` |
+ | `yirmi iki+dokuz=otuz bir\ntwenty two+nine=thirty one` | `yirmi, Ġ, iki, +, dokuz, =, otuz, Ġ, bir, \, n, tw, ent, y, Ġ, tw, o, +, n, in` | `1091, 223, 727, 13, 722, 31, 1088, 223, 848, 62, 80, 636, 260, 91, 223, 636, 81, 13, 80, 268` |
+
+ Command used to create this tokenizer:
+ ```bash
+ ['/home/gsa/tokenizers2/flexitok/tokenizer_training/train_tokenizers.py', 'algorithm=bpe', 'vocab_size=42_000', 'langs=[arb_Arab,ces_Latn,cmn_Hani,dan_Latn,deu_Latn,ell_Grek,fra_Latn,fw_edu,hun_Latn,ind_Latn,ita_Latn,jpn_Jpan,nld_Latn,pol_Latn,por_Latn,rus_Cyrl,spa_Latn,swe_Latn,tur_Latn,vie_Latn]', 'data_dir=/scratch/gsa/data/multilingual-addition/', 'output_dir=/scratch/gsa/trained_tokenizers/multilingual_addition', 'pretokenizer=custom:addition', 'number_handling=ltr_3digit', 'add_numbers=false', 'handle_contractions=false', 'unicode_normalization=nfc', 'use_byte_level_regex=false', 'byte_fallback=false', 'strip_zero_width=false', 'cjk_char_split=false', 'add_cjk_chars=false', 'max_lines=-1', 'test_string=yirmi iki+dokuz=otuz bir\\ntwenty two+nine=thirty one', 'hf.publish_to_hf=true', 'hf_repo_prefix=flexitok/', 'hf.hf_repo_id=flexitok/maddition_AllL_42000', 'hf.collections=[flexitok/multilingual_addition_tokenizers]']
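This commit demotes `+` and `=` from special tokens to ordinary vocabulary entries, which shifts their IDs (3 → 13 and 4 → 31) and updates the README's sample encoding accordingly. A minimal sketch for reproducing the new IDs, assuming the repo `flexitok/maddition_AllL_42000` (named in the training command above) is accessible and loads with `AutoTokenizer`:

```python
# Minimal sketch: reproduce the updated sample encoding from the README table.
# Assumes flexitok/maddition_AllL_42000 is public and AutoTokenizer-compatible.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("flexitok/maddition_AllL_42000")

text = "yirmi iki+dokuz=otuz bir\ntwenty two+nine=thirty one"
ids = tokenizer.encode(text, add_special_tokens=False)
print(ids)
# After this commit, "+" should appear as 13 and "=" as 31 in the output,
# instead of the special-token IDs 3 and 4 used before.
```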
special_tokens_map.json CHANGED
@@ -1,8 +1,4 @@
  {
- "additional_special_tokens": [
- "+",
- "="
- ],
  "bos_token": "<s>",
  "eos_token": "</s>",
  "pad_token": "<pad>"
tokenizer.json CHANGED
@@ -29,24 +29,6 @@
  "rstrip": false,
  "normalized": false,
  "special": true
- },
- {
- "id": 3,
- "content": "+",
- "single_word": false,
- "lstrip": false,
- "rstrip": false,
- "normalized": false,
- "special": true
- },
- {
- "id": 4,
- "content": "=",
- "single_word": false,
- "lstrip": false,
- "rstrip": false,
- "normalized": false,
- "special": true
  }
  ],
  "normalizer": {
@@ -99,35 +81,35 @@
  "<s>": 0,
  "</s>": 1,
  "<pad>": 2,
- "+": 3,
- "=": 4,
- "!": 5,
- "\"": 6,
- "#": 7,
- "$": 8,
- "%": 9,
- "&": 10,
- "'": 11,
- "(": 12,
- ")": 13,
- "*": 14,
- ",": 15,
- "-": 16,
- ".": 17,
- "/": 18,
- "0": 19,
- "1": 20,
- "2": 21,
- "3": 22,
- "4": 23,
- "5": 24,
- "6": 25,
- "7": 26,
- "8": 27,
- "9": 28,
- ":": 29,
- ";": 30,
- "<": 31,
+ "!": 3,
+ "\"": 4,
+ "#": 5,
+ "$": 6,
+ "%": 7,
+ "&": 8,
+ "'": 9,
+ "(": 10,
+ ")": 11,
+ "*": 12,
+ "+": 13,
+ ",": 14,
+ "-": 15,
+ ".": 16,
+ "/": 17,
+ "0": 18,
+ "1": 19,
+ "2": 20,
+ "3": 21,
+ "4": 22,
+ "5": 23,
+ "6": 24,
+ "7": 25,
+ "8": 26,
+ "9": 27,
+ ":": 28,
+ ";": 29,
+ "<": 30,
+ "=": 31,
  ">": 32,
  "?": 33,
  "@": 34,
tokenizer_config.json CHANGED
@@ -23,28 +23,8 @@
  "rstrip": false,
  "single_word": false,
  "special": true
- },
- "3": {
- "content": "+",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "4": {
- "content": "=",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
  }
  },
- "additional_special_tokens": [
- "+",
- "="
- ],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
vocab.json CHANGED
The diff for this file is too large to render. See raw diff