gsaltintas committed
Commit 15e3414 · verified · 1 Parent(s): 836de3f

Upload folder using huggingface_hub

Files changed (5)
  1. README.md +5 -1
  2. special_tokens_map.json +0 -4
  3. tokenizer.json +29 -47
  4. tokenizer_config.json +0 -20
  5. vocab.json +0 -0
README.md CHANGED
@@ -66,4 +66,8 @@ tokens = tokenizer.encode("Hello, world!")
  ## Sample Encoding
  | Text | Tokens | Token IDs |
  |------|--------|-----------|
- | `yirmi iki+dokuz=otuz bir\ntwenty two+nine=thirty one` | `yirmi, Ġ, iki, +, dokuz, =, otuz, Ġ, bir, \, n, tw, ent, y, Ġ, tw, o, +, n, in` | `1091, 223, 727, 3, 722, 4, 1088, 223, 848, 62, 80, 636, 260, 91, 223, 636, 81, 3, 80, 268` |
+ | `yirmi iki+dokuz=otuz bir\ntwenty two+nine=thirty one` | `yirmi, Ġ, iki, +, dokuz, =, otuz, Ġ, bir, \, n, tw, ent, y, Ġ, tw, o, +, n, in` | `1091, 223, 727, 13, 722, 31, 1088, 223, 848, 62, 80, 636, 260, 91, 223, 636, 81, 13, 80, 268` |
+
+ Command used to create this tokenizer:
+ ```bash
+ ['/home/gsa/tokenizers2/flexitok/tokenizer_training/train_tokenizers.py', 'algorithm=bpe', 'vocab_size=42_000', 'langs=[arb_Arab,ces_Latn,cmn_Hani,dan_Latn,deu_Latn,ell_Grek,fra_Latn,fw_edu,hun_Latn,ind_Latn,ita_Latn,jpn_Jpan,nld_Latn,pol_Latn,por_Latn,rus_Cyrl,spa_Latn,swe_Latn,tur_Latn,vie_Latn]', 'data_dir=/scratch/gsa/data/multilingual-addition/', 'output_dir=/scratch/gsa/trained_tokenizers/multilingual_addition', 'pretokenizer=custom:addition', 'number_handling=ltr_3digit', 'add_numbers=false', 'handle_contractions=false', 'unicode_normalization=nfc', 'use_byte_level_regex=false', 'byte_fallback=false', 'strip_zero_width=false', 'cjk_char_split=false', 'add_cjk_chars=false', 'max_lines=-1', 'test_string=yirmi iki+dokuz=otuz bir\\ntwenty two+nine=thirty one', 'hf.publish_to_hf=true', 'hf_repo_prefix=flexitok/', 'hf.hf_repo_id=flexitok/maddition_AllL_42000', 'hf.collections=[flexitok/multilingual_addition_tokenizers]']
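This commit demotes `+` and `=` from special tokens to ordinary vocabulary entries, which shifts their IDs (3 → 13 and 4 → 31) and updates the README's sample encoding accordingly. A minimal sketch for reproducing the new IDs, assuming the repo `flexitok/maddition_AllL_42000` (named in the training command above) is accessible and loads with `AutoTokenizer`:

```python
# Minimal sketch: reproduce the updated sample encoding from the README table.
# Assumes flexitok/maddition_AllL_42000 is public and AutoTokenizer-compatible.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("flexitok/maddition_AllL_42000")

text = "yirmi iki+dokuz=otuz bir\ntwenty two+nine=thirty one"
ids = tokenizer.encode(text, add_special_tokens=False)
print(ids)
# After this commit, "+" should appear as 13 and "=" as 31 in the output,
# instead of the special-token IDs 3 and 4 used before.
```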
special_tokens_map.json CHANGED
@@ -1,8 +1,4 @@
  {
- "additional_special_tokens": [
- "+",
- "="
- ],
  "bos_token": "<s>",
  "eos_token": "</s>",
  "pad_token": "<pad>"
tokenizer.json CHANGED
@@ -29,24 +29,6 @@
  "rstrip": false,
  "normalized": false,
  "special": true
- },
- {
- "id": 3,
- "content": "+",
- "single_word": false,
- "lstrip": false,
- "rstrip": false,
- "normalized": false,
- "special": true
- },
- {
- "id": 4,
- "content": "=",
- "single_word": false,
- "lstrip": false,
- "rstrip": false,
- "normalized": false,
- "special": true
  }
  ],
  "normalizer": {
@@ -99,35 +81,35 @@
  "<s>": 0,
  "</s>": 1,
  "<pad>": 2,
- "+": 3,
- "=": 4,
- "!": 5,
- "\"": 6,
- "#": 7,
- "$": 8,
- "%": 9,
- "&": 10,
- "'": 11,
- "(": 12,
- ")": 13,
- "*": 14,
- ",": 15,
- "-": 16,
- ".": 17,
- "/": 18,
- "0": 19,
- "1": 20,
- "2": 21,
- "3": 22,
- "4": 23,
- "5": 24,
- "6": 25,
- "7": 26,
- "8": 27,
- "9": 28,
- ":": 29,
- ";": 30,
- "<": 31,
+ "!": 3,
+ "\"": 4,
+ "#": 5,
+ "$": 6,
+ "%": 7,
+ "&": 8,
+ "'": 9,
+ "(": 10,
+ ")": 11,
+ "*": 12,
+ "+": 13,
+ ",": 14,
+ "-": 15,
+ ".": 16,
+ "/": 17,
+ "0": 18,
+ "1": 19,
+ "2": 20,
+ "3": 21,
+ "4": 22,
+ "5": 23,
+ "6": 24,
+ "7": 25,
+ "8": 26,
+ "9": 27,
+ ":": 28,
+ ";": 29,
+ "<": 30,
+ "=": 31,
  ">": 32,
  "?": 33,
  "@": 34,
tokenizer_config.json CHANGED
@@ -23,28 +23,8 @@
  "rstrip": false,
  "single_word": false,
  "special": true
- },
- "3": {
- "content": "+",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
- },
- "4": {
- "content": "=",
- "lstrip": false,
- "normalized": false,
- "rstrip": false,
- "single_word": false,
- "special": true
  }
  },
- "additional_special_tokens": [
- "+",
- "="
- ],
  "bos_token": "<s>",
  "clean_up_tokenization_spaces": false,
  "eos_token": "</s>",
vocab.json CHANGED
The diff for this file is too large to render. See raw diff