| | """ Testing suite for the PyTorch CpmBee tokenizer. """ |
| |
|
| | import os |
| | import unittest |
| |
|
| | from transformers.models.cpmbee.tokenization_cpmbee import VOCAB_FILES_NAMES, CpmBeeTokenizer |
| | from transformers.tokenization_utils import AddedToken |
| |
|
| | from ...test_tokenization_common import TokenizerTesterMixin |
| |
|
| |
|
class CPMBeeTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = CpmBeeTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()

        vocab_tokens = [
            "<d>",
            "</d>",
            "<s>",
            "</s>",
            "</_>",
            "<unk>",
            "<pad>",
            "<mask>",
            "</n>",
            "我",
            "是",
            "C",
            "P",
            "M",
            "B",
            "e",
            "e",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
        # Drop the duplicate "e" while keeping insertion order, so the token ids written to the
        # vocab file stay deterministic (a plain set() would shuffle them).
        vocab_tokens = list(dict.fromkeys(vocab_tokens))
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    def test_add_tokens_tokenizer(self):
        """Adding regular and special tokens should grow `len(tokenizer)` but leave `vocab_size` untouched."""
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab_size = tokenizer.vocab_size
                all_size = len(tokenizer)

                self.assertNotEqual(vocab_size, 0)

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
                added_toks = tokenizer.add_tokens(new_toks)
                vocab_size_2 = tokenizer.vocab_size
                all_size_2 = len(tokenizer)

                # Added tokens live outside the base vocab, so `vocab_size` is unchanged while
                # `len(tokenizer)` grows by the number of new tokens.
                self.assertNotEqual(vocab_size_2, 0)
                self.assertEqual(vocab_size, vocab_size_2)
                self.assertEqual(added_toks, len(new_toks))
                self.assertEqual(all_size_2, all_size + len(new_toks))

                tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)

                self.assertGreaterEqual(len(tokens), 4)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||;;;||;"}
                added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
                vocab_size_3 = tokenizer.vocab_size
                all_size_3 = len(tokenizer)

                self.assertNotEqual(vocab_size_3, 0)
                self.assertEqual(vocab_size, vocab_size_3)
                self.assertEqual(added_toks_2, len(new_toks_2))
                self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

                tokens = tokenizer.encode(
                    ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||;;;||; l", add_special_tokens=False
                )

                self.assertGreaterEqual(len(tokens), 6)
                # The new special tokens should be encoded with ids above the base vocab.
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[0], tokens[1])
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokens[-3])
                self.assertEqual(tokens[0], tokenizer.eos_token_id)
                self.assertEqual(tokens[-2], tokenizer.pad_token_id)

    def test_added_tokens_do_lower_case(self):
        """Added tokens should follow `do_lower_case`, while special tokens must never be lowercased."""
        tokenizers = self.get_tokenizers(do_lower_case=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case:
                    continue

                special_token = tokenizer.all_special_tokens[0]

                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

                toks_before_adding = tokenizer.tokenize(text)

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
                added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])

                toks_after_adding = tokenizer.tokenize(text)
                toks_after_adding2 = tokenizer.tokenize(text2)

                # With lowercasing, the cased and uncased variants may collapse to the same entries,
                # so either 2 or 4 tokens end up being added.
                self.assertIn(added, [2, 4])

                self.assertListEqual(toks_after_adding, toks_after_adding2)
                self.assertTrue(
                    len(toks_before_adding) > len(toks_after_adding),
                )

                # Check that none of the special tokens were lowercased.
                sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
                tokenized_sequence = "".join(tokenizer.tokenize(sequence_with_special_tokens))

                for special_token in tokenizer.all_special_tokens:
                    self.assertTrue(special_token in tokenized_sequence)

        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case:
                    continue

                special_token = tokenizer.all_special_tokens[0]

                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

                toks_before_adding = tokenizer.tokenize(text)

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
                added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])
                self.assertIn(added, [2, 4])

                toks_after_adding = tokenizer.tokenize(text)
                toks_after_adding2 = tokenizer.tokenize(text2)

                self.assertEqual(len(toks_after_adding), len(toks_after_adding2))
                # Without lowercasing, the cased variants map to different added tokens.
                self.assertNotEqual(toks_after_adding[1], toks_after_adding2[1])
                self.assertTrue(
                    len(toks_before_adding) > len(toks_after_adding),
                )

    def test_pre_tokenization(self):
        """Structured dict inputs are segmented into <s>...</s> spans and round-trip through decode."""
        tokenizer = CpmBeeTokenizer.from_pretrained("openbmb/cpm-bee-10b")
        texts = {"input": "你好,", "<ans>": ""}
        tokens = tokenizer(texts)
        tokens = tokens["input_ids"][0]

        input_tokens = [6, 8, 7, 6, 65678, 7, 6, 10273, 246, 7, 6, 9, 7]
        self.assertListEqual(tokens, input_tokens)

        normalized_text = "<s><root></s><s>input</s><s>你好,</s><s><ans></s>"
        reconstructed_text = tokenizer.decode(tokens)
        self.assertEqual(reconstructed_text, normalized_text)