diff --git a/bindings/python/tests/bindings/test_tokenizer.py b/bindings/python/tests/bindings/test_tokenizer.py
index adb76b60..8915d054 100644
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@@ -1,5 +1,10 @@
+from ..utils import data_dir, roberta_files
+
 from tokenizers import AddedToken, Tokenizer
 from tokenizers.models import Model, BPE
+from tokenizers.pre_tokenizers import ByteLevel
+from tokenizers.processors import RobertaProcessing
+from tokenizers.normalizers import Lowercase
 
 
 class TestAddedToken:
@@ -61,3 +66,156 @@ class TestTokenizer:
         assert tokenizer.pre_tokenizer is None
         assert tokenizer.post_processor is None
         assert tokenizer.decoder is None
+
+    def test_add_tokens(self):
+        tokenizer = Tokenizer(BPE.empty())
+        added = tokenizer.add_tokens(["my", "name", "is", "john"])
+        assert added == 4
+
+        added = tokenizer.add_tokens([AddedToken("the"), AddedToken("quick", rstrip=True)])
+        assert added == 2
+
+    def test_add_special_tokens(self):
+        tokenizer = Tokenizer(BPE.empty())
+
+        # Can add special tokens as `str`
+        added = tokenizer.add_special_tokens(["my", "name", "is", "john"])
+        assert added == 4
+
+        # Can add special tokens as `AddedToken`
+        added = tokenizer.add_special_tokens([AddedToken("the"), AddedToken("quick", rstrip=True)])
+        assert added == 2
+
+    def test_encode(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+
+        # Can encode single sequence
+        output = tokenizer.encode("my name is john")
+        assert output.tokens == ["my", "name", "is", "john"]
+        assert type(output.ids) == list
+        assert type(output.type_ids) == list
+        assert type(output.offsets) == list
+        assert type(output.words) == list
+        assert type(output.special_tokens_mask) == list
+        assert type(output.attention_mask) == list
+        assert type(output.overflowing) == list
+
+        # Can encode a pair of sequences
+        output = tokenizer.encode("my name is john", "pair")
+        assert output.tokens == ["my", "name", "is", "john", "pair"]
+
+        # Can encode a batch with both a single sequence and a pair of sequences
+        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
+        assert len(output) == 2
+
+    def test_encode_add_special_tokens(self, roberta_files):
+        tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
+        tokenizer.add_special_tokens(["<s>", "</s>"])
+
+        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
+        tokenizer.post_processor = RobertaProcessing(
+            ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
+        )
+
+        # Can encode with special tokens
+        output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
+        assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]
+
+        # Can encode without special tokens
+        output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
+        assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+
+    def test_truncation(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+        tokenizer.enable_truncation(2)
+
+        # Can truncate single sequences
+        output = tokenizer.encode("my name is john")
+        assert output.tokens == ["my", "name"]
+
+        # Can truncate pair sequences as well
+        output = tokenizer.encode("my name is john", "pair")
+        assert output.tokens == ["my", "pair"]
+
+    def test_padding(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
"john", "pair"]) + + # By default it does nothing when encoding single sequence + tokenizer.enable_padding() + output = tokenizer.encode("my name") + assert output.tokens == ["my", "name"] + + # Can pad to the longest in a batch + output = tokenizer.encode_batch(["my name", "my name is john"]) + assert all([len(encoding) == 4 for encoding in output]) + + # Can pad to the specified max length otherwise + tokenizer.enable_padding(max_length=4) + output = tokenizer.encode("my name") + assert output.tokens == ["my", "name", "[PAD]", "[PAD]"] + output = tokenizer.encode("my name", "pair") + assert output.tokens == ["my", "name", "pair", "[PAD]"] + + def test_decode(self): + tokenizer = Tokenizer(BPE.empty()) + tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) + + # Can decode single sequences + output = tokenizer.decode([0, 1, 2, 3]) + assert output == "my name is john" + + # Can decode batch + output = tokenizer.decode_batch([[0, 1, 2, 3], [4]]) + assert output == ["my name is john", "pair"] + + def test_get_vocab(self): + tokenizer = Tokenizer(BPE.empty()) + tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) + + # Can retrieve vocab with added tokens + vocab = tokenizer.get_vocab(with_added_tokens=True) + assert vocab == {"is": 2, "john": 3, "my": 0, "name": 1, "pair": 4} + + # Can retrieve vocab without added tokens + vocab = tokenizer.get_vocab(with_added_tokens=False) + assert vocab == {} + + def test_get_vocab_size(self): + tokenizer = Tokenizer(BPE.empty()) + tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) + + # Can retrieve vocab's size with added tokens + size = tokenizer.get_vocab_size(with_added_tokens=True) + assert size == 5 + + # Can retrieve vocab's size without added tokens + size = tokenizer.get_vocab_size(with_added_tokens=False) + assert size == 0 + + def test_normalize(self): + tokenizer = Tokenizer(BPE.empty()) + tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) + tokenizer.normalizer = Lowercase() + + output = tokenizer.normalize("My Name Is John") + assert output == "my name is john" + + def test_post_process(self): + tokenizer = Tokenizer(BPE.empty()) + tokenizer.add_tokens(["my", "name", "is", "john", "pair"]) + tokenizer.enable_truncation(2) + tokenizer.enable_padding(max_length=4) + + encoding = tokenizer.encode("my name is john") + pair_encoding = tokenizer.encode("pair") + + # Can post process a single encoding + output = tokenizer.post_process(encoding) + assert output.tokens == ["my", "name", "[PAD]", "[PAD]"] + + # Can post process a pair of encodings + output = tokenizer.post_process(encoding, pair_encoding) + assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]