Python - Improve tests on Tokenizer
@@ -1,5 +1,10 @@
from ..utils import data_dir, roberta_files

from tokenizers import AddedToken, Tokenizer
from tokenizers.models import Model, BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.processors import RobertaProcessing
from tokenizers.normalizers import Lowercase


class TestAddedToken:
@@ -61,3 +66,156 @@ class TestTokenizer:
        assert tokenizer.pre_tokenizer is None
        assert tokenizer.post_processor is None
        assert tokenizer.decoder is None

    def test_add_tokens(self):
        tokenizer = Tokenizer(BPE.empty())
        added = tokenizer.add_tokens(["my", "name", "is", "john"])
        assert added == 4

        added = tokenizer.add_tokens([AddedToken("the"), AddedToken("quick", rstrip=True)])
        assert added == 2
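
    # Note: `add_tokens` returns how many tokens were added, which is what the
    # asserts above check. `AddedToken("quick", rstrip=True)` should let the added
    # token also absorb whitespace on its right when it is matched in the input.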

    def test_add_special_tokens(self):
        tokenizer = Tokenizer(BPE.empty())

        # Can add special tokens as `str`
        added = tokenizer.add_special_tokens(["my", "name", "is", "john"])
        assert added == 4

        # Can add special tokens as `AddedToken`
        added = tokenizer.add_special_tokens([AddedToken("the"), AddedToken("quick", rstrip=True)])
        assert added == 2

    def test_encode(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can encode single sequence
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name", "is", "john"]
        assert type(output.ids) == list
        assert type(output.type_ids) == list
        assert type(output.offsets) == list
        assert type(output.words) == list
        assert type(output.special_tokens_mask) == list
        assert type(output.attention_mask) == list
        assert type(output.overflowing) == list

        # Can encode a pair of sequences
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "name", "is", "john", "pair"]

        # Can encode a batch with both a single sequence and a pair of sequences
        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
        assert len(output) == 2
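
    # Note: in this version of the bindings, `offsets` holds (start, end) spans into
    # the input text and `words` the index of the word each token came from; all of
    # these Encoding attributes come back as plain Python lists, as asserted above.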

    def test_encode_add_special_tokens(self, roberta_files):
        tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
        tokenizer.add_special_tokens(["<s>", "</s>"])

        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
        tokenizer.post_processor = RobertaProcessing(
            ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
        )

        # Can encode with special tokens
        output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
        assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]

        # Can encode without special tokens
        output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
        assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
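
    # Note: the leading "Ġ" in the expected tokens is the byte-level marker for a
    # preceding space (here produced by `add_prefix_space=True`), and the <s>/</s>
    # wrapping comes from the RobertaProcessing post-processor configured above.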

    def test_truncation(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)

        # Can truncate single sequences
        output = tokenizer.encode("my name is john")
        assert output.tokens == ["my", "name"]

        # Can truncate pair sequences as well
        output = tokenizer.encode("my name is john", "pair")
        assert output.tokens == ["my", "pair"]
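
    # Note: with max_length=2 the pair still keeps one token from each sequence,
    # which is consistent with a longest-first truncation strategy (presumably the
    # default here): tokens are dropped from the longer sequence first.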

    def test_padding(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # By default it does nothing when encoding a single sequence
        tokenizer.enable_padding()
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name"]

        # Can pad to the longest in a batch
        output = tokenizer.encode_batch(["my name", "my name is john"])
        assert all([len(encoding) == 4 for encoding in output])

        # Can pad to the specified max length otherwise
        tokenizer.enable_padding(max_length=4)
        output = tokenizer.encode("my name")
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
        output = tokenizer.encode("my name", "pair")
        assert output.tokens == ["my", "name", "pair", "[PAD]"]
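
    # Note: without `max_length`, `enable_padding()` only pads within a batch (up to
    # the longest encoding in that batch), which is why the single "my name" above
    # stays unpadded; "[PAD]" appears to be the default padding token here.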

    def test_decode(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can decode single sequences
        output = tokenizer.decode([0, 1, 2, 3])
        assert output == "my name is john"

        # Can decode batch
        output = tokenizer.decode_batch([[0, 1, 2, 3], [4]])
        assert output == ["my name is john", "pair"]
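
    # Note: no decoder is configured on this fresh Tokenizer (see the context asserts
    # at the top of this hunk), so decoding here just joins tokens with spaces; ids
    # 0-3 map to the added tokens in insertion order, as test_get_vocab below shows.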

    def test_get_vocab(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can retrieve vocab with added tokens
        vocab = tokenizer.get_vocab(with_added_tokens=True)
        assert vocab == {"is": 2, "john": 3, "my": 0, "name": 1, "pair": 4}

        # Can retrieve vocab without added tokens
        vocab = tokenizer.get_vocab(with_added_tokens=False)
        assert vocab == {}

    def test_get_vocab_size(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

        # Can retrieve vocab's size with added tokens
        size = tokenizer.get_vocab_size(with_added_tokens=True)
        assert size == 5

        # Can retrieve vocab's size without added tokens
        size = tokenizer.get_vocab_size(with_added_tokens=False)
        assert size == 0

    def test_normalize(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.normalizer = Lowercase()

        output = tokenizer.normalize("My Name Is John")
        assert output == "my name is john"

    def test_post_process(self):
        tokenizer = Tokenizer(BPE.empty())
        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
        tokenizer.enable_truncation(2)
        tokenizer.enable_padding(max_length=4)

        encoding = tokenizer.encode("my name is john")
        pair_encoding = tokenizer.encode("pair")

        # Can post process a single encoding
        output = tokenizer.post_process(encoding)
        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
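
        # Note: `post_process` applies the pipeline configured above to encodings
        # that were already computed: truncation to 2 tokens, then the post-processor
        # (none here), then padding up to max_length=4, hence the two "[PAD]" tokens.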

        # Can post process a pair of encodings
        output = tokenizer.post_process(encoding, pair_encoding)
        assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
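
These tests rely on pytest fixtures (`roberta_files` presumably provides the RoBERTa vocab and merges files), so a plain pytest run should be enough to execute them. The sketch below is one way to run just this module; the file path is an assumption, not something taken from this diff:

    # Minimal runner sketch. The test file path is assumed; adjust it to the real
    # location inside the Python bindings before using.
    import pytest

    if __name__ == "__main__":
        raise SystemExit(pytest.main(["-q", "tests/bindings/test_tokenizer.py"]))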