Python - Improve tests for new encode/encode_batch

Anthony MOI
2020-04-28 17:37:17 -04:00
parent efaa6f589a
commit 3fb8033770
4 changed files with 49 additions and 14 deletions
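
In short: encode() now takes the pair as a second positional argument instead of a
(sequence, pair) tuple, encode_batch() accepts a mix of single sequences and pairs, and
pre-tokenized input must be flagged explicitly with is_pretokenized=True. A minimal sketch
of the call-shape change these tests exercise (the vocab path is a placeholder, not part of
this commit):

    from tokenizers.implementations import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt")  # placeholder path

    # Before this change: the pair was passed as a single tuple
    # output = tokenizer.encode(("My name is John", "pair"))

    # After this change: sequence and pair are two separate arguments
    output = tokenizer.encode("My name is John", "pair")

    # Pre-tokenized input is no longer inferred from the type; it must be flagged
    output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)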

View File

@@ -19,7 +19,7 @@ class TestBertProcessing:
         tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
         tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
-        output = tokenizer.encode(("my name", "pair"))
+        output = tokenizer.encode("my name", "pair")
         assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
         assert output.ids == [1, 2, 3, 0, 6, 0]
@@ -37,7 +37,7 @@ class TestRobertaProcessing:
         tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
         tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))
-        output = tokenizer.encode(("my name", "pair"))
+        output = tokenizer.encode("my name", "pair")
         assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"]
         assert output.ids == [0, 2, 3, 1, 1, 6, 1]
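
Note the argument order in both constructors: the first tuple is the separator token with
its id, the second the opening token with its id. A sketch of the intent, with the model
setup that falls outside the visible hunks assumed:

    from tokenizers.processors import BertProcessing, RobertaProcessing

    # ("[SEP]", 0) is the separator, ("[CLS]", 1) the classifier token;
    # a pair becomes: [CLS] sequence [SEP] pair [SEP]
    bert_post = BertProcessing(("[SEP]", 0), ("[CLS]", 1))

    # RoBERTa uses a doubled separator between the two members of a pair:
    # <s> sequence </s> </s> pair </s>
    roberta_post = RobertaProcessing(("</s>", 1), ("<s>", 0))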

View File

@@ -1,10 +1,12 @@
-from ..utils import data_dir, roberta_files
-from tokenizers import AddedToken, Tokenizer
-from tokenizers.models import Model, BPE
+import pytest
+
+from ..utils import data_dir, roberta_files, bert_files
+from tokenizers import AddedToken, Tokenizer, Encoding
+from tokenizers.models import Model, BPE, WordPiece
 from tokenizers.pre_tokenizers import ByteLevel
-from tokenizers.processors import RobertaProcessing
+from tokenizers.processors import RobertaProcessing, BertProcessing
 from tokenizers.normalizers import Lowercase
+from tokenizers.implementations import BertWordPieceTokenizer

 class TestAddedToken:
@@ -102,17 +104,50 @@ class TestTokenizer:
         assert type(output.overflowing) == list

         # Can encode a pair of sequences
-        output = tokenizer.encode(("my name is john", "pair"))
+        output = tokenizer.encode("my name is john", "pair")
         assert output.tokens == ["my", "name", "is", "john", "pair"]

         # Can encode a single pre-tokenized sequence
-        output = tokenizer.encode(["my", "name", "is", "john"])
+        output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
         assert output.tokens == ["my", "name", "is", "john"]

         # Can encode a batch with both a single sequence and a pair of sequences
         output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
         assert len(output) == 2

+    def test_encode_formats(self, bert_files):
+        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
+
+        # Well formed
+        output = tokenizer.encode("my name is john")
+        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
+        output = tokenizer.encode("my name is john", "pair")
+        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+        output = tokenizer.encode(["my", "name", "is", "john"], is_pretokenized=True)
+        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
+        output = tokenizer.encode(["my", "name", "is", "john"], ["pair"], is_pretokenized=True)
+        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+
+        output = tokenizer.encode_batch(["My name is John", "My name is Georges"])
+        assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
+        assert output[1].tokens == ["[CLS]", "my", "name", "is", "georges", "[SEP]"]
+        output = tokenizer.encode_batch([("my name is john", "pair"), ("my name is john", "pair")])
+        assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+        assert output[1].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+        output = tokenizer.encode_batch([["my", "name", "is", "john"]], is_pretokenized=True)
+        assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]
+
+        # Mal formed
+        with pytest.raises(ValueError, match="InputSequence must be str"):
+            tokenizer.encode([["my", "name"]])
+            tokenizer.encode("My name is john", [["pair"]])
+            tokenizer.encode("my name is john", ["pair"])
+
+        with pytest.raises(ValueError, match="InputSequence must be Union[List[str]"):
+            tokenizer.encode("My name is john", is_pretokenized=True)
+            tokenizer.encode("My name is john", ["pair"], is_pretokenized=True)
+            tokenizer.encode(["My", "name", "is", "John"], "pair", is_pretokenized=True)
+
     def test_encode_add_special_tokens(self, roberta_files):
         tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))
         tokenizer.add_special_tokens(["<s>", "</s>"])
@@ -140,7 +175,7 @@ class TestTokenizer:
         assert output.tokens == ["my", "name"]
         # Can truncate pair sequences as well
-        output = tokenizer.encode(("my name is john", "pair"))
+        output = tokenizer.encode("my name is john", "pair")
         assert output.tokens == ["my", "pair"]

     def test_padding(self):
@@ -160,7 +195,7 @@ class TestTokenizer:
         tokenizer.enable_padding(max_length=4)
         output = tokenizer.encode("my name")
         assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
-        output = tokenizer.encode(("my name", "pair"))
+        output = tokenizer.encode("my name", "pair")
         assert output.tokens == ["my", "name", "pair", "[PAD]"]

     def test_decode(self):
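
The new test_encode_formats above pins down which input shapes are accepted. A compact
sketch of the batch side (vocab path is a placeholder; expected tokens are taken from the
assertions above):

    from tokenizers.implementations import BertWordPieceTokenizer

    tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt")  # placeholder path

    # A batch may mix plain sequences and (sequence, pair) tuples
    output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
    assert len(output) == 2

    # Pre-tokenized batches are lists of lists of str and must be flagged explicitly
    output = tokenizer.encode_batch([["my", "name", "is", "john"]], is_pretokenized=True)
    assert output[0].tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]"]

    # Mismatched shapes raise a ValueError naming the expected InputSequence type, e.g.
    #   tokenizer.encode("my name is john", ["pair"])
    #   tokenizer.encode("My name is john", is_pretokenized=True)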

View File

@@ -7,14 +7,14 @@ class TestBertWordPieceBPE:
         tokenizer = BertWordPieceTokenizer(bert_files["vocab"])

         # Encode with special tokens by default
-        output = tokenizer.encode(("My name is John", "pair"))
+        output = tokenizer.encode("My name is John", "pair")
         assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
         assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
         assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
         assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]

         # Can encode without the special tokens
-        output = tokenizer.encode(("My name is John", "pair"), add_special_tokens=False)
+        output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
         assert output.ids == [2026, 2171, 2003, 2198, 3940]
         assert output.tokens == ["my", "name", "is", "john", "pair"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
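
The offsets and type_ids assertions follow directly from the pair call: special tokens get
a (0, 0) offset, the pair's offsets are relative to its own string, and type_ids switch
from 0 to 1 for the pair segment. A small illustration (same placeholder-path caveat as
above):

    output = tokenizer.encode("My name is John", "pair")
    for token, offset, type_id in zip(output.tokens, output.offsets, output.type_ids):
        # ('[CLS]', (0, 0), 0), ('my', (0, 2), 0), ..., ('pair', (0, 4), 1), ('[SEP]', (0, 0), 1)
        print(token, offset, type_id)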

View File

@@ -6,7 +6,7 @@ class TestBertWordPieceBPE:
     def test_basic_encode(self, openai_files):
         tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
-        output = tokenizer.encode(("My name is John", "pair"))
+        output = tokenizer.encode("My name is John", "pair")
         assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
         assert output.tokens == [
             "<unk>",
@@ -32,7 +32,7 @@ class TestBertWordPieceBPE:
     def test_lowercase(self, openai_files):
         tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
-        output = tokenizer.encode(("My name is John", "pair"), add_special_tokens=False)
+        output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
         assert output.ids == [547, 1362, 544, 2476, 2688]
         assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
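
The CharBPE assertions also show the model's conventions: word-final tokens carry the
"</w>" suffix, and with lowercase=True the input is lowercased before BPE (the "<unk>" in
test_basic_encode suggests the capitalized input does not match the vocabulary, which is
what the lowercase variant avoids). A sketch under the same placeholder-path caveat:

    from tokenizers.implementations import CharBPETokenizer

    # placeholder vocab/merges paths, not part of this commit
    tokenizer = CharBPETokenizer("openai-vocab.json", "openai-merges.txt", lowercase=True)
    output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
    assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]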