This commit is contained in:
Arthur Zucker
2023-09-05 16:42:45 +00:00
parent b57e1c3f5d
commit 26fdfc2bc3
2 changed files with 16 additions and 10 deletions

View File

@@ -376,7 +376,13 @@ class TestTokenizer:
# Can retrieve added token decoder
vocab = tokenizer.get_added_tokens_decoder()
assert vocab == {0: AddedToken("my", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),1: AddedToken("name", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),2: AddedToken("is", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),3: AddedToken("john", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),4: AddedToken("pair", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False)}
assert vocab == {
0: AddedToken("my", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
1: AddedToken("name", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
2: AddedToken("is", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
3: AddedToken("john", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
4: AddedToken("pair", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
}
def test_get_vocab_size(self):
tokenizer = Tokenizer(BPE())