Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
Support None to reset pre_tokenizers and normalizers, and index sequences (#1590)
* initial commit
* support None
* fix clippy
* cleanup
* clean?
* propagate to pre_tokenizer
* fix test
* fix rust tests
* fix node
* propagate to decoder and post processor
* fix calls
* lint
* fmt
* node be happy I am fixing you
* add a small test
* styling
* style merge
* fix merge test
* fmt
* nits
* update test
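In short, after this change the Python bindings let you clear a tokenizer's normalizer or pre_tokenizer by assigning None, and access (and mutate) the items of a Sequence by index. A minimal usage sketch, assuming the tokenizers Python bindings built from this branch; the exact component choices below are illustrative:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Sequence, BertNormalizer, Strip
from tokenizers.pre_tokenizers import Metaspace

tokenizer = Tokenizer(BPE())

# Assigning None resets a previously set component.
tokenizer.normalizer = Strip()
tokenizer.normalizer = None           # normalizer is now unset
tokenizer.pre_tokenizer = Metaspace()
tokenizer.pre_tokenizer = None        # pre_tokenizer is now unset

# Sequence supports indexing, and items can be edited in place.
normalizers = Sequence([BertNormalizer(lowercase=True), Strip()])
normalizers[0].lowercase = False      # tweak the first normalizer
print(normalizers[1].__class__)       # Strip; an out-of-range index raises IndexError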
@@ -67,6 +67,14 @@ class TestSequence:
         output = normalizer.normalize_str(" HELLO ")
         assert output == "hello"
 
+    def test_items(self):
+        normalizers = Sequence([BertNormalizer(True, True), Prepend()])
+        assert normalizers[1].__class__ == Prepend
+        normalizers[0].lowercase = False
+        assert not normalizers[0].lowercase
+        with pytest.raises(IndexError):
+            print(normalizers[2])
+
 
 class TestLowercase:
     def test_instantiate(self):
@@ -169,6 +169,13 @@ class TestSequence:
             ("?", (29, 30)),
         ]
 
+    def test_items(self):
+        pre_tokenizers = Sequence([Metaspace("a", "never", split=True), Punctuation()])
+        assert pre_tokenizers[1].__class__ == Punctuation
+        assert pre_tokenizers[0].__class__ == Metaspace
+        pre_tokenizers[0].split = False
+        assert not pre_tokenizers[0].split
+
 
 class TestDigits:
     def test_instantiate(self):
@@ -6,10 +6,11 @@ import pytest
 from tokenizers import AddedToken, Encoding, Tokenizer
 from tokenizers.implementations import BertWordPieceTokenizer
 from tokenizers.models import BPE, Model, Unigram
-from tokenizers.pre_tokenizers import ByteLevel
+from tokenizers.pre_tokenizers import ByteLevel, Metaspace
 from tokenizers.processors import RobertaProcessing, TemplateProcessing
+from tokenizers.normalizers import Strip, Lowercase, Sequence
 
 
 from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files
@@ -551,6 +552,16 @@ class TestTokenizer:
         assert output == "name is john"
         assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)
 
+    def test_setting_to_none(self):
+        tokenizer = Tokenizer(BPE())
+        tokenizer.normalizer = Strip()
+        tokenizer.normalizer = None
+        assert tokenizer.normalizer == None
+
+        tokenizer.pre_tokenizer = Metaspace()
+        tokenizer.pre_tokenizer = None
+        assert tokenizer.pre_tokenizer == None
+
 
 class TestTokenizerRepr:
     def test_repr(self):