Support None to reset pre_tokenizers and normalizers, and index sequences (#1590)

* initial commit

* support None

* fix clippy

* cleanup

* clean?

* propagate to pre_tokenizer

* fix test

* fix rust tests

* fix node

* propagate to decoder and post processor

* fix calls

* lint

* fmt

* node be happy I am fixing you

* add a small test

* styling

* style merge

* fix merge test

* fmt

* nits

* update test
Arthur
2024-08-07 12:52:35 +02:00
committed by GitHub
parent eea8e1ae6f
commit bded212356
13 changed files with 134 additions and 67 deletions
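
In short, after this change a tokenizer's normalizer or pre_tokenizer can be cleared by assigning None, and the items of a Sequence can be read (and mutated) by index. A minimal usage sketch, mirroring the tests added in this diff; the particular normalizers used below are illustrative:

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Sequence, Lowercase, Strip

# Assigning None resets a previously configured component
# (the same applies to tokenizer.pre_tokenizer).
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Strip()
tokenizer.normalizer = None
assert tokenizer.normalizer is None

# A normalizer Sequence supports indexing; items come back as their
# concrete class and can be modified in place.
normalizers = Sequence([Lowercase(), Strip()])
assert normalizers[0].__class__ == Lowercase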


@@ -67,6 +67,14 @@ class TestSequence:
        output = normalizer.normalize_str(" HELLO ")
        assert output == "hello"

    def test_items(self):
        normalizers = Sequence([BertNormalizer(True, True), Prepend()])
        assert normalizers[1].__class__ == Prepend
        normalizers[0].lowercase = False
        assert not normalizers[0].lowercase
        with pytest.raises(IndexError):
            print(normalizers[2])


class TestLowercase:
    def test_instantiate(self):


@@ -169,6 +169,13 @@ class TestSequence:
            ("?", (29, 30)),
        ]

    def test_items(self):
        pre_tokenizers = Sequence([Metaspace("a", "never", split=True), Punctuation()])
        assert pre_tokenizers[1].__class__ == Punctuation
        assert pre_tokenizers[0].__class__ == Metaspace
        pre_tokenizers[0].split = False
        assert not pre_tokenizers[0].split


class TestDigits:
    def test_instantiate(self):


@@ -6,10 +6,11 @@ import pytest
from tokenizers import AddedToken, Encoding, Tokenizer
from tokenizers.implementations import BertWordPieceTokenizer
from tokenizers.models import BPE, Model, Unigram
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.pre_tokenizers import ByteLevel, Metaspace
from tokenizers.processors import RobertaProcessing, TemplateProcessing
from tokenizers.normalizers import Strip, Lowercase, Sequence
from ..utils import bert_files, data_dir, multiprocessing_with_parallelism, roberta_files
@@ -551,6 +552,16 @@ class TestTokenizer:
        assert output == "name is john"
        assert tokenizer.get_added_tokens_decoder()[0] == AddedToken("my", special=True)

    def test_setting_to_none(self):
        tokenizer = Tokenizer(BPE())
        tokenizer.normalizer = Strip()
        tokenizer.normalizer = None
        assert tokenizer.normalizer == None

        tokenizer.pre_tokenizer = Metaspace()
        tokenizer.pre_tokenizer = None
        assert tokenizer.pre_tokenizer == None


class TestTokenizerRepr:
    def test_repr(self):