mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-18 06:19:14 +00:00
Adding truncation_side within TruncationParams. (#860)
* Add truncation to enable_truncation
* Fix typo
* Adding truncation_side within `TruncationParams`.
* Node serialization of this direction param.
* Update the test.
* Fixing warnings/lint.
* Adding stuff (can't local debug :( )
* Slow loop... ;(
* Stub.py.

Co-authored-by: Niels Rogge <niels.rogge1@gmail.com>
This commit is contained in:
@@ -154,7 +154,7 @@ class TestTemplateProcessing:
|
||||
with pytest.raises(Exception, match="Cannot build Piece"):
|
||||
processor = TemplateProcessing(single="[CLS] $A: [SEP]")
|
||||
# Special tokens must be provided when used in template:
|
||||
- with pytest.raises(Exception, match="Missing SpecialToken\(s\) with id\(s\)"):
|
||||
+ with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"):
|
||||
processor = TemplateProcessing(single=["[CLS]"])
|
||||
|
||||
def test_bert_parity(self):
|
||||
|
||||
@@ -125,7 +125,9 @@ class TestTokenizer:
|
||||
assert type(output.ids) == list
|
||||
assert type(output.type_ids) == list
|
||||
assert type(output.offsets) == list
|
||||
- assert type(output.words) == list
|
||||
+ with pytest.warns(DeprecationWarning):
|
||||
+     assert type(output.words) == list
|
||||
assert type(output.word_ids) == list
|
||||
assert type(output.special_tokens_mask) == list
|
||||
assert type(output.attention_mask) == list
|
||||
assert type(output.overflowing) == list
|
||||
@@ -311,6 +313,14 @@ class TestTokenizer:
|
||||
trunc = tokenizer.truncation
|
||||
tokenizer.enable_truncation(**trunc)
|
||||
|
||||
# Left truncation direction
|
||||
tokenizer.enable_truncation(2, direction="left")
|
||||
output = tokenizer.encode("my name is john")
|
||||
assert output.tokens == ["is", "john"]
|
||||
|
||||
output = tokenizer.encode("my name is john", "pair")
|
||||
assert output.tokens == ["john", "pair"]
|
||||
|
||||
def test_padding(self):
|
||||
tokenizer = Tokenizer(BPE())
|
||||
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
|
||||
|
||||
Reference in New Issue
Block a user