Adding truncation_side within TruncationParams. (#860)

* Add truncation to enable_truncation

* Fix typo

* Adding truncation_side within `TruncationParams`.

* Node serialization of this direction param.

* Update the test.

* Fixing warnings/lint.

* Adding stuff (can't debug locally :( )

* Slow loop... ;(

* Stub.py.

Co-authored-by: Niels Rogge <niels.rogge1@gmail.com>
Nicolas Patry, 2021-12-28 12:37:06 +01:00, committed by GitHub
parent c4c9de23a5 · commit 152880ab3e
16 changed files with 478 additions and 26 deletions
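
The user-facing change, as exercised by the updated tests below: `enable_truncation` gains a `direction` keyword (`"right"`, the previous behavior, or `"left"`), backed by the new field on `TruncationParams`. A minimal sketch of the Python API; the exact contents of the dict returned by `tokenizer.truncation` are an assumption inferred from the `**trunc` round-trip in the test:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
# New in this commit: choose which side of the sequence gets cut.
tokenizer.enable_truncation(512, direction="left")

# The getter returns the active params as a dict; per the updated test,
# splatting it back restores the same configuration, direction included.
params = tokenizer.truncation
tokenizer.enable_truncation(**params)
```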


@@ -154,7 +154,7 @@ class TestTemplateProcessing:
with pytest.raises(Exception, match="Cannot build Piece"):
processor = TemplateProcessing(single="[CLS] $A: [SEP]")
# Special tokens must be provided when used in template:
with pytest.raises(Exception, match="Missing SpecialToken\(s\) with id\(s\)"):
with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"):
processor = TemplateProcessing(single=["[CLS]"])
def test_bert_parity(self):
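
For context on this two-character fix: `pytest.raises(match=...)` treats the pattern as a regular expression, so the parentheses must be escaped for the regex engine, but inside a normal (non-raw) Python string literal `\(` is itself an invalid escape sequence that triggers a `DeprecationWarning` under lint or warnings-as-errors runs. Doubling the backslash keeps the regex identical while making the string literal valid. A standalone illustration:

```python
import re

# "\\(" in a normal string literal yields the two characters \( ,
# which the regex engine reads as a literal parenthesis. The raw
# string r"Missing SpecialToken\(s\) with id\(s\)" is equivalent.
pattern = "Missing SpecialToken\\(s\\) with id\\(s\\)"
assert re.search(pattern, "Missing SpecialToken(s) with id(s): 0")
```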


@@ -125,7 +125,9 @@ class TestTokenizer:
         assert type(output.ids) == list
         assert type(output.type_ids) == list
         assert type(output.offsets) == list
-        assert type(output.words) == list
+        with pytest.warns(DeprecationWarning):
+            assert type(output.words) == list
+        assert type(output.word_ids) == list
         assert type(output.special_tokens_mask) == list
         assert type(output.attention_mask) == list
         assert type(output.overflowing) == list
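
The wrapper is needed because reading `Encoding.words` now emits a `DeprecationWarning` (it is superseded by `word_ids`), and `pytest.warns` both tolerates the warning and asserts that it is actually raised. A sketch of the pattern, assuming the deprecated accessor still returns the same data as its replacement:

```python
import pytest

def check_word_accessors(output):
    # Deprecated accessor: must be read inside pytest.warns so the
    # suite stays green when warnings are promoted to errors.
    with pytest.warns(DeprecationWarning):
        legacy = output.words
    # New accessor, no warning; same content as the deprecated one.
    assert legacy == output.word_ids
```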
@@ -311,6 +313,14 @@ class TestTokenizer:
         trunc = tokenizer.truncation
         tokenizer.enable_truncation(**trunc)
+
+        # Left truncation direction
+        tokenizer.enable_truncation(2, direction="left")
+        output = tokenizer.encode("my name is john")
+        assert output.tokens == ["is", "john"]
+
+        output = tokenizer.encode("my name is john", "pair")
+        assert output.tokens == ["john", "pair"]

     def test_padding(self):
         tokenizer = Tokenizer(BPE())
         tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
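
Read together, the new assertions pin down the semantics: with `max_length=2`, the default direction keeps the first two tokens, while `direction="left"` keeps the last two (for a pair, the tail of whatever survives truncation in each sequence). A runnable sketch mirroring the test setup; the default-direction assertion echoes the pre-existing part of this test that the hunk does not show:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

tokenizer.enable_truncation(2)  # default: truncate from the right
assert tokenizer.encode("my name is john").tokens == ["my", "name"]

tokenizer.enable_truncation(2, direction="left")  # new in this commit
assert tokenizer.encode("my name is john").tokens == ["is", "john"]
```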