Adding truncation_side within TruncationParams. (#860)

* Add truncation to enable_truncation

* Fix typo

* Adding truncation_side within `TruncationParams`.

* Node serialization of this direction param.

* Update the test.

* Fixing warnings/lint.

* Adding stuff (can't debug locally :( )

* Slow loop... ;(

* Stub.py.

Co-authored-by: Niels Rogge <niels.rogge1@gmail.com>
Nicolas Patry, 2021-12-28 12:37:06 +01:00, committed by GitHub
parent c4c9de23a5 · commit 152880ab3e
16 changed files with 478 additions and 26 deletions
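
The user-facing change, as exercised by the updated tests below: `enable_truncation` gains a `direction` keyword (`"right"`, the previous behavior, or `"left"`), backed by the new field on `TruncationParams`. A minimal sketch of the Python API; the exact contents of the dict returned by `tokenizer.truncation` are an assumption inferred from the `**trunc` round-trip in the test:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
# New in this commit: choose which side of the sequence gets cut.
tokenizer.enable_truncation(512, direction="left")

# The getter returns the active params as a dict; per the updated test,
# splatting it back restores the same configuration, direction included.
params = tokenizer.truncation
tokenizer.enable_truncation(**params)
```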


@@ -154,7 +154,7 @@ class TestTemplateProcessing:
with pytest.raises(Exception, match="Cannot build Piece"):
processor = TemplateProcessing(single="[CLS] $A: [SEP]")
# Special tokens must be provided when used in template:
with pytest.raises(Exception, match="Missing SpecialToken\(s\) with id\(s\)"):
with pytest.raises(Exception, match="Missing SpecialToken\\(s\\) with id\\(s\\)"):
processor = TemplateProcessing(single=["[CLS]"])
def test_bert_parity(self):
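
For context on this two-character fix: `pytest.raises(match=...)` treats the pattern as a regular expression, so the parentheses must be escaped for the regex engine, but inside a normal (non-raw) Python string literal `\(` is itself an invalid escape sequence that triggers a `DeprecationWarning` under lint or warnings-as-errors runs. Doubling the backslash keeps the regex identical while making the string literal valid. A standalone illustration:

```python
import re

# "\\(" in a normal string literal yields the two characters \( ,
# which the regex engine reads as a literal parenthesis. The raw
# string r"Missing SpecialToken\(s\) with id\(s\)" is equivalent.
pattern = "Missing SpecialToken\\(s\\) with id\\(s\\)"
assert re.search(pattern, "Missing SpecialToken(s) with id(s): 0")
```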


@@ -125,7 +125,9 @@ class TestTokenizer:
         assert type(output.ids) == list
         assert type(output.type_ids) == list
         assert type(output.offsets) == list
-        assert type(output.words) == list
+        with pytest.warns(DeprecationWarning):
+            assert type(output.words) == list
+        assert type(output.word_ids) == list
         assert type(output.special_tokens_mask) == list
         assert type(output.attention_mask) == list
         assert type(output.overflowing) == list
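
The wrapper is needed because reading `Encoding.words` now emits a `DeprecationWarning` (it is superseded by `word_ids`), and `pytest.warns` both tolerates the warning and asserts that it is actually raised. A sketch of the pattern, assuming the deprecated accessor still returns the same data as its replacement:

```python
import pytest

def check_word_accessors(output):
    # Deprecated accessor: must be read inside pytest.warns so the
    # suite stays green when warnings are promoted to errors.
    with pytest.warns(DeprecationWarning):
        legacy = output.words
    # New accessor, no warning; same content as the deprecated one.
    assert legacy == output.word_ids
```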
@@ -311,6 +313,14 @@ class TestTokenizer:
         trunc = tokenizer.truncation
         tokenizer.enable_truncation(**trunc)
+
+        # Left truncation direction
+        tokenizer.enable_truncation(2, direction="left")
+        output = tokenizer.encode("my name is john")
+        assert output.tokens == ["is", "john"]
+
+        output = tokenizer.encode("my name is john", "pair")
+        assert output.tokens == ["john", "pair"]

     def test_padding(self):
         tokenizer = Tokenizer(BPE())
         tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
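
Read together, the new assertions pin down the semantics: with `max_length=2`, the default direction keeps the first two tokens, while `direction="left"` keeps the last two (for a pair, the tail of whatever survives truncation in each sequence). A runnable sketch mirroring the test setup; the default-direction assertion echoes the pre-existing part of this test that the hunk does not show:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

tokenizer = Tokenizer(BPE())
tokenizer.add_tokens(["my", "name", "is", "john", "pair"])

tokenizer.enable_truncation(2)  # default: truncate from the right
assert tokenizer.encode("my name is john").tokens == ["my", "name"]

tokenizer.enable_truncation(2, direction="left")  # new in this commit
assert tokenizer.encode("my name is john").tokens == ["is", "john"]
```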