Adding ByteFallback support for tokenizers. (#1183)

* Adding ByteFallback support for `tokenizers`. Two items added: - A flag `byte_fallback` for the `BPE` model. This will be in charge of using `<0x61>` instead of unk on unknown tokens. - A ByteFallback decoder, which will be in charge of putting everything back into a string whenever possible, showing � when the byte decoding fails (behavior checked against LlamaTokenizer in `transformers`). * Update rustdoc. * Clippy + Add BPE(byte_fallback) into bindings. * Stupid file. * Test artifacts removed. * Update stub. * Fix. * Bad file. * CRITICAL FIX: wrapper order because of untagged.... * Remove prints. * Fixing <16 byte fallback.
2025-12-18 06:19:14 +00:00 · 2023-03-23 16:04:32 +01:00
parent b8fbea00a9
commit 73637a0004
16 changed files with 359 additions and 21 deletions
--- a/bindings/python/tests/bindings/test_decoders.py
+++ b/bindings/python/tests/bindings/test_decoders.py
@@ -3,7 +3,7 @@ import pickle

 import pytest

-from tokenizers.decoders import CTC, BPEDecoder, ByteLevel, Decoder, Metaspace, Sequence, WordPiece
+from tokenizers.decoders import CTC, BPEDecoder, ByteLevel, Decoder, Metaspace, Sequence, WordPiece, ByteFallback


 class TestByteLevel:
@@ -54,6 +54,24 @@ class TestWordPiece:
        assert decoder.cleanup == True


+class TestByteFallback:
+    def test_instantiate(self):
+        assert ByteFallback() is not None
+        assert isinstance(ByteFallback(), Decoder)
+        assert isinstance(ByteFallback(), ByteFallback)
+        assert isinstance(pickle.loads(pickle.dumps(ByteFallback())), ByteFallback)
+
+    def test_decoding(self):
+        decoder = ByteFallback()
+        assert decoder.decode(["My", " na", "me"]) == "My name"
+        assert decoder.decode(["<0x61>"]) == "a"
+        assert decoder.decode(["<0xE5>"]) == "�"
+        assert decoder.decode(["<0xE5>", "<0x8f>"]) == "��"
+        assert decoder.decode(["<0xE5>", "<0x8f>", "<0xab>"]) == "叫"
+        assert decoder.decode(["<0xE5>", "<0x8f>", "a"]) == "��a"
+        assert decoder.decode(["<0xE5>", "<0x8f>", "<0xab>", "a"]) == "叫a"
+
+
 class TestMetaspace:
    def test_instantiate(self):
        assert Metaspace() is not None
--- a/bindings/python/tests/bindings/test_models.py
+++ b/bindings/python/tests/bindings/test_models.py
@@ -54,6 +54,7 @@ class TestBPE:
        assert model.continuing_subword_prefix == "__prefix__"
        assert model.end_of_word_suffix == "__suffix__"
        assert model.fuse_unk == False
+        assert model.byte_fallback == False

        # Modify these
        model.dropout = 0.1
@@ -66,6 +67,8 @@ class TestBPE:
        assert model.end_of_word_suffix == "suff"
        model.fuse_unk = True
        assert model.fuse_unk == True
+        model.byte_fallback = True
+        assert model.byte_fallback == True


 class TestWordPiece: