Adding ByteFallback support for tokenizers. (#1183)

* Adding ByteFallback support for `tokenizers`.

Two items added:

- A flag `byte_fallback` for the `BPE` model. This will be in charge
  of using `<0x61>` instead of unk on unknown tokens.
- A ByteFallback decoder, which will be in charge of putting everything
  back into string whenever possible. Showing � when the byte decoding
  fails (behavior checked against LlamaTokenizer in `transformers`.

* Update rustdoc.

* Clippy + Add BPE(byte_fallback) into bindings.

* Stupid file.

* Test artifacts removed.

* Update stub.

* Fix.

* Bad file.

* CRITICAL FIX: wrapper order because of untagged....

* Remove prints.

* Fixing <16 byte fallback.
This commit is contained in:
Nicolas Patry
2023-03-23 16:04:32 +01:00
committed by GitHub
parent b8fbea00a9
commit 73637a0004
16 changed files with 359 additions and 21 deletions

View File

@@ -3,7 +3,7 @@ import pickle
import pytest
from tokenizers.decoders import CTC, BPEDecoder, ByteLevel, Decoder, Metaspace, Sequence, WordPiece
from tokenizers.decoders import CTC, BPEDecoder, ByteLevel, Decoder, Metaspace, Sequence, WordPiece, ByteFallback
class TestByteLevel:
@@ -54,6 +54,24 @@ class TestWordPiece:
assert decoder.cleanup == True
class TestByteFallback:
def test_instantiate(self):
assert ByteFallback() is not None
assert isinstance(ByteFallback(), Decoder)
assert isinstance(ByteFallback(), ByteFallback)
assert isinstance(pickle.loads(pickle.dumps(ByteFallback())), ByteFallback)
def test_decoding(self):
decoder = ByteFallback()
assert decoder.decode(["My", " na", "me"]) == "My name"
assert decoder.decode(["<0x61>"]) == "a"
assert decoder.decode(["<0xE5>"]) == "<EFBFBD>"
assert decoder.decode(["<0xE5>", "<0x8f>"]) == "<EFBFBD><EFBFBD>"
assert decoder.decode(["<0xE5>", "<0x8f>", "<0xab>"]) == ""
assert decoder.decode(["<0xE5>", "<0x8f>", "a"]) == "<EFBFBD><EFBFBD>a"
assert decoder.decode(["<0xE5>", "<0x8f>", "<0xab>", "a"]) == "叫a"
class TestMetaspace:
def test_instantiate(self):
assert Metaspace() is not None

View File

@@ -54,6 +54,7 @@ class TestBPE:
assert model.continuing_subword_prefix == "__prefix__"
assert model.end_of_word_suffix == "__suffix__"
assert model.fuse_unk == False
assert model.byte_fallback == False
# Modify these
model.dropout = 0.1
@@ -66,6 +67,8 @@ class TestBPE:
assert model.end_of_word_suffix == "suff"
model.fuse_unk = True
assert model.fuse_unk == True
model.byte_fallback = True
assert model.byte_fallback == True
class TestWordPiece: