mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-18 06:19:14 +00:00
Adding ByteFallback support for tokenizers. (#1183)
* Adding ByteFallback support for `tokenizers`. Two items added: - A flag `byte_fallback` for the `BPE` model. This will be in charge of using `<0x61>` instead of unk on unknown tokens. - A ByteFallback decoder, which will be in charge of putting everything back into string whenever possible. Showing � when the byte decoding fails (behavior checked against LlamaTokenizer in `transformers`). * Update rustdoc. * Clippy + Add BPE(byte_fallback) into bindings. * Stupid file. * Test artifacts removed. * Update stub. * Fix. * Bad file. * CRITICAL FIX: wrapper order because of untagged.... * Remove prints. * Fixing <16 byte fallback.
This commit is contained in:
@@ -3,7 +3,7 @@ import pickle
|
||||
|
||||
import pytest
|
||||
|
||||
from tokenizers.decoders import CTC, BPEDecoder, ByteLevel, Decoder, Metaspace, Sequence, WordPiece
|
||||
from tokenizers.decoders import CTC, BPEDecoder, ByteLevel, Decoder, Metaspace, Sequence, WordPiece, ByteFallback
|
||||
|
||||
|
||||
class TestByteLevel:
|
||||
@@ -54,6 +54,24 @@ class TestWordPiece:
|
||||
assert decoder.cleanup == True
|
||||
|
||||
|
||||
class TestByteFallback:
    """Tests for the ``ByteFallback`` decoder."""

    def test_instantiate(self):
        """The decoder can be constructed, is a ``Decoder``, and survives a pickle round-trip."""
        assert ByteFallback() is not None
        assert isinstance(ByteFallback(), Decoder)
        assert isinstance(ByteFallback(), ByteFallback)
        assert isinstance(pickle.loads(pickle.dumps(ByteFallback())), ByteFallback)

    def test_decoding(self):
        """Check how ``<0xNN>`` byte tokens are turned back into text.

        Byte tokens that form valid UTF-8 are decoded to the corresponding
        characters; each byte token of a run that cannot be decoded is shown
        as U+FFFD (the Unicode replacement character).
        """
        decoder = ByteFallback()

        # Ordinary tokens pass through and are concatenated.
        assert decoder.decode(["My", " na", "me"]) == "My name"

        # A single ASCII byte token decodes to its character.
        assert decoder.decode(["<0x61>"]) == "a"

        # Incomplete multi-byte sequences: one U+FFFD per undecodable byte.
        # The escape is used instead of the literal character so the expected
        # value cannot be mangled by an encoding round-trip of this file.
        assert decoder.decode(["<0xE5>"]) == "\ufffd"
        assert decoder.decode(["<0xE5>", "<0x8f>"]) == "\ufffd\ufffd"

        # A complete three-byte sequence decodes to a single character.
        assert decoder.decode(["<0xE5>", "<0x8f>", "<0xab>"]) == "叫"

        # Byte runs followed by a regular token.
        assert decoder.decode(["<0xE5>", "<0x8f>", "a"]) == "\ufffd\ufffda"
        assert decoder.decode(["<0xE5>", "<0x8f>", "<0xab>", "a"]) == "叫a"
||||
|
||||
|
||||
class TestMetaspace:
|
||||
def test_instantiate(self):
|
||||
assert Metaspace() is not None
|
||||
|
||||
@@ -54,6 +54,7 @@ class TestBPE:
|
||||
assert model.continuing_subword_prefix == "__prefix__"
|
||||
assert model.end_of_word_suffix == "__suffix__"
|
||||
assert model.fuse_unk == False
|
||||
assert model.byte_fallback == False
|
||||
|
||||
# Modify these
|
||||
model.dropout = 0.1
|
||||
@@ -66,6 +67,8 @@ class TestBPE:
|
||||
assert model.end_of_word_suffix == "suff"
|
||||
model.fuse_unk = True
|
||||
assert model.fuse_unk == True
|
||||
model.byte_fallback = True
|
||||
assert model.byte_fallback == True
|
||||
|
||||
|
||||
class TestWordPiece:
|
||||
|
||||
Reference in New Issue
Block a user