Adding 2 new decoders: (#1196)

* Adding 2 new decoders:

- Fuse will simply concatenate all tokens into 1 string
- Strip will remove n characters from the left or right of each token

Sequence(Replace("_", " "), Fuse(), Strip(1, 0)) should be what we want
for the `Metaspace` thing.

- Note: Added a new dependency for better parsing of decoders.
This is due to untagged enums, which can match anything; the `MustBe`
ensures there's no issue between Fuse and ByteFallback.
Since both are new, the chance of backward incompatibility is low.

* Fixing pickling/unpickling (using default args.).

* Stub.

* Black.

* Fixing node.
This commit is contained in:
Nicolas Patry
2023-03-24 00:50:54 +01:00
committed by GitHub
parent d2c8190a0f
commit e4aea890d5
13 changed files with 311 additions and 7 deletions

View File

@@ -13,6 +13,8 @@ from tokenizers.decoders import (
WordPiece,
ByteFallback,
Replace,
Strip,
Fuse,
)
@@ -94,6 +96,30 @@ class TestByteFallback:
assert decoder.decode(["<0xE5>", "<0x8f>", "<0xab>", "a"]) == "叫a"
class TestFuse:
    """Tests for the ``Fuse`` decoder."""

    def test_instantiate(self):
        """``Fuse`` takes no arguments, is a ``Decoder``, and survives a pickle round trip."""
        fuse = Fuse()
        assert fuse is not None
        assert isinstance(fuse, Decoder)
        assert isinstance(fuse, Fuse)

        # Serialize and restore to check that pickling keeps the concrete type.
        restored = pickle.loads(pickle.dumps(Fuse()))
        assert isinstance(restored, Fuse)

    def test_decoding(self):
        """Decoding joins the tokens into one string with nothing inserted between them."""
        tokens = ["My", " na", "me"]
        fused = Fuse().decode(tokens)
        assert fused == "My name"
class TestStrip:
    """Tests for the ``Strip`` decoder."""

    def test_instantiate(self):
        """``Strip`` builds from ``left``/``right`` keywords, is a ``Decoder``, and survives pickling."""
        strip = Strip(left=0, right=0)
        assert strip is not None
        assert isinstance(strip, Decoder)
        assert isinstance(strip, Strip)

        # Serialize and restore to check that pickling keeps the concrete type.
        restored = pickle.loads(pickle.dumps(Strip(left=0, right=0)))
        assert isinstance(restored, Strip)

    def test_decoding(self):
        """With ``left=1, right=0`` each token loses its first character before joining."""
        tokens = ["My", " na", "me"]
        stripped = Strip(left=1, right=0).decode(tokens)
        # "My" -> "y", " na" -> "na", "me" -> "e"
        assert stripped == "ynae"
class TestMetaspace:
def test_instantiate(self):
assert Metaspace() is not None