mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-18 06:19:14 +00:00
Adding 2 new decoders: (#1196)
* Adding 2 new decoders:
- Fuse will simply concatenate all tokens into 1 string
- Strip will remove n chars from the left or right of each token
Sequence(Replace("_", " "), Fuse(), Strip(1, 0)) should be what we want
for the `Metaspace` thing.
- Note: Added a new dependency for better parsing of decoders.
This is due to untagged enums, which can match anything; the `MustBe`
ensures there's no issue between Fuse and ByteFallback.
Since both are new, the chance of backward incompatibility is low.
* Fixing pickling/unpickling (using default args).
* Stub.
* Black.
* Fixing node.
This commit is contained in:
@@ -13,6 +13,8 @@ from tokenizers.decoders import (
|
||||
WordPiece,
|
||||
ByteFallback,
|
||||
Replace,
|
||||
Strip,
|
||||
Fuse,
|
||||
)
|
||||
|
||||
|
||||
@@ -94,6 +96,30 @@ class TestByteFallback:
|
||||
assert decoder.decode(["<0xE5>", "<0x8f>", "<0xab>", "a"]) == "叫a"
|
||||
|
||||
|
||||
class TestFuse:
    """Tests for the ``Fuse`` decoder, which joins all tokens into one string."""

    def test_instantiate(self):
        """``Fuse`` builds with no arguments, is a ``Decoder``, and survives pickling."""
        assert Fuse() is not None
        # A fresh instance is checked against the base class and the concrete class.
        for expected_type in (Decoder, Fuse):
            assert isinstance(Fuse(), expected_type)
        # Round-trip through pickle must give back a ``Fuse``.
        serialized = pickle.dumps(Fuse())
        restored = pickle.loads(serialized)
        assert isinstance(restored, Fuse)

    def test_decoding(self):
        """Decoding concatenates the tokens without inserting anything between them."""
        fuse = Fuse()
        tokens = ["My", " na", "me"]
        fused = fuse.decode(tokens)
        assert fused == "My name"
|
||||
|
||||
|
||||
class TestStrip:
    """Tests for the ``Strip`` decoder, which removes characters from token edges."""

    def test_instantiate(self):
        """``Strip`` builds from ``left``/``right``, is a ``Decoder``, and survives pickling."""
        assert Strip(left=0, right=0) is not None
        # A fresh instance is checked against the base class and the concrete class.
        for expected_type in (Decoder, Strip):
            assert isinstance(Strip(left=0, right=0), expected_type)
        # Round-trip through pickle must give back a ``Strip``.
        serialized = pickle.dumps(Strip(left=0, right=0))
        restored = pickle.loads(serialized)
        assert isinstance(restored, Strip)

    def test_decoding(self):
        """With ``left=1`` the first character of every token is dropped before joining."""
        stripper = Strip(left=1, right=0)
        tokens = ["My", " na", "me"]
        stripped = stripper.decode(tokens)
        # "My" -> "y", " na" -> "na", "me" -> "e"
        assert stripped == "ynae"
|
||||
|
||||
|
||||
class TestMetaspace:
|
||||
def test_instantiate(self):
|
||||
assert Metaspace() is not None
|
||||
|
||||
Reference in New Issue
Block a user