mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-06 20:58:22 +00:00
Python - Test Decoders
This commit is contained in:
@@ -107,7 +107,7 @@ impl BPEDecoder {
|
||||
#[new]
|
||||
#[args(kwargs = "**")]
|
||||
fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
|
||||
let mut suffix = String::from("</w");
|
||||
let mut suffix = String::from("</w>");
|
||||
|
||||
if let Some(kwargs) = kwargs {
|
||||
for (key, value) in kwargs {
|
||||
|
||||
61
bindings/python/tests/bindings/test_decoders.py
Normal file
61
bindings/python/tests/bindings/test_decoders.py
Normal file
@@ -0,0 +1,61 @@
|
||||
import pytest
|
||||
|
||||
from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder
|
||||
|
||||
|
||||
class TestByteLevel:
    """Tests for the ``ByteLevel`` decoder exposed by ``tokenizers.decoders``."""

    def test_instantiate(self):
        """A default ``ByteLevel`` can be built and is a ``Decoder`` instance."""
        built = ByteLevel()
        assert built is not None

        # A second, fresh instance is used for the type check.
        fresh = ByteLevel()
        assert isinstance(fresh, Decoder)

    def test_decoding(self):
        """Decoding the sample tokens yields the expected sentence."""
        tokens = ["My", "Ġname", "Ġis", "ĠJohn"]
        expected = "My name is John"

        byte_level = ByteLevel()
        decoded = byte_level.decode(tokens)
        assert decoded == expected
|
||||
|
||||
|
||||
class TestWordPiece:
    """Tests for the ``WordPiece`` decoder exposed by ``tokenizers.decoders``."""

    def test_instantiate(self):
        """``WordPiece`` accepts no arguments, ``prefix`` or ``cleanup``."""
        default_decoder = WordPiece()
        assert default_decoder is not None

        with_prefix = WordPiece(prefix="__")
        assert with_prefix is not None

        with_cleanup = WordPiece(cleanup=True)
        assert with_cleanup is not None

        # A fresh default instance is used for the type check.
        assert isinstance(WordPiece(), Decoder)

    def test_decoding(self):
        """Check decoded text for the default decoder and a customised one."""
        # Default decoder: continuation pieces are marked with "##".
        default_decoder = WordPiece()
        default_cases = (
            (["My", "na", "##me", "is", "Jo", "##hn"], "My name is John"),
            (["I", "'m", "Jo", "##hn"], "I'm John"),
        )
        for tokens, expected in default_cases:
            assert default_decoder.decode(tokens) == expected

        # Custom prefix with cleanup disabled: the expected output keeps
        # the space in front of "'m".
        custom_decoder = WordPiece(prefix="__", cleanup=False)
        custom_cases = (
            (["My", "na", "__me", "is", "Jo", "__hn"], "My name is John"),
            (["I", "'m", "Jo", "__hn"], "I 'm John"),
        )
        for tokens, expected in custom_cases:
            assert custom_decoder.decode(tokens) == expected
|
||||
|
||||
|
||||
class TestMetaspace:
    """Tests for the ``Metaspace`` decoder exposed by ``tokenizers.decoders``."""

    def test_instantiate(self):
        """Valid arguments build a decoder; an empty replacement is rejected."""
        default_decoder = Metaspace()
        assert default_decoder is not None

        dash_decoder = Metaspace(replacement="-")
        assert dash_decoder is not None

        # An empty string is expected to raise with this message.
        with pytest.raises(Exception, match="replacement must be a character"):
            Metaspace(replacement="")

        prefixed_decoder = Metaspace(add_prefix_space=True)
        assert prefixed_decoder is not None

        # A fresh default instance is used for the type check.
        assert isinstance(Metaspace(), Decoder)

    def test_decoding(self):
        """Check decoded text for the default decoder and a customised one."""
        default_decoder = Metaspace()
        default_tokens = ["▁My", "▁name", "▁is", "▁John"]
        default_text = default_decoder.decode(default_tokens)
        assert default_text == "My name is John"

        # With replacement "-" and add_prefix_space=False the expected
        # output starts with a space.
        dash_decoder = Metaspace(replacement="-", add_prefix_space=False)
        dash_tokens = ["-My", "-name", "-is", "-John"]
        dash_text = dash_decoder.decode(dash_tokens)
        assert dash_text == " My name is John"
|
||||
|
||||
|
||||
class TestBPEDecoder:
    """Tests for the ``BPEDecoder`` exposed by ``tokenizers.decoders``."""

    def test_instantiate(self):
        """``BPEDecoder`` builds with no arguments or with a custom ``suffix``."""
        default_decoder = BPEDecoder()
        assert default_decoder is not None

        custom_decoder = BPEDecoder(suffix="_")
        assert custom_decoder is not None

        # A fresh default instance is used for the type check.
        assert isinstance(BPEDecoder(), Decoder)

    def test_decoding(self):
        """Check decoded text for the default suffix and for a custom one."""
        # Default decoder: sample tokens use the "</w>" suffix.
        default_decoder = BPEDecoder()
        default_tokens = ["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"]
        default_text = default_decoder.decode(default_tokens)
        assert default_text == "My name is John"

        # Same sentence with "_" as the suffix.
        custom_decoder = BPEDecoder(suffix="_")
        custom_tokens = ["My_", "na", "me_", "is_", "Jo", "hn_"]
        custom_text = custom_decoder.decode(custom_tokens)
        assert custom_text == "My name is John"
|
||||
Reference in New Issue
Block a user