From a9f4c5950a76616308da39d896ee6d81e46ffb80 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Tue, 31 Mar 2020 18:20:45 -0400
Subject: [PATCH] Python - Test Decoders

---
NOTE(review): this patch was recovered from a copy whose line breaks were
collapsed and whose "<...>" spans had been stripped. The "</w>" suffix
markers in the BPEDecoder hunk and in TestBPEDecoder.test_decoding were
restored; the diffstat (1 insertion / 1 deletion in decoders.rs) shows an
added line was missing. The exact text of the removed Rust line is a
reconstruction -- confirm against upstream commit a9f4c595 before applying.
The author e-mail on the From: line was also stripped and is not recoverable
from this copy.

 bindings/python/src/decoders.rs               |  2 +-
 .../python/tests/bindings/test_decoders.py    | 61 +++++++++++++++++++
 2 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 bindings/python/tests/bindings/test_decoders.py

diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 502758db..215aaff4 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -107,7 +107,7 @@ impl BPEDecoder {
     #[new]
     #[args(kwargs = "**")]
     fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
-        let mut suffix = String::from("</w");
+        let mut suffix = String::from("</w>");
 
         if let Some(kwargs) = kwargs {
             for (key, value) in kwargs {
diff --git a/bindings/python/tests/bindings/test_decoders.py b/bindings/python/tests/bindings/test_decoders.py
new file mode 100644
index 00000000..6c8ba060
--- /dev/null
+++ b/bindings/python/tests/bindings/test_decoders.py
@@ -0,0 +1,61 @@
+import pytest
+
+from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder
+
+
+class TestByteLevel:
+    def test_instantiate(self):
+        assert ByteLevel() is not None
+        assert isinstance(ByteLevel(), Decoder)
+
+    def test_decoding(self):
+        decoder = ByteLevel()
+        assert decoder.decode(["My", "Ġname", "Ġis", "ĠJohn"]) == "My name is John"
+
+
+class TestWordPiece:
+    def test_instantiate(self):
+        assert WordPiece() is not None
+        assert WordPiece(prefix="__") is not None
+        assert WordPiece(cleanup=True) is not None
+        assert isinstance(WordPiece(), Decoder)
+
+    def test_decoding(self):
+        decoder = WordPiece()
+        assert decoder.decode(["My", "na", "##me", "is", "Jo", "##hn"]) == "My name is John"
+        assert decoder.decode(["I", "'m", "Jo", "##hn"]) == "I'm John"
+        decoder = WordPiece(prefix="__", cleanup=False)
+        assert decoder.decode(["My", "na", "__me", "is", "Jo", "__hn"]) == "My name is John"
+        assert decoder.decode(["I", "'m", "Jo", "__hn"]) == "I 'm John"
+
+
+class TestMetaspace:
+    def test_instantiate(self):
+        assert Metaspace() is not None
+        assert Metaspace(replacement="-") is not None
+        with pytest.raises(Exception, match="replacement must be a character"):
+            Metaspace(replacement="")
+        assert Metaspace(add_prefix_space=True) is not None
+        assert isinstance(Metaspace(), Decoder)
+
+    def test_decoding(self):
+        decoder = Metaspace()
+        assert decoder.decode(["▁My", "▁name", "▁is", "▁John"]) == "My name is John"
+        decoder = Metaspace(replacement="-", add_prefix_space=False)
+        assert decoder.decode(["-My", "-name", "-is", "-John"]) == " My name is John"
+
+
+class TestBPEDecoder:
+    def test_instantiate(self):
+        assert BPEDecoder() is not None
+        assert BPEDecoder(suffix="_") is not None
+        assert isinstance(BPEDecoder(), Decoder)
+
+    def test_decoding(self):
+        decoder = BPEDecoder()
+        assert (
+            decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"])
+            == "My name is John"
+        )
+        decoder = BPEDecoder(suffix="_")
+        assert decoder.decode(["My_", "na", "me_", "is_", "Jo", "hn_"]) == "My name is John"