Merge pull request #213 from huggingface/python-tests

Add Python tests
2025-09-03 07:49:22 +00:00 · 2020-04-02 14:09:22 -04:00
parent d3fb1d12f4 b03fea1d66
commit 2a4e5f81de
21 changed files with 781 additions and 13 deletions
--- a/.github/workflows/python.yml
+++ b/.github/workflows/python.yml
@ -59,16 +59,24 @@ jobs:
          path: ./bindings/python/target
          key: ${{ runner.os }}-cargo-python-build-${{ hashFiles('**/Cargo.toml') }}

-      - name: Build
-        uses: actions-rs/cargo@v1
-        with:
-          toolchain: nightly
-          command: build
-          args: --verbose --manifest-path ./bindings/python/Cargo.toml
-
      - name: Lint with RustFmt
        uses: actions-rs/cargo@v1
        with:
          toolchain: nightly
          command: fmt
          args: --manifest-path ./bindings/python/Cargo.toml -- --check
+
+      # Interpreter used to build the bindings and run the pytest suite below.
+      - name: Install Python
+        uses: actions/setup-python@v1
+        with:
+          # Quoted so YAML keeps it a string: a bare float would turn e.g. 3.10 into 3.1.
+          python-version: "3.6"
+          architecture: "x64"
+
+      - name: Run tests
+        working-directory: ./bindings/python
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          pip install pytest requests maturin
+          maturin develop --release
+          make test
--- a/.gitignore
+++ b/.gitignore
@ -8,6 +8,7 @@ Cargo.lock

 /data
 tokenizers/data
+bindings/python/tests/data
 /docs

 __pycache__
--- a/bindings/python/Makefile
+++ b/bindings/python/Makefile
@ -1,9 +1,13 @@
-.PHONY: style check-style
+.PHONY: style check-style test

 # Format source code automatically
-
 style:
-	black --line-length 100 --target-version py35 examples tokenizers
+	black --line-length 100 --target-version py35 examples tokenizers tests

+# Check the source code is formatted correctly
 check-style:
-	black --check --line-length 100 --target-version py35 examples tokenizers
+	black --check --line-length 100 --target-version py35 examples tokenizers tests
+
+# Launch the test suite
+test:
+	python -m pytest -s -v tests
--- a/bindings/python/setup.py
+++ b/bindings/python/setup.py
@ -1,6 +1,9 @@
 from setuptools import setup
 from setuptools_rust import Binding, RustExtension

+# Optional dependency groups, passed to `setup()` as `extras_require`.
+extras = {}
+# `requests` is needed as well: tests/utils.py imports it to download the test data.
+extras["testing"] = ["pytest", "requests"]
+
 setup(
    name="tokenizers",
    version="0.7.0-rc3",
@ -13,6 +16,7 @@ setup(
    url="https://github.com/huggingface/tokenizers",
    license="Apache License 2.0",
    rust_extensions=[RustExtension("tokenizers.tokenizers", binding=Binding.PyO3)],
+    extras_require=extras,
    classifiers=[
        "Development Status :: 5 - Production/Stable",
        "Intended Audience :: Developers",
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@ -107,7 +107,7 @@ impl BPEDecoder {
    #[new]
    #[args(kwargs = "**")]
    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
-        let mut suffix = String::from("</w");
+        let mut suffix = String::from("</w>");

        if let Some(kwargs) = kwargs {
            for (key, value) in kwargs {
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@ -288,4 +288,11 @@ impl WordLevel {
            }),
        }
    }
+
+    /// Instantiate an empty WordLevel Model.
+    ///
+    /// The returned `Model` owns a boxed `WordLevel::default()`.
+    #[staticmethod]
+    fn empty() -> Model {
+        Model {
+            model: Container::Owned(Box::new(tk::models::wordlevel::WordLevel::default())),
+        }
+    }
 }
--- a/bindings/python/src/tokenizer.rs
+++ b/bindings/python/src/tokenizer.rs
@ -46,6 +46,26 @@ impl AddedToken {
        obj.init({ AddedToken { token } });
        Ok(())
    }
+
+    /// Getter returning a borrow of the wrapped token's `content` string.
+    #[getter]
+    fn get_content(&self) -> &str {
+        &self.token.content
+    }
+
+    /// Getter returning the wrapped token's `rstrip` flag.
+    #[getter]
+    fn get_rstrip(&self) -> bool {
+        self.token.rstrip
+    }
+
+    /// Getter returning the wrapped token's `lstrip` flag.
+    #[getter]
+    fn get_lstrip(&self) -> bool {
+        self.token.lstrip
+    }
+
+    /// Getter returning the wrapped token's `single_word` flag.
+    #[getter]
+    fn get_single_word(&self) -> bool {
+        self.token.single_word
+    }
 }
 #[pyproto]
 impl PyObjectProtocol for AddedToken {
@ -54,9 +74,17 @@ impl PyObjectProtocol for AddedToken {
    }

    fn __repr__(&self) -> PyResult<String> {
+        // Formatting a Rust `bool` directly prints `true`/`false`; map the flags to the
+        // Python spellings `True`/`False` before interpolating them into the repr.
+        let bool_to_python = |p| match p {
+            true => "True",
+            false => "False",
+        };
+
        Ok(format!(
            "AddedToken(\"{}\", rstrip={}, lstrip={}, single_word={})",
-            self.token.content, self.token.rstrip, self.token.lstrip, self.token.single_word
+            self.token.content,
+            bool_to_python(self.token.rstrip),
+            bool_to_python(self.token.lstrip),
+            bool_to_python(self.token.single_word)
        ))
    }
 }
--- a/bindings/python/tests/init.py
+++ b/bindings/python/tests/init.py
--- a/bindings/python/tests/bindings/init.py
+++ b/bindings/python/tests/bindings/init.py
--- a/bindings/python/tests/bindings/test_decoders.py
+++ b/bindings/python/tests/bindings/test_decoders.py
@ -0,0 +1,61 @@
+import pytest
+
+from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder
+
+
+class TestByteLevel:
+    def test_instantiate(self):
+        assert ByteLevel() is not None
+        assert isinstance(ByteLevel(), Decoder)
+
+    def test_decoding(self):
+        decoder = ByteLevel()
+        assert decoder.decode(["My", "Ġname", "Ġis", "ĠJohn"]) == "My name is John"
+
+
+class TestWordPiece:
+    def test_instantiate(self):
+        assert WordPiece() is not None
+        assert WordPiece(prefix="__") is not None
+        assert WordPiece(cleanup=True) is not None
+        assert isinstance(WordPiece(), Decoder)
+
+    def test_decoding(self):
+        decoder = WordPiece()
+        assert decoder.decode(["My", "na", "##me", "is", "Jo", "##hn"]) == "My name is John"
+        assert decoder.decode(["I", "'m", "Jo", "##hn"]) == "I'm John"
+        decoder = WordPiece(prefix="__", cleanup=False)
+        assert decoder.decode(["My", "na", "__me", "is", "Jo", "__hn"]) == "My name is John"
+        assert decoder.decode(["I", "'m", "Jo", "__hn"]) == "I 'm John"
+
+
+class TestMetaspace:
+    def test_instantiate(self):
+        assert Metaspace() is not None
+        assert Metaspace(replacement="-") is not None
+        with pytest.raises(Exception, match="replacement must be a character"):
+            Metaspace(replacement="")
+        assert Metaspace(add_prefix_space=True) is not None
+        assert isinstance(Metaspace(), Decoder)
+
+    def test_decoding(self):
+        decoder = Metaspace()
+        assert decoder.decode(["▁My", "▁name", "▁is", "▁John"]) == "My name is John"
+        decoder = Metaspace(replacement="-", add_prefix_space=False)
+        assert decoder.decode(["-My", "-name", "-is", "-John"]) == " My name is John"
+
+
+class TestBPEDecoder:
+    def test_instantiate(self):
+        assert BPEDecoder() is not None
+        assert BPEDecoder(suffix="_") is not None
+        assert isinstance(BPEDecoder(), Decoder)
+
+    def test_decoding(self):
+        decoder = BPEDecoder()
+        assert (
+            decoder.decode(["My</w>", "na", "me</w>", "is</w>", "Jo", "hn</w>"])
+            == "My name is John"
+        )
+        decoder = BPEDecoder(suffix="_")
+        assert decoder.decode(["My_", "na", "me_", "is_", "Jo", "hn_"]) == "My name is John"
--- a/bindings/python/tests/bindings/test_models.py
+++ b/bindings/python/tests/bindings/test_models.py
@ -0,0 +1,23 @@
+from ..utils import data_dir, roberta_files, bert_files
+
+from tokenizers.models import Model, BPE, WordPiece, WordLevel
+
+
+class TestBPE:
+    def test_instantiate(self, roberta_files):
+        assert isinstance(BPE.empty(), Model)
+        assert isinstance(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]), Model)
+
+
+class TestWordPiece:
+    def test_instantiate(self, bert_files):
+        assert isinstance(WordPiece.empty(), Model)
+        assert isinstance(WordPiece.from_files(bert_files["vocab"]), Model)
+
+
+class TestWordLevel:
+    def test_instantiate(self, roberta_files):
+        assert isinstance(WordLevel.empty(), Model)
+        # The WordLevel model expects a vocab.json using the same format as roberta
+        # so we can just try to load with this file
+        assert isinstance(WordLevel.from_files(roberta_files["vocab"]), Model)
--- a/bindings/python/tests/bindings/test_normalizers.py
+++ b/bindings/python/tests/bindings/test_normalizers.py
@ -0,0 +1,82 @@
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.normalizers import BertNormalizer, Sequence, Lowercase, Strip
+
+
+class TestBertNormalizer:
+    def test_strip_accents(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = BertNormalizer(
+            strip_accents=True, lowercase=False, handle_chinese_chars=False, clean_text=False
+        )
+
+        output = tokenizer.normalize("Héllò")
+        assert output == "Hello"
+
+    def test_handle_chinese_chars(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = BertNormalizer(
+            strip_accents=False, lowercase=False, handle_chinese_chars=True, clean_text=False
+        )
+
+        output = tokenizer.normalize("你好")
+        assert output == " 你  好 "
+
+    def test_clean_text(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = BertNormalizer(
+            strip_accents=False, lowercase=False, handle_chinese_chars=False, clean_text=True
+        )
+
+        output = tokenizer.normalize("\ufeffHello")
+        assert output == "Hello"
+
+    def test_lowercase(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = BertNormalizer(
+            strip_accents=False, lowercase=True, handle_chinese_chars=False, clean_text=False
+        )
+
+        output = tokenizer.normalize("Héllò")
+        assert output == "héllò"
+
+
+class TestSequence:
+    def test_can_make_sequences(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = Sequence([Lowercase(), Strip()])
+
+        output = tokenizer.normalize("  HELLO  ")
+        assert output == "hello"
+
+
+class TestLowercase:
+    def test_lowercase(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = Lowercase()
+
+        output = tokenizer.normalize("HELLO")
+        assert output == "hello"
+
+
+class TestStrip:
+    def test_left_strip(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = Strip(left=True, right=False)
+
+        output = tokenizer.normalize("  hello  ")
+        assert output == "hello  "
+
+    def test_right_strip(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = Strip(left=False, right=True)
+
+        output = tokenizer.normalize("  hello  ")
+        assert output == "  hello"
+
+    def test_full_strip(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.normalizer = Strip(left=True, right=True)
+
+        output = tokenizer.normalize("  hello  ")
+        assert output == "hello"
--- a/bindings/python/tests/bindings/test_pre_tokenizers.py
+++ b/bindings/python/tests/bindings/test_pre_tokenizers.py
@ -0,0 +1,59 @@
+import pytest
+
+from tokenizers.pre_tokenizers import (
+    PreTokenizer,
+    ByteLevel,
+    Whitespace,
+    WhitespaceSplit,
+    BertPreTokenizer,
+    Metaspace,
+    CharDelimiterSplit,
+)
+
+
+class TestByteLevel:
+    def test_instantiate(self):
+        assert ByteLevel() is not None
+        assert ByteLevel(add_prefix_space=True) is not None
+        assert ByteLevel(add_prefix_space=False) is not None
+        assert isinstance(ByteLevel(), PreTokenizer)
+
+    def test_has_alphabet(self):
+        assert isinstance(ByteLevel.alphabet(), list)
+        assert len(ByteLevel.alphabet()) == 256
+
+
+class TestWhitespace:
+    def test_instantiate(self):
+        assert Whitespace() is not None
+        assert isinstance(Whitespace(), PreTokenizer)
+
+
+class TestWhitespaceSplit:
+    def test_instantiate(self):
+        assert WhitespaceSplit() is not None
+        assert isinstance(WhitespaceSplit(), PreTokenizer)
+
+
+class TestBertPreTokenizer:
+    def test_instantiate(self):
+        assert BertPreTokenizer() is not None
+        assert isinstance(BertPreTokenizer(), PreTokenizer)
+
+
+class TestMetaspace:
+    def test_instantiate(self):
+        assert Metaspace() is not None
+        assert Metaspace(replacement="-") is not None
+        with pytest.raises(Exception, match="replacement must be a character"):
+            Metaspace(replacement="")
+        assert Metaspace(add_prefix_space=True) is not None
+        assert isinstance(Metaspace(), PreTokenizer)
+
+
+class TestCharDelimiterSplit:
+    def test_instantiate(self):
+        assert CharDelimiterSplit("-") is not None
+        with pytest.raises(Exception, match="delimiter must be a single character"):
+            CharDelimiterSplit("")
+        assert isinstance(CharDelimiterSplit(" "), PreTokenizer)
--- a/bindings/python/tests/bindings/test_processors.py
+++ b/bindings/python/tests/bindings/test_processors.py
@ -0,0 +1,62 @@
+from ..utils import data_dir, roberta_files
+
+from tokenizers import Tokenizer
+from tokenizers.models import BPE
+from tokenizers.pre_tokenizers import ByteLevel as ByteLevelPreTokenizer
+from tokenizers.processors import PostProcessor, BertProcessing, RobertaProcessing, ByteLevel
+
+
+class TestBertProcessing:
+    def test_instantiate(self):
+        processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
+        assert processor is not None
+        assert isinstance(processor, PostProcessor)
+
+    def test_processing(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_special_tokens(["[SEP]", "[CLS]"])
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+        tokenizer.post_processor = BertProcessing(("[SEP]", 0), ("[CLS]", 1))
+
+        output = tokenizer.encode("my name", "pair")
+        assert output.tokens == ["[CLS]", "my", "name", "[SEP]", "pair", "[SEP]"]
+        assert output.ids == [1, 2, 3, 0, 6, 0]
+
+
+class TestRobertaProcessing:
+    def test_instantiate(self):
+        processor = RobertaProcessing(("</s>", 1), ("<s>", 0))
+        assert processor is not None
+        assert isinstance(processor, PostProcessor)
+
+    def test_processing(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_special_tokens(["<s>", "</s>"])
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+        tokenizer.post_processor = RobertaProcessing(("</s>", 1), ("<s>", 0))
+
+        output = tokenizer.encode("my name", "pair")
+        assert output.tokens == ["<s>", "my", "name", "</s>", "</s>", "pair", "</s>"]
+        assert output.ids == [0, 2, 3, 1, 1, 6, 1]
+
+
+class TestByteLevelProcessing:
+    def test_instantiate(self):
+        assert ByteLevel() is not None
+        assert ByteLevel(trim_offsets=True) is not None
+        assert isinstance(ByteLevel(), PostProcessor)
+
+    def test_processing(self, roberta_files):
+        tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
+        tokenizer.pre_tokenizer = ByteLevelPreTokenizer(add_prefix_space=True)
+
+        # Keeps original offsets
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(0, 2), (2, 7), (7, 10), (10, 15)]
+
+        # Trims offsets when activated
+        tokenizer.post_processor = ByteLevel(trim_offsets=True)
+        output = tokenizer.encode("My name is John")
+        assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@ -0,0 +1,221 @@
+from ..utils import data_dir, roberta_files
+
+from tokenizers import AddedToken, Tokenizer
+from tokenizers.models import Model, BPE
+from tokenizers.pre_tokenizers import ByteLevel
+from tokenizers.processors import RobertaProcessing
+from tokenizers.normalizers import Lowercase
+
+
+class TestAddedToken:
+    """Checks what `AddedToken` exposes to Python: its str/repr and its attribute getters."""
+
+    def test_instantiate_with_content_only(self):
+        # With only the content given, every flag is expected to be False
+        added_token = AddedToken("<mask>")
+        assert type(added_token) is AddedToken
+        assert str(added_token) == "<mask>"
+        assert (
+            repr(added_token)
+            == 'AddedToken("<mask>", rstrip=False, lstrip=False, single_word=False)'
+        )
+        assert added_token.content == "<mask>"
+        # Identity checks also verify that the getters return real `bool` objects
+        assert added_token.rstrip is False
+        assert added_token.lstrip is False
+        assert added_token.single_word is False
+
+    def test_can_set_rstrip(self):
+        added_token = AddedToken("<mask>", rstrip=True)
+        assert added_token.rstrip is True
+        assert added_token.lstrip is False
+        assert added_token.single_word is False
+
+    def test_can_set_lstrip(self):
+        added_token = AddedToken("<mask>", lstrip=True)
+        assert added_token.rstrip is False
+        assert added_token.lstrip is True
+        assert added_token.single_word is False
+
+    def test_can_set_single_word(self):
+        added_token = AddedToken("<mask>", single_word=True)
+        assert added_token.rstrip is False
+        assert added_token.lstrip is False
+        assert added_token.single_word is True
+
+
+class TestTokenizer:
+    def test_has_expected_type_and_methods(self):
+        tokenizer = Tokenizer(BPE.empty())
+        assert type(tokenizer) == Tokenizer
+        assert callable(tokenizer.num_special_tokens_to_add)
+        assert callable(tokenizer.get_vocab)
+        assert callable(tokenizer.get_vocab_size)
+        assert callable(tokenizer.enable_truncation)
+        assert callable(tokenizer.no_truncation)
+        assert callable(tokenizer.enable_padding)
+        assert callable(tokenizer.no_padding)
+        assert callable(tokenizer.normalize)
+        assert callable(tokenizer.encode)
+        assert callable(tokenizer.encode_batch)
+        assert callable(tokenizer.decode)
+        assert callable(tokenizer.decode_batch)
+        assert callable(tokenizer.token_to_id)
+        assert callable(tokenizer.id_to_token)
+        assert callable(tokenizer.add_tokens)
+        assert callable(tokenizer.add_special_tokens)
+        assert callable(tokenizer.train)
+        assert callable(tokenizer.post_process)
+        assert isinstance(tokenizer.model, Model)
+        assert tokenizer.normalizer is None
+        assert tokenizer.pre_tokenizer is None
+        assert tokenizer.post_processor is None
+        assert tokenizer.decoder is None
+
+    def test_add_tokens(self):
+        tokenizer = Tokenizer(BPE.empty())
+        added = tokenizer.add_tokens(["my", "name", "is", "john"])
+        assert added == 4
+
+        added = tokenizer.add_tokens([AddedToken("the"), AddedToken("quick", rstrip=True)])
+        assert added == 2
+
+    def test_add_special_tokens(self):
+        tokenizer = Tokenizer(BPE.empty())
+
+        # Can add special tokens as `str`
+        added = tokenizer.add_special_tokens(["my", "name", "is", "john"])
+        assert added == 4
+
+        # Can add special tokens as `AddedToken`
+        added = tokenizer.add_special_tokens([AddedToken("the"), AddedToken("quick", rstrip=True)])
+        assert added == 2
+
+    def test_encode(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+
+        # Can encode single sequence
+        output = tokenizer.encode("my name is john")
+        assert output.tokens == ["my", "name", "is", "john"]
+        assert type(output.ids) == list
+        assert type(output.type_ids) == list
+        assert type(output.offsets) == list
+        assert type(output.words) == list
+        assert type(output.special_tokens_mask) == list
+        assert type(output.attention_mask) == list
+        assert type(output.overflowing) == list
+
+        # Can encode a pair of sequences
+        output = tokenizer.encode("my name is john", "pair")
+        assert output.tokens == ["my", "name", "is", "john", "pair"]
+
+        # Can encode a batch with both a single sequence and a pair of sequences
+        output = tokenizer.encode_batch(["my name is john", ("my name is john", "pair")])
+        assert len(output) == 2
+
+    def test_encode_add_special_tokens(self, roberta_files):
+        tokenizer = Tokenizer(BPE.from_files(roberta_files["vocab"], roberta_files["merges"]))
+        tokenizer.add_special_tokens(["<s>", "</s>"])
+
+        tokenizer.pre_tokenizer = ByteLevel(add_prefix_space=True)
+        tokenizer.post_processor = RobertaProcessing(
+            ("</s>", tokenizer.token_to_id("</s>")), ("<s>", tokenizer.token_to_id("<s>")),
+        )
+
+        # Can encode with special tokens
+        output_with_specials = tokenizer.encode("My name is John", add_special_tokens=True)
+        assert output_with_specials.tokens == ["<s>", "ĠMy", "Ġname", "Ġis", "ĠJohn", "</s>"]
+
+        # Can encode without special tokens
+        output_without_specials = tokenizer.encode("My name is John", add_special_tokens=False)
+        assert output_without_specials.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
+
+    def test_truncation(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+        tokenizer.enable_truncation(2)
+
+        # Can truncate single sequences
+        output = tokenizer.encode("my name is john")
+        assert output.tokens == ["my", "name"]
+
+        # Can truncate pair sequences as well
+        output = tokenizer.encode("my name is john", "pair")
+        assert output.tokens == ["my", "pair"]
+
+    def test_padding(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+
+        # By default it does nothing when encoding single sequence
+        tokenizer.enable_padding()
+        output = tokenizer.encode("my name")
+        assert output.tokens == ["my", "name"]
+
+        # Can pad to the longest in a batch
+        output = tokenizer.encode_batch(["my name", "my name is john"])
+        assert all([len(encoding) == 4 for encoding in output])
+
+        # Can pad to the specified max length otherwise
+        tokenizer.enable_padding(max_length=4)
+        output = tokenizer.encode("my name")
+        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
+        output = tokenizer.encode("my name", "pair")
+        assert output.tokens == ["my", "name", "pair", "[PAD]"]
+
+    def test_decode(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+
+        # Can decode single sequences
+        output = tokenizer.decode([0, 1, 2, 3])
+        assert output == "my name is john"
+
+        # Can decode batch
+        output = tokenizer.decode_batch([[0, 1, 2, 3], [4]])
+        assert output == ["my name is john", "pair"]
+
+    def test_get_vocab(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+
+        # Can retrieve vocab with added tokens
+        vocab = tokenizer.get_vocab(with_added_tokens=True)
+        assert vocab == {"is": 2, "john": 3, "my": 0, "name": 1, "pair": 4}
+
+        # Can retrieve vocab without added tokens
+        vocab = tokenizer.get_vocab(with_added_tokens=False)
+        assert vocab == {}
+
+    def test_get_vocab_size(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+
+        # Can retrieve vocab's size with added tokens
+        size = tokenizer.get_vocab_size(with_added_tokens=True)
+        assert size == 5
+
+        # Can retrieve vocab's size without added tokens
+        size = tokenizer.get_vocab_size(with_added_tokens=False)
+        assert size == 0
+
+    def test_normalize(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+        tokenizer.normalizer = Lowercase()
+
+        output = tokenizer.normalize("My Name Is John")
+        assert output == "my name is john"
+
+    def test_post_process(self):
+        tokenizer = Tokenizer(BPE.empty())
+        tokenizer.add_tokens(["my", "name", "is", "john", "pair"])
+        tokenizer.enable_truncation(2)
+        tokenizer.enable_padding(max_length=4)
+
+        encoding = tokenizer.encode("my name is john")
+        pair_encoding = tokenizer.encode("pair")
+
+        # Can post process a single encoding
+        output = tokenizer.post_process(encoding)
+        assert output.tokens == ["my", "name", "[PAD]", "[PAD]"]
+
+        # Can post process a pair of encodings
+        output = tokenizer.post_process(encoding, pair_encoding)
+        assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
--- a/bindings/python/tests/implementations/init.py
+++ b/bindings/python/tests/implementations/init.py
--- a/bindings/python/tests/implementations/test_bert_wordpiece.py
+++ b/bindings/python/tests/implementations/test_bert_wordpiece.py
@ -0,0 +1,21 @@
+from ..utils import data_dir, bert_files
+from tokenizers import BertWordPieceTokenizer
+
+
+class TestBertWordPieceBPE:
+    def test_basic_encode(self, bert_files):
+        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
+
+        # Encode with special tokens by default
+        output = tokenizer.encode("My name is John", "pair")
+        assert output.ids == [101, 2026, 2171, 2003, 2198, 102, 3940, 102]
+        assert output.tokens == ["[CLS]", "my", "name", "is", "john", "[SEP]", "pair", "[SEP]"]
+        assert output.offsets == [(0, 0), (0, 2), (3, 7), (8, 10), (11, 15), (0, 0), (0, 4), (0, 0)]
+        assert output.type_ids == [0, 0, 0, 0, 0, 0, 1, 1]
+
+        # Can encode without the special tokens
+        output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
+        assert output.ids == [2026, 2171, 2003, 2198, 3940]
+        assert output.tokens == ["my", "name", "is", "john", "pair"]
+        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
+        assert output.type_ids == [0, 0, 0, 0, 1]
--- a/bindings/python/tests/implementations/test_byte_level_bpe.py
+++ b/bindings/python/tests/implementations/test_byte_level_bpe.py
@ -0,0 +1,81 @@
+from ..utils import data_dir, roberta_files
+from tokenizers import ByteLevelBPETokenizer
+
+
+class TestByteLevelBPE:
+    def test_basic_encode(self, roberta_files):
+        tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
+        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
+
+        assert output.ids == [133, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
+        assert output.tokens == [
+            "The",
+            "Ġquick",
+            "Ġbrown",
+            "Ġfox",
+            "Ġjumps",
+            "Ġover",
+            "Ġthe",
+            "Ġlazy",
+            "Ġdog",
+        ]
+        assert output.offsets == [
+            (0, 3),
+            (3, 9),
+            (9, 15),
+            (15, 19),
+            (19, 25),
+            (25, 30),
+            (30, 34),
+            (34, 39),
+            (39, 43),
+        ]
+
+    def test_add_prefix_space(self, roberta_files):
+        tokenizer = ByteLevelBPETokenizer(
+            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True
+        )
+        output = tokenizer.encode("The quick brown fox jumps over the lazy dog")
+
+        assert output.ids == [20, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
+        assert output.tokens == [
+            "ĠThe",
+            "Ġquick",
+            "Ġbrown",
+            "Ġfox",
+            "Ġjumps",
+            "Ġover",
+            "Ġthe",
+            "Ġlazy",
+            "Ġdog",
+        ]
+        assert output.offsets == [
+            (0, 3),
+            (3, 9),
+            (9, 15),
+            (15, 19),
+            (19, 25),
+            (25, 30),
+            (30, 34),
+            (34, 39),
+            (39, 43),
+        ]
+
+    def test_lowerspace(self, roberta_files):
+        tokenizer = ByteLevelBPETokenizer(
+            roberta_files["vocab"], roberta_files["merges"], add_prefix_space=True, lowercase=True
+        )
+        output = tokenizer.encode("The Quick Brown Fox Jumps Over The Lazy Dog")
+
+        assert output.ids == [5, 2119, 6219, 23602, 13855, 81, 5, 22414, 2335]
+        assert output.tokens == [
+            "Ġthe",
+            "Ġquick",
+            "Ġbrown",
+            "Ġfox",
+            "Ġjumps",
+            "Ġover",
+            "Ġthe",
+            "Ġlazy",
+            "Ġdog",
+        ]
--- a/bindings/python/tests/implementations/test_char_bpe.py
+++ b/bindings/python/tests/implementations/test_char_bpe.py
@ -0,0 +1,44 @@
+from ..utils import data_dir, openai_files
+from tokenizers import CharBPETokenizer
+
+
+# NOTE(review): the class name looks copy-pasted from test_bert_wordpiece.py; every test
+# here exercises CharBPETokenizer, so a name like TestCharBPE would be more accurate.
+class TestBertWordPieceBPE:
+    def test_basic_encode(self, openai_files):
+        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
+
+        # Without lowercasing, the expected output has "<unk>" at offsets (0, 1) and
+        # (11, 12), i.e. for the capital "M" and "J" of the input.
+        output = tokenizer.encode("My name is John", "pair")
+        assert output.ids == [0, 253, 1362, 544, 0, 7, 12662, 2688]
+        assert output.tokens == [
+            "<unk>",
+            "y</w>",
+            "name</w>",
+            "is</w>",
+            "<unk>",
+            "o",
+            "hn</w>",
+            "pair</w>",
+        ]
+        assert output.offsets == [
+            (0, 1),
+            (1, 2),
+            (3, 7),
+            (8, 10),
+            (11, 12),
+            (12, 13),
+            (13, 15),
+            (0, 4),
+        ]
+        # Only the last token comes from the second sequence ("pair")
+        assert output.type_ids == [0, 0, 0, 0, 0, 0, 0, 1]
+
+    def test_lowercase(self, openai_files):
+        # With lowercase=True the same input is expected to encode without any "<unk>"
+        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
+        output = tokenizer.encode("My name is John", "pair", add_special_tokens=False)
+        assert output.ids == [547, 1362, 544, 2476, 2688]
+        assert output.tokens == ["my</w>", "name</w>", "is</w>", "john</w>", "pair</w>"]
+        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
+        assert output.type_ids == [0, 0, 0, 0, 1]
+
+    def test_decoding(self, openai_files):
+        # Round trip: decoding the ids of an encoded sentence must give the sentence back
+        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
+        decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
+        assert decoded == "my name is john"
--- a/bindings/python/tests/utils.py
+++ b/bindings/python/tests/utils.py
@ -0,0 +1,58 @@
+import os
+import requests
+import pytest
+
+DATA_PATH = os.path.join("tests", "data")
+
+
+def download(url):
+    """Download `url` into DATA_PATH unless it is already there, and return the local path.
+
+    The local file is named after the last path segment of `url`.
+
+    Raises:
+        requests.HTTPError: if the server answers with an error status.
+    """
+    filename = url.rsplit("/", 1)[-1]
+    filepath = os.path.join(DATA_PATH, filename)
+    if not os.path.exists(filepath):
+        # Send the request before touching the filesystem: a failed request must not
+        # leave an empty file behind that later runs would mistake for a cached download.
+        response = requests.get(url, stream=True)
+        response.raise_for_status()
+        # Stream into a temporary sibling file so an interrupted transfer is never
+        # visible under the final name.
+        partial_path = filepath + ".part"
+        with open(partial_path, "wb") as f:
+            for chunk in response.iter_content(1024):
+                f.write(chunk)
+        os.replace(partial_path, filepath)
+    return filepath
+
+
+@pytest.fixture(scope="session")
+def data_dir():
+    """Session fixture creating the DATA_PATH directory when it is missing.
+
+    DATA_PATH is a relative path, hence the check on the current working directory.
+    """
+    assert os.getcwd().endswith("python")
+    if not os.path.isdir(DATA_PATH):
+        os.mkdir(DATA_PATH)
+
+
+@pytest.fixture(scope="session")
+def roberta_files(data_dir):
+    """Session fixture giving the local paths of the roberta-base vocab and merges files."""
+    vocab_path = download(
+        "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json"
+    )
+    merges_path = download(
+        "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt"
+    )
+    return {"vocab": vocab_path, "merges": merges_path}
+
+
+@pytest.fixture(scope="session")
+def bert_files(data_dir):
+    """Session fixture giving the local path of the bert-base-uncased vocab file."""
+    vocab_path = download(
+        "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt"
+    )
+    return {"vocab": vocab_path}
+
+
+@pytest.fixture(scope="session")
+def openai_files(data_dir):
+    """Session fixture giving the local paths of the openai-gpt vocab and merges files."""
+    vocab_path = download(
+        "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json"
+    )
+    merges_path = download(
+        "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
+    )
+    return {"vocab": vocab_path, "merges": merges_path}
--- a/bindings/python/tokenizers/models/init.pyi
+++ b/bindings/python/tokenizers/models/init.pyi
@ -158,3 +158,7 @@ class WordLevel(Model):
                The unknown token to be used by the model.
        """
        pass
+    @staticmethod
+    def empty() -> Model:
+        """ Instantiate an empty WordLevel Model.
+
+        Returns:
+            Model:
+                A new WordLevel Model, built without any vocabulary file.
+        """
+        pass