mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 16:49:27 +00:00
Add a way to specify the unknown token in SentencePieceUnigramTokenizer
python implementation (#762)
* add a way to specify the unknown token in `SentencePieceUnigramTokenizer` * add tests that verify that an exception is raised for the missing unknown token * style * add test tokens
This commit is contained in:
@ -1,5 +1,7 @@
|
||||
import os
|
||||
import pytest
|
||||
|
||||
|
||||
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
|
||||
|
||||
|
||||
@ -14,6 +16,32 @@ class TestSentencePieceBPE:
|
||||
|
||||
|
||||
class TestSentencePieceUnigram:
|
||||
def test_train(self, tmpdir):
    """Train a Unigram tokenizer from a file and check that encoding an
    out-of-vocabulary character raises when no `unk_id` is configured."""
    corpus_file = tmpdir.mkdir("tmpdir").join("file.txt")
    corpus_file.write("A first sentence\nAnother sentence\nAnd a last one")

    unigram = SentencePieceUnigramTokenizer()
    unigram.train(files=str(corpus_file), show_progress=False)

    encoding = unigram.encode("A sentence")
    assert encoding.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]

    # The emoji was never seen during training and no unk token was set,
    # so encoding must fail with the dedicated error message.
    with pytest.raises(Exception) as err:
        _ = unigram.encode("A sentence 🤗")
    assert str(err.value) == "Encountered an unknown token but `unk_id` is missing"
def test_train_with_unk_token(self, tmpdir):
    """Train from a file with an explicit `<unk>` token and check that an
    out-of-vocabulary character maps to it instead of raising."""
    corpus_file = tmpdir.mkdir("tmpdir").join("file.txt")
    corpus_file.write("A first sentence\nAnother sentence\nAnd a last one")

    unigram = SentencePieceUnigramTokenizer()
    unigram.train(
        files=str(corpus_file), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
    )

    encoding = unigram.encode("A sentence 🤗")
    # The unknown token was registered first, so its id is 0 and the
    # unseen emoji falls back to it.
    assert encoding.ids[-1] == 0
    assert encoding.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]
def test_train_from_iterator(self):
|
||||
text = ["A first sentence", "Another sentence", "And a last one"]
|
||||
tokenizer = SentencePieceUnigramTokenizer()
|
||||
@ -21,3 +49,17 @@ class TestSentencePieceUnigram:
|
||||
|
||||
output = tokenizer.encode("A sentence")
|
||||
assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]
|
||||
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
_ = tokenizer.encode("A sentence 🤗")
|
||||
assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing"
|
||||
|
||||
def test_train_from_iterator_with_unk_token(self):
    """Train from an in-memory iterator with an explicit `<unk>` token and
    check that an out-of-vocabulary character maps to it."""
    sentences = ["A first sentence", "Another sentence", "And a last one"]

    unigram = SentencePieceUnigramTokenizer()
    unigram.train_from_iterator(
        sentences, vocab_size=100, show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
    )

    encoding = unigram.encode("A sentence 🤗")
    # The unknown token was registered first, so its id is 0 and the
    # unseen emoji falls back to it.
    assert encoding.ids[-1] == 0
    assert encoding.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]
||||
|
Reference in New Issue
Block a user