diff --git a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
index ffb6c2fd..f5011f42 100644
--- a/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
+++ b/bindings/python/py_src/tokenizers/implementations/sentencepiece_unigram.py
@@ -49,13 +49,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         vocab_size: int = 8000,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        unk_token: Optional[str] = None,
     ):
-        """ Train the model using the given files """
+        """
+        Train the model using the given files
+
+        Args:
+            files (:obj:`List[str]`):
+                A list of paths to the files that we should use for training
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+                A list of special tokens the model should know of.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+        """
         trainer = trainers.UnigramTrainer(
             vocab_size=vocab_size,
             special_tokens=special_tokens,
             show_progress=show_progress,
+            unk_token=unk_token,
         )
 
         if isinstance(files, str):
@@ -68,13 +84,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         vocab_size: int = 8000,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        unk_token: Optional[str] = None,
     ):
-        """ Train the model using the given iterator """
+        """
+        Train the model using the given iterator
+
+        Args:
+            iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
+                Any iterator over strings or lists of strings
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+                A list of special tokens the model should know of.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+ """ trainer = trainers.UnigramTrainer( vocab_size=vocab_size, special_tokens=special_tokens, show_progress=show_progress, + unk_token=unk_token, ) self._tokenizer.train_from_iterator(iterator, trainer=trainer) diff --git a/bindings/python/setup.py b/bindings/python/setup.py index 9e20c7ad..4b1b66ae 100644 --- a/bindings/python/setup.py +++ b/bindings/python/setup.py @@ -2,7 +2,7 @@ from setuptools import setup from setuptools_rust import Binding, RustExtension extras = {} -extras["testing"] = ["pytest"] +extras["testing"] = ["pytest", "requests", "numpy", "datasets"] setup( name="tokenizers", diff --git a/bindings/python/tests/implementations/test_sentencepiece.py b/bindings/python/tests/implementations/test_sentencepiece.py index 166b2d14..32495cc4 100644 --- a/bindings/python/tests/implementations/test_sentencepiece.py +++ b/bindings/python/tests/implementations/test_sentencepiece.py @@ -1,5 +1,7 @@ +import os import pytest + from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer @@ -14,6 +16,32 @@ class TestSentencePieceBPE: class TestSentencePieceUnigram: + def test_train(self, tmpdir): + p = tmpdir.mkdir("tmpdir").join("file.txt") + p.write("A first sentence\nAnother sentence\nAnd a last one") + + tokenizer = SentencePieceUnigramTokenizer() + tokenizer.train(files=str(p), show_progress=False) + + output = tokenizer.encode("A sentence") + assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"] + + with pytest.raises(Exception) as excinfo: + _ = tokenizer.encode("A sentence 🤗") + assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing" + + def test_train_with_unk_token(self, tmpdir): + p = tmpdir.mkdir("tmpdir").join("file.txt") + p.write("A first sentence\nAnother sentence\nAnd a last one") + + tokenizer = SentencePieceUnigramTokenizer() + tokenizer.train( + files=str(p), show_progress=False, special_tokens=[""], unk_token="" + ) + output = tokenizer.encode("A sentence 🤗") + assert output.ids[-1] == 0 + assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"] + def test_train_from_iterator(self): text = ["A first sentence", "Another sentence", "And a last one"] tokenizer = SentencePieceUnigramTokenizer() @@ -21,3 +49,17 @@ class TestSentencePieceUnigram: output = tokenizer.encode("A sentence") assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"] + + with pytest.raises(Exception) as excinfo: + _ = tokenizer.encode("A sentence 🤗") + assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing" + + def test_train_from_iterator_with_unk_token(self): + text = ["A first sentence", "Another sentence", "And a last one"] + tokenizer = SentencePieceUnigramTokenizer() + tokenizer.train_from_iterator( + text, vocab_size=100, show_progress=False, special_tokens=[""], unk_token="" + ) + output = tokenizer.encode("A sentence 🤗") + assert output.ids[-1] == 0 + assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]