Add a way to specify the unknown token in the SentencePieceUnigramTokenizer Python implementation (#762)

* add a way to specify the unknown token in `SentencePieceUnigramTokenizer`

* add a test that verifies an exception is raised when the unknown token is missing

* style

* add test tokens
Author: SaulLu
Date: 2021-08-12 15:42:44 +02:00
Committed by: GitHub
Parent: 46bed542fa
Commit: da4c7b10e4

3 changed files with 77 additions and 3 deletions


@@ -49,13 +49,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         vocab_size: int = 8000,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        unk_token: Optional[str] = None,
     ):
-        """ Train the model using the given files """
+        """
+        Train the model using the given files
+
+        Args:
+            files (:obj:`List[str]`):
+                A list of path to the files that we should use for training
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+                A list of special tokens the model should know of.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+        """
         trainer = trainers.UnigramTrainer(
             vocab_size=vocab_size,
             special_tokens=special_tokens,
             show_progress=show_progress,
+            unk_token=unk_token,
         )

         if isinstance(files, str):
@@ -68,13 +84,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         vocab_size: int = 8000,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        unk_token: Optional[str] = None,
     ):
-        """ Train the model using the given iterator """
+        """
+        Train the model using the given iterator
+
+        Args:
+            iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
+                Any iterator over strings or list of strings
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+                A list of special tokens the model should know of.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+        """
         trainer = trainers.UnigramTrainer(
             vocab_size=vocab_size,
             special_tokens=special_tokens,
             show_progress=show_progress,
+            unk_token=unk_token,
         )

         self._tokenizer.train_from_iterator(iterator, trainer=trainer)
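
Taken together, the change is a pass-through: `unk_token` travels from both training entry points into `trainers.UnigramTrainer`. A minimal usage sketch, mirroring the tests added below (the corpus path is illustrative):

    from tokenizers import SentencePieceUnigramTokenizer

    tokenizer = SentencePieceUnigramTokenizer()
    # "<unk>" must appear in special_tokens so it receives an id (0 here,
    # since it is the first special token), and as unk_token so the Unigram
    # model falls back to it for out-of-vocabulary input.
    tokenizer.train(
        files="corpus.txt",  # illustrative path
        show_progress=False,
        special_tokens=["<unk>"],
        unk_token="<unk>",
    )

    output = tokenizer.encode("A sentence 🤗")  # 🤗 was never seen in training
    assert output.ids[-1] == 0  # mapped to the unk id instead of raising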


@@ -2,7 +2,7 @@ from setuptools import setup
 from setuptools_rust import Binding, RustExtension

 extras = {}
-extras["testing"] = ["pytest"]
+extras["testing"] = ["pytest", "requests", "numpy", "datasets"]

 setup(
     name="tokenizers",

@@ -1,5 +1,7 @@
+import os
+
 import pytest

 from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
@@ -14,6 +16,32 @@ class TestSentencePieceBPE:
 class TestSentencePieceUnigram:
+    def test_train(self, tmpdir):
+        p = tmpdir.mkdir("tmpdir").join("file.txt")
+        p.write("A first sentence\nAnother sentence\nAnd a last one")
+
+        tokenizer = SentencePieceUnigramTokenizer()
+        tokenizer.train(files=str(p), show_progress=False)
+
+        output = tokenizer.encode("A sentence")
+        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]
+
+        with pytest.raises(Exception) as excinfo:
+            _ = tokenizer.encode("A sentence 🤗")
+        assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing"
+
+    def test_train_with_unk_token(self, tmpdir):
+        p = tmpdir.mkdir("tmpdir").join("file.txt")
+        p.write("A first sentence\nAnother sentence\nAnd a last one")
+
+        tokenizer = SentencePieceUnigramTokenizer()
+        tokenizer.train(
+            files=str(p), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
+        )
+
+        output = tokenizer.encode("A sentence 🤗")
+        assert output.ids[-1] == 0
+        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]
+
     def test_train_from_iterator(self):
         text = ["A first sentence", "Another sentence", "And a last one"]
         tokenizer = SentencePieceUnigramTokenizer()
@@ -21,3 +49,17 @@ class TestSentencePieceUnigram:
         output = tokenizer.encode("A sentence")
         assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]
+
+        with pytest.raises(Exception) as excinfo:
+            _ = tokenizer.encode("A sentence 🤗")
+        assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing"
+
+    def test_train_from_iterator_with_unk_token(self):
+        text = ["A first sentence", "Another sentence", "And a last one"]
+        tokenizer = SentencePieceUnigramTokenizer()
+        tokenizer.train_from_iterator(
+            text, vocab_size=100, show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
+        )
+
+        output = tokenizer.encode("A sentence 🤗")
+        assert output.ids[-1] == 0
+        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]
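
The four tests pin down the contract from both sides: without `unk_token` the model has no `unk_id`, so encoding out-of-vocabulary text raises; with it, unknown input degrades gracefully. A sketch of the failure path, using the same toy corpus as the tests:

    from tokenizers import SentencePieceUnigramTokenizer

    text = ["A first sentence", "Another sentence", "And a last one"]
    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train_from_iterator(text, vocab_size=100, show_progress=False)

    try:
        tokenizer.encode("A sentence 🤗")  # out-of-vocabulary character
    except Exception as err:
        # No unk_token was configured, so the model cannot represent 🤗
        print(err)  # Encountered an unknown token but `unk_id` is missing

Passing special_tokens=["<unk>"] and unk_token="<unk>" to the same training call is what turns this exception into the graceful encoding asserted above.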