Add a way to specify the unknown token in SentencePieceUnigramTokenizer python implem (#762)
* add a way to specify the unknown token in `SentencePieceUnigramTokenizer`
* add a test that verifies an exception is raised for the missing unknown token
* style
* add test tokens
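A rough usage sketch of the new argument, mirroring the tests added below (the training file path is a placeholder): as in those tests, the unknown token is passed both in `special_tokens`, so it gets an id in the vocabulary, and as `unk_token`, so the Unigram model falls back to it for characters never seen during training.

    from tokenizers import SentencePieceUnigramTokenizer

    tokenizer = SentencePieceUnigramTokenizer()
    # "corpus.txt" is a placeholder path to a plain-text training file.
    tokenizer.train(
        files="corpus.txt",
        special_tokens=["<unk>"],
        unk_token="<unk>",
    )

    # Without unk_token, encoding unseen characters raises
    # "Encountered an unknown token but `unk_id` is missing"; with it, they are
    # encoded with the id of "<unk>" (0, since it is the first special token).
    output = tokenizer.encode("A sentence 🤗")
    print(output.tokens, output.ids)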
@@ -49,13 +49,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         vocab_size: int = 8000,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        unk_token: Optional[str] = None,
     ):
-        """ Train the model using the given files """
+        """
+        Train the model using the given files
+
+        Args:
+            files (:obj:`List[str]`):
+                A list of path to the files that we should use for training
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+                A list of special tokens the model should know of.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+        """
 
         trainer = trainers.UnigramTrainer(
             vocab_size=vocab_size,
             special_tokens=special_tokens,
             show_progress=show_progress,
+            unk_token=unk_token,
         )
 
         if isinstance(files, str):
@@ -68,13 +84,29 @@ class SentencePieceUnigramTokenizer(BaseTokenizer):
         vocab_size: int = 8000,
         show_progress: bool = True,
         special_tokens: List[Union[str, AddedToken]] = [],
+        unk_token: Optional[str] = None,
     ):
-        """ Train the model using the given iterator """
+        """
+        Train the model using the given iterator
+
+        Args:
+            iterator (:obj:`Union[Iterator[str], Iterator[Iterator[str]]]`):
+                Any iterator over strings or list of strings
+            vocab_size (:obj:`int`):
+                The size of the final vocabulary, including all tokens and alphabet.
+            show_progress (:obj:`bool`):
+                Whether to show progress bars while training.
+            special_tokens (:obj:`List[Union[str, AddedToken]]`):
+                A list of special tokens the model should know of.
+            unk_token (:obj:`str`, `optional`):
+                The unknown token to be used by the model.
+        """
 
         trainer = trainers.UnigramTrainer(
             vocab_size=vocab_size,
             special_tokens=special_tokens,
             show_progress=show_progress,
+            unk_token=unk_token,
         )
 
         self._tokenizer.train_from_iterator(iterator, trainer=trainer)
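The same option is available when training from an in-memory iterator; a minimal sketch along the lines of the new `test_train_from_iterator_with_unk_token` test added below (the corpus is a toy list of strings):

    from tokenizers import SentencePieceUnigramTokenizer

    corpus = ["A first sentence", "Another sentence", "And a last one"]
    tokenizer = SentencePieceUnigramTokenizer()
    tokenizer.train_from_iterator(
        corpus,
        vocab_size=100,
        special_tokens=["<unk>"],
        unk_token="<unk>",
    )
    # The emoji was never seen during training, so it is encoded with the id of "<unk>".
    print(tokenizer.encode("A sentence 🤗").ids)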
@@ -2,7 +2,7 @@ from setuptools import setup
 from setuptools_rust import Binding, RustExtension
 
 extras = {}
-extras["testing"] = ["pytest"]
+extras["testing"] = ["pytest", "requests", "numpy", "datasets"]
 
 setup(
     name="tokenizers",
@@ -1,5 +1,7 @@
+import os
+
 import pytest
 
 from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
 
 
@@ -14,6 +16,32 @@ class TestSentencePieceBPE:
 
 
 class TestSentencePieceUnigram:
+    def test_train(self, tmpdir):
+        p = tmpdir.mkdir("tmpdir").join("file.txt")
+        p.write("A first sentence\nAnother sentence\nAnd a last one")
+
+        tokenizer = SentencePieceUnigramTokenizer()
+        tokenizer.train(files=str(p), show_progress=False)
+
+        output = tokenizer.encode("A sentence")
+        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]
+
+        with pytest.raises(Exception) as excinfo:
+            _ = tokenizer.encode("A sentence 🤗")
+        assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing"
+
+    def test_train_with_unk_token(self, tmpdir):
+        p = tmpdir.mkdir("tmpdir").join("file.txt")
+        p.write("A first sentence\nAnother sentence\nAnd a last one")
+
+        tokenizer = SentencePieceUnigramTokenizer()
+        tokenizer.train(
+            files=str(p), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
+        )
+        output = tokenizer.encode("A sentence 🤗")
+        assert output.ids[-1] == 0
+        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]
+
     def test_train_from_iterator(self):
         text = ["A first sentence", "Another sentence", "And a last one"]
         tokenizer = SentencePieceUnigramTokenizer()
@@ -21,3 +49,17 @@ class TestSentencePieceUnigram:
 
         output = tokenizer.encode("A sentence")
         assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]
+
+        with pytest.raises(Exception) as excinfo:
+            _ = tokenizer.encode("A sentence 🤗")
+        assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing"
+
+    def test_train_from_iterator_with_unk_token(self):
+        text = ["A first sentence", "Another sentence", "And a last one"]
+        tokenizer = SentencePieceUnigramTokenizer()
+        tokenizer.train_from_iterator(
+            text, vocab_size=100, show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
+        )
+        output = tokenizer.encode("A sentence 🤗")
+        assert output.ids[-1] == 0
+        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]