mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 16:49:27 +00:00
Add a way to specify the unknown token in SentencePieceUnigramTokenizer
python implementation (#762)
* add a way to specify the unknown token in `SentencePieceUnigramTokenizer` * add tests that verify that an exception is raised for the missing unknown token * style * add test tokens
This commit is contained in:
@ -1,5 +1,7 @@
|
||||
import os
|
||||
import pytest
|
||||
|
||||
|
||||
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
|
||||
|
||||
|
||||
@ -14,6 +16,32 @@ class TestSentencePieceBPE:
|
||||
|
||||
|
||||
class TestSentencePieceUnigram:
|
||||
def test_train(self, tmpdir):
    """Train a Unigram tokenizer from a file and check that encoding an
    out-of-vocabulary character raises when no `unk_id` is configured."""
    corpus_file = tmpdir.mkdir("tmpdir").join("file.txt")
    corpus_file.write("A first sentence\nAnother sentence\nAnd a last one")

    unigram = SentencePieceUnigramTokenizer()
    unigram.train(files=str(corpus_file), show_progress=False)

    encoding = unigram.encode("A sentence")
    assert encoding.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]

    # The emoji was never seen during training and no unk token was set,
    # so encoding must fail with the dedicated error message.
    with pytest.raises(Exception) as err:
        _ = unigram.encode("A sentence 🤗")
    assert str(err.value) == "Encountered an unknown token but `unk_id` is missing"
def test_train_with_unk_token(self, tmpdir):
    """Train from a file with an explicit `<unk>` token and check that an
    out-of-vocabulary character maps to it instead of raising."""
    corpus_file = tmpdir.mkdir("tmpdir").join("file.txt")
    corpus_file.write("A first sentence\nAnother sentence\nAnd a last one")

    unigram = SentencePieceUnigramTokenizer()
    unigram.train(
        files=str(corpus_file), show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
    )

    encoding = unigram.encode("A sentence 🤗")
    # The unknown token was registered first, so its id is 0 and the
    # unseen emoji falls back to it.
    assert encoding.ids[-1] == 0
    assert encoding.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]
def test_train_from_iterator(self):
|
||||
text = ["A first sentence", "Another sentence", "And a last one"]
|
||||
tokenizer = SentencePieceUnigramTokenizer()
|
||||
@ -21,3 +49,17 @@ class TestSentencePieceUnigram:
|
||||
|
||||
output = tokenizer.encode("A sentence")
|
||||
assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]
|
||||
|
||||
with pytest.raises(Exception) as excinfo:
|
||||
_ = tokenizer.encode("A sentence 🤗")
|
||||
assert str(excinfo.value) == "Encountered an unknown token but `unk_id` is missing"
|
||||
|
||||
def test_train_from_iterator_with_unk_token(self):
    """Train from an in-memory iterator with an explicit `<unk>` token and
    check that an out-of-vocabulary character maps to it."""
    sentences = ["A first sentence", "Another sentence", "And a last one"]

    unigram = SentencePieceUnigramTokenizer()
    unigram.train_from_iterator(
        sentences, vocab_size=100, show_progress=False, special_tokens=["<unk>"], unk_token="<unk>"
    )

    encoding = unigram.encode("A sentence 🤗")
    # The unknown token was registered first, so its id is 0 and the
    # unseen emoji falls back to it.
    assert encoding.ids[-1] == 0
    assert encoding.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e", "▁", "🤗"]
||||
|
Reference in New Issue
Block a user