tokenizers/bindings/python/tests/implementations/test_sentencepiece.py

import pytest
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
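# Each test below trains a SentencePiece-style tokenizer from a plain Python
# iterator of strings, then checks the tokens produced for a short input.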


class TestSentencePieceBPE:
    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        # "▁" marks a word boundary, following the SentencePiece convention.
        output = tokenizer.encode("A sentence")
        assert output.tokens == ["▁A", "▁sentence"]


class TestSentencePieceUnigram:
    def test_train_from_iterator(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = SentencePieceUnigramTokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        # On a corpus this small the Unigram model keeps most pieces as
        # single characters or short fragments rather than whole words.
        output = tokenizer.encode("A sentence")
        assert output.tokens == ["▁A", "", "s", "en", "t", "en", "c", "e"]