import pytest

from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer

class TestSentencePieceBPE:
    def test_train_from_iterator(self):
        # Train a SentencePiece-style BPE tokenizer directly from an
        # in-memory iterator of sentences, without writing files to disk.
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        # Word boundaries are marked with the metaspace character "▁" (U+2581).
        output = tokenizer.encode("A sentence")
        assert output.tokens == ["▁A", "▁sentence"]


class TestSentencePieceUnigram:
    def test_train_from_iterator(self):
        # Train a Unigram model from the same in-memory iterator.
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = SentencePieceUnigramTokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        # With this tiny corpus the Unigram model keeps mostly short,
        # near-character-level pieces.
        output = tokenizer.encode("A sentence")
        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]
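

# Illustrative sketch, not part of the upstream test file: beyond `tokens`,
# an Encoding exposes a parallel list of integer `ids`, and the tokenizer's
# `decode` method maps those ids back to text via the Metaspace decoder that
# the SentencePiece wrappers configure by default. The class and test names
# below are hypothetical additions for demonstration only.
class TestSentencePieceRoundTripSketch:
    def test_encode_decode_roundtrip(self):
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = SentencePieceBPETokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        encoding = tokenizer.encode("A sentence")
        # `ids` and `tokens` are parallel views of the same encoding.
        assert len(encoding.ids) == len(encoding.tokens)
        # Decoding the ids strips the "▁" markers and restores the text.
        assert tokenizer.decode(encoding.ids) == "A sentence"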