Python - Add train_from_iterator to implementations
@@ -4,7 +4,7 @@ from ..utils import data_dir, bert_files, multiprocessing_with_parallelism
 from tokenizers import BertWordPieceTokenizer
 
 
-class TestBertWordPieceBPE:
+class TestBertWordPieceTokenizer:
     def test_basic_encode(self, bert_files):
         tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
 
@@ -44,3 +44,11 @@ class TestBertWordPieceBPE:
         tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
         multiprocessing_with_parallelism(tokenizer, False)
         multiprocessing_with_parallelism(tokenizer, True)
+
+    def test_train_from_iterator(self):
+        text = ["A first sentence", "Another sentence", "And a last one"]
+        tokenizer = BertWordPieceTokenizer()
+        tokenizer.train_from_iterator(text, show_progress=False)
+
+        output = tokenizer.encode("A sentence")
+        assert output.tokens == ["a", "sentence"]
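Note: train_from_iterator accepts any Python iterator, not just an in-memory list. The minimal sketch below follows the call pattern from the test above but feeds the trainer a generator that streams lines from a file; the path corpus.txt and the line-filtering helper are illustrative assumptions, not part of this commit.

from tokenizers import BertWordPieceTokenizer

def line_iterator(path):
    # Lazily yield non-empty lines so the whole corpus never has to fit in memory.
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                yield line

tokenizer = BertWordPieceTokenizer()
# Same call as in the test above, driven by a generator instead of a list ("corpus.txt" is hypothetical).
tokenizer.train_from_iterator(line_iterator("corpus.txt"), show_progress=False)
print(tokenizer.encode("A sentence").tokens)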
@@ -89,3 +89,11 @@ class TestByteLevelBPE:
         tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
         multiprocessing_with_parallelism(tokenizer, False)
         multiprocessing_with_parallelism(tokenizer, True)
+
+    def test_train_from_iterator(self):
+        text = ["A first sentence", "Another sentence", "And a last one"]
+        tokenizer = ByteLevelBPETokenizer()
+        tokenizer.train_from_iterator(text, show_progress=False)
+
+        output = tokenizer.encode("A sentence")
+        assert output.tokens == ["A", "Ġsentence"]
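The "Ġ" in the expected tokens is the byte-level marker for a preceding space. A short round trip, assuming the default byte-level decoder that ByteLevelBPETokenizer is configured with, shows the original text coming back on decode:

from tokenizers import ByteLevelBPETokenizer

text = ["A first sentence", "Another sentence", "And a last one"]
tokenizer = ByteLevelBPETokenizer()
tokenizer.train_from_iterator(text, show_progress=False)

enc = tokenizer.encode("A sentence")
print(enc.tokens)                 # ["A", "Ġsentence"]: "Ġ" marks the leading space
# Decoding relies on the tokenizer's default byte-level decoder (an assumption, not shown in this diff).
print(tokenizer.decode(enc.ids))  # "A sentence"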
@@ -4,7 +4,7 @@ from ..utils import data_dir, openai_files, multiprocessing_with_parallelism
 from tokenizers import CharBPETokenizer
 
 
-class TestBertWordPieceBPE:
+class TestCharBPETokenizer:
     def test_basic_encode(self, openai_files):
         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
 
@@ -53,3 +53,11 @@ class TestBertWordPieceBPE:
         tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
         multiprocessing_with_parallelism(tokenizer, False)
         multiprocessing_with_parallelism(tokenizer, True)
+
+    def test_train_from_iterator(self):
+        text = ["A first sentence", "Another sentence", "And a last one"]
+        tokenizer = CharBPETokenizer()
+        tokenizer.train_from_iterator(text, show_progress=False)
+
+        output = tokenizer.encode("A sentence")
+        assert output.tokens == ["A</w>", "sentence</w>"]
bindings/python/tests/implementations/test_sentencepiece.py (new file, 23 lines)
@@ -0,0 +1,23 @@
+import pytest
+
+from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
+
+
+class TestSentencePieceBPE:
+    def test_train_from_iterator(self):
+        text = ["A first sentence", "Another sentence", "And a last one"]
+        tokenizer = SentencePieceBPETokenizer()
+        tokenizer.train_from_iterator(text, show_progress=False)
+
+        output = tokenizer.encode("A sentence")
+        assert output.tokens == ["▁A", "▁sentence"]
+
+
+class TestSentencePieceUnigram:
+    def test_train_from_iterator(self):
+        text = ["A first sentence", "Another sentence", "And a last one"]
+        tokenizer = SentencePieceUnigramTokenizer()
+        tokenizer.train_from_iterator(text, show_progress=False)
+
+        output = tokenizer.encode("A sentence")
+        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]
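Once training from an iterator works, the natural next step is persisting the result. The sketch below saves the trained SentencePiece BPE tokenizer to a single JSON file and reloads it through the generic Tokenizer class; the filename is arbitrary, and save/Tokenizer.from_file are assumed to be available in the library rather than exercised by this commit.

from tokenizers import SentencePieceBPETokenizer, Tokenizer

text = ["A first sentence", "Another sentence", "And a last one"]
tokenizer = SentencePieceBPETokenizer()
tokenizer.train_from_iterator(text, show_progress=False)

# Assumption: save() and Tokenizer.from_file() exist as elsewhere in the library; they are not part of this diff.
tokenizer.save("sp_bpe.json")
reloaded = Tokenizer.from_file("sp_bpe.json")
assert reloaded.encode("A sentence").tokens == ["▁A", "▁sentence"]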