Python - Add train_from_iterator to implementations

This commit is contained in:
Anthony MOI
2021-01-06 17:07:56 -05:00
committed by Anthony MOI
parent 817c5ad317
commit d94fa220b6
9 changed files with 166 additions and 7 deletions

View File

@@ -4,7 +4,7 @@ from ..utils import data_dir, bert_files, multiprocessing_with_parallelism
from tokenizers import BertWordPieceTokenizer
class TestBertWordPieceBPE:
class TestBertWordPieceTokenizer:
def test_basic_encode(self, bert_files):
tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
@@ -44,3 +44,11 @@ class TestBertWordPieceBPE:
tokenizer = BertWordPieceTokenizer.from_file(bert_files["vocab"])
multiprocessing_with_parallelism(tokenizer, False)
multiprocessing_with_parallelism(tokenizer, True)
def test_train_from_iterator(self):
    """Train a fresh BertWordPieceTokenizer from an in-memory list.

    Three short sentences are handed to ``train_from_iterator`` with
    ``show_progress=False``; "A sentence" is then encoded and its tokens
    are compared with the expected list.
    """
    corpus = ["A first sentence", "Another sentence", "And a last one"]
    wordpiece = BertWordPieceTokenizer()
    wordpiece.train_from_iterator(corpus, show_progress=False)

    expected_tokens = ["a", "sentence"]
    assert wordpiece.encode("A sentence").tokens == expected_tokens

View File

@@ -89,3 +89,11 @@ class TestByteLevelBPE:
tokenizer = ByteLevelBPETokenizer.from_file(roberta_files["vocab"], roberta_files["merges"])
multiprocessing_with_parallelism(tokenizer, False)
multiprocessing_with_parallelism(tokenizer, True)
def test_train_from_iterator(self):
    """Train a fresh ByteLevelBPETokenizer from an in-memory list.

    Three short sentences are handed to ``train_from_iterator`` with
    ``show_progress=False``; "A sentence" is then encoded and its tokens
    are compared with the expected list.
    """
    corpus = ["A first sentence", "Another sentence", "And a last one"]
    byte_level_bpe = ByteLevelBPETokenizer()
    byte_level_bpe.train_from_iterator(corpus, show_progress=False)

    # The second expected token carries a leading "Ġ" character.
    expected_tokens = ["A", "Ġsentence"]
    assert byte_level_bpe.encode("A sentence").tokens == expected_tokens

View File

@@ -4,7 +4,7 @@ from ..utils import data_dir, openai_files, multiprocessing_with_parallelism
from tokenizers import CharBPETokenizer
class TestBertWordPieceBPE:
class TestCharBPETokenizer:
def test_basic_encode(self, openai_files):
tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
@@ -53,3 +53,11 @@ class TestBertWordPieceBPE:
tokenizer = CharBPETokenizer.from_file(openai_files["vocab"], openai_files["merges"])
multiprocessing_with_parallelism(tokenizer, False)
multiprocessing_with_parallelism(tokenizer, True)
def test_train_from_iterator(self):
    """Train a fresh CharBPETokenizer from an in-memory list.

    Three short sentences are handed to ``train_from_iterator`` with
    ``show_progress=False``; "A sentence" is then encoded and its tokens
    are compared with the expected list.
    """
    corpus = ["A first sentence", "Another sentence", "And a last one"]
    char_bpe = CharBPETokenizer()
    char_bpe.train_from_iterator(corpus, show_progress=False)

    # Each expected token ends with the "</w>" suffix.
    expected_tokens = ["A</w>", "sentence</w>"]
    assert char_bpe.encode("A sentence").tokens == expected_tokens

View File

@@ -0,0 +1,23 @@
import pytest
from tokenizers import SentencePieceBPETokenizer, SentencePieceUnigramTokenizer
class TestSentencePieceBPE:
    """Tests for SentencePieceBPETokenizer."""

    def test_train_from_iterator(self):
        """Train from an in-memory list and check the tokens of "A sentence".

        Three short sentences are handed to ``train_from_iterator`` with
        ``show_progress=False`` before encoding.
        """
        corpus = ["A first sentence", "Another sentence", "And a last one"]
        sp_bpe = SentencePieceBPETokenizer()
        sp_bpe.train_from_iterator(corpus, show_progress=False)

        # Both expected tokens start with the "▁" character.
        expected_tokens = ["▁A", "▁sentence"]
        assert sp_bpe.encode("A sentence").tokens == expected_tokens
class TestSentencePieceUnigram:
    """Tests for SentencePieceUnigramTokenizer."""

    def test_train_from_iterator(self):
        """Train from an in-memory list and check the tokens of "A sentence".

        Three short sentences are handed to ``train_from_iterator`` with
        ``show_progress=False`` before encoding.
        """
        text = ["A first sentence", "Another sentence", "And a last one"]
        tokenizer = SentencePieceUnigramTokenizer()
        tokenizer.train_from_iterator(text, show_progress=False)

        output = tokenizer.encode("A sentence")
        # NOTE(review): the second expected token was previously an empty
        # string. With it, the pieces join to "▁Asentence" and lose the "▁"
        # in front of "sentence", so it is restored here as a standalone "▁"
        # piece. Confirm against an actual run of the tokenizer.
        assert output.tokens == ["▁A", "▁", "s", "en", "t", "en", "c", "e"]