mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-04 03:38:23 +00:00
Fixing Trainer with u8 instead of chars. (#452)

* Fixing Trainer with u8 instead of chars. Now checks both optimized and unoptimized encoding schemes for Unigram.
* Small fixes.
* Fixing makefile.
This commit is contained in:
@@ -7,10 +7,9 @@ from ..utils import data_dir, train_files

 class TestUnigram:
     @pytest.mark.slow
     def test_train(self, train_files):
         tokenizer = SentencePieceUnigramTokenizer()
-        tokenizer.train(train_files["big"], show_progress=False)
+        tokenizer.train(train_files["small"], show_progress=False)

         filename = "tests/data/unigram_trained.json"
         tokenizer.save(filename)
@@ -61,8 +61,17 @@ def openai_files(data_dir):

 @pytest.fixture(scope="session")
 def train_files(data_dir):
+    big = download("https://norvig.com/big.txt")
+    small = os.path.join(DATA_PATH, "small.txt")
+    with open(small, "w") as f:
+        with open(big, "r") as g:
+            for i, line in enumerate(g):
+                f.write(line)
+                if i > 100:
+                    break
     return {
-        "big": download("https://norvig.com/big.txt"),
+        "small": small,
+        "big": big,
     }
Reference in New Issue
Block a user