Fixing Trainer with u8 instead of chars. (#452)

* Fixing Trainer with u8 instead of chars.

Now checks both the optimized and unoptimized encoding schemes for Unigram.

* Small fixes.

* Fixing makefile.
This commit is contained in:
Nicolas Patry
2020-10-09 18:57:14 +02:00
committed by GitHub
parent 35feff0042
commit fbca797b3d
6 changed files with 106 additions and 66 deletions

View File

@@ -7,10 +7,9 @@ from ..utils import data_dir, train_files
class TestUnigram:
    """Training test for ``SentencePieceUnigramTokenizer``."""

    @pytest.mark.slow
    def test_train(self, train_files):
        """Train a Unigram tokenizer on the fixture corpora and save it.

        Args:
            train_files: Fixture mapping corpus names ("big", "small") to
                the training inputs handed to ``tokenizer.train``.
        """
        output_path = "tests/data/unigram_trained.json"
        unigram = SentencePieceUnigramTokenizer()
        # NOTE(review): the tokenizer is trained on "big" and then again on
        # "small"; presumably only one of the two runs is intended — confirm.
        for corpus in ("big", "small"):
            unigram.train(train_files[corpus], show_progress=False)
        unigram.save(output_path)

View File

@@ -61,8 +61,17 @@ def openai_files(data_dir):
@pytest.fixture(scope="session")
def train_files(data_dir):
    """Provide the training corpora used by the trainer tests.

    Downloads ``big.txt`` once, then writes its leading lines to
    ``small.txt`` under ``DATA_PATH`` so that tests can train on a much
    smaller input.

    Args:
        data_dir: Requested but not used directly in the body; presumably
            it guarantees the data directory exists — confirm against the
            ``data_dir`` fixture.

    Returns:
        A dict with the keys ``"small"`` and ``"big"``, each mapping to
        the corresponding corpus file.
    """
    from itertools import islice

    big = download("https://norvig.com/big.txt")
    small = os.path.join(DATA_PATH, "small.txt")
    # Copy the first 102 lines, i.e. the same lines the previous
    # enumerate loop wrote before breaking on ``i > 100``.
    with open(small, "w") as dst, open(big, "r") as src:
        dst.writelines(islice(src, 102))
    # Reuse the already-downloaded path: the earlier dict literal listed
    # "big" twice, which triggered a second download() call whose result
    # was silently overwritten by the later duplicate key.
    return {
        "small": small,
        "big": big,
    }