Fixing Trainer with u8 instead of chars. (#452)

* Fixing Trainer with u8 instead of chars.

Now checks both the optimized and unoptimized encoding schemes for Unigram.

* Small fixes.

* Fixing makefile.
This commit is contained in:
Nicolas Patry
2020-10-09 18:57:14 +02:00
committed by GitHub
parent 35feff0042
commit fbca797b3d
6 changed files with 106 additions and 66 deletions

View File

@@ -61,8 +61,17 @@ def openai_files(data_dir):
@pytest.fixture(scope="session")
def train_files(data_dir):
    """Provide the training corpora used by the tests.

    Fetches the full corpus through ``download`` and writes a short excerpt
    of it to ``small.txt`` under ``DATA_PATH``.

    Args:
        data_dir: Value of the ``data_dir`` fixture. It is not used in the
            body; presumably it is requested so the data directory is set up
            before this fixture runs -- confirm against that fixture.

    Returns:
        dict: ``{"small": <path of the excerpt file>, "big": <value returned
        by download>}``. The ``"big"`` value is passed to ``open`` below, so
        it is usable as a file path.
    """
    # Fetch the corpus once and reuse the result for both the excerpt and the
    # returned mapping, instead of calling download() a second time in the
    # dict literal (where a repeated "big" key would be overwritten anyway).
    big = download("https://norvig.com/big.txt")
    small = os.path.join(DATA_PATH, "small.txt")
    with open(small, "w") as f, open(big, "r") as g:
        for i, line in enumerate(g):
            f.write(line)
            # Checked after the write, so lines 0..101 (102 lines) are copied.
            if i > 100:
                break
    return {
        "small": small,
        "big": big,
    }