mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 08:45:38 +00:00
Fixing Trainer with u8 instead of chars. (#452)
* Fixing Trainer with u8 instead of chars. Now check both optimized and unoptimized encoding schemes for Unigram. * Small fixes. * Fixing makefile.
This commit is contained in:
@ -61,8 +61,17 @@ def openai_files(data_dir):
|
||||
|
||||
@pytest.fixture(scope="session")
def train_files(data_dir):
    """Session-scoped fixture providing training-file paths for tokenizer tests.

    Downloads Norvig's big.txt once and derives a small (~100 line) sample
    file from its head.  Returns a dict mapping:
        "big"   -> path of the downloaded big.txt
        "small" -> path of the generated small.txt

    NOTE(review): ``data_dir`` is not read in this body — presumably it is a
    fixture dependency kept for its side effect (ensuring the data directory
    exists); confirm before removing it.
    """
    big = download("https://norvig.com/big.txt")
    small = os.path.join(DATA_PATH, "small.txt")
    # Copy the first ~100 lines of the big file into the small one.
    with open(small, "w") as dst, open(big, "r") as src:
        for i, line in enumerate(src):
            dst.write(line)
            if i > 100:
                break
    return {
        # Reuse the path already downloaded above.  The original dict had a
        # duplicate "big" key: the first entry ("big": download(...)) was dead
        # (overridden by the later key) yet still issued a redundant download.
        "big": big,
        "small": small,
    }
|
||||
|
||||
|
||||
|
Reference in New Issue
Block a user