Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
Fix data path in test_continuing_prefix_trainer_mismatch (#1747)
@@ -14,7 +14,7 @@ from tokenizers import (
     trainers,
 )
 
-from ..utils import data_dir, train_files
+from ..utils import data_dir, train_files, DATA_PATH
 
 
 class TestBpeTrainer:
@@ -287,7 +287,7 @@ class TestUnigram:
         trainer.initial_alphabet = ["d", "z"]
         assert sorted(trainer.initial_alphabet) == ["d", "z"]
 
-    def test_continuing_prefix_trainer_mismatch(self):
+    def test_continuing_prefix_trainer_mismatch(self, train_files):
         UNK = "[UNK]"
         special_tokens = [UNK]
         tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
@@ -295,8 +295,9 @@ class TestUnigram:
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [pre_tokenizers.Whitespace(), pre_tokenizers.Digits(individual_digits=True)]
         )
-        tokenizer.train(files=["data/big.txt"], trainer=trainer)
+        tokenizer.train(files=[train_files["big"]], trainer=trainer)
 
-        tokenizer.save("data/tokenizer.json")
+        tokenizer_json = os.path.join(DATA_PATH, "tokenizer.json")
+        tokenizer.save(tokenizer_json)
 
-        tokenizer.from_file("data/tokenizer.json")
+        tokenizer.from_file(tokenizer_json)
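The change replaces hardcoded "data/..." paths with the `train_files` pytest fixture and the `DATA_PATH` constant from the test suite's `utils` module, so the test resolves its corpus and output files relative to the test data directory instead of the current working directory. Below is a minimal sketch of what such helpers could look like; it assumes a pytest session-scoped setup and a norvig.com download URL for big.txt, and the actual `utils` module in the repo may differ:

import os

import pytest
import requests

# Directory where test fixtures are stored (assumed layout).
DATA_PATH = os.path.join("tests", "data")


@pytest.fixture(scope="session")
def data_dir():
    # Make sure the data directory exists before any download.
    os.makedirs(DATA_PATH, exist_ok=True)


@pytest.fixture(scope="session")
def train_files(data_dir):
    # Download the corpus once per session and hand tests a path keyed by
    # name, so a test writes train_files["big"] instead of "data/big.txt".
    url = "https://norvig.com/big.txt"  # assumed source of the corpus
    filepath = os.path.join(DATA_PATH, "big.txt")
    if not os.path.exists(filepath):
        response = requests.get(url, stream=True)
        response.raise_for_status()
        with open(filepath, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
    return {"big": filepath}

With helpers like these, the test receives its input through fixture injection, and the saved tokenizer.json lands under DATA_PATH next to the training corpus, which is what makes the test independent of where pytest is invoked from.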