Fix data path in test_continuing_prefix_trainer_mismatch (#1747)

This commit is contained in:
Gaétan Lepage
2025-05-27 08:48:27 +02:00
committed by GitHub
parent fd1b361b76
commit 23e7e42adf

View File

@@ -14,7 +14,7 @@ from tokenizers import (
     trainers,
 )

-from ..utils import data_dir, train_files
+from ..utils import data_dir, train_files, DATA_PATH


 class TestBpeTrainer:
@@ -287,7 +287,7 @@ class TestUnigram:
         trainer.initial_alphabet = ["d", "z"]
         assert sorted(trainer.initial_alphabet) == ["d", "z"]

-    def test_continuing_prefix_trainer_mismatch(self):
+    def test_continuing_prefix_trainer_mismatch(self, train_files):
         UNK = "[UNK]"
         special_tokens = [UNK]
         tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
@@ -295,8 +295,9 @@ class TestUnigram:
         tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
             [pre_tokenizers.Whitespace(), pre_tokenizers.Digits(individual_digits=True)]
         )
-        tokenizer.train(files=["data/big.txt"], trainer=trainer)
-        tokenizer.save("data/tokenizer.json")
-        tokenizer.from_file("data/tokenizer.json")
+        tokenizer.train(files=[train_files["big"]], trainer=trainer)
+        tokenizer_json = os.path.join(DATA_PATH, "tokenizer.json")
+        tokenizer.save(tokenizer_json)
+        tokenizer.from_file(tokenizer_json)