Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Fixing the vocab size of the trained Unigram model (#952)
* Fixing the vocab size of the trained Unigram model
* add test for the vocab size of the trained Unigram model
* Revert "add test for the vocab size of the trained Unigram model"
  This reverts commit fb8955c831b357d1037548ceaa8789734d544646.
* Fixing the vocab size of the trained Unigram model
* format codes
* get the position of vocab-size calculation out of loop
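The last bullet describes the trainer-side refactor. As a rough illustration only, here is a toy Python sketch of that idea, not the library's actual Rust Unigram trainer; the function, its pruning logic, and the seed-piece format below are invented for illustration. The point is to derive the pruning target from vocab_size and the special tokens once, before the pruning loop, so the trained vocabulary lands exactly on the requested size:

# Toy sketch only; the real trainer lives in the Rust crate and differs in detail.
def train_toy_unigram(seed_pieces, special_tokens, unk_token, vocab_size, shrink_factor=0.75):
    # Compute the pruning target once, outside the loop: leave room for the
    # special tokens, plus the unk token when it is not already one of them.
    reserved = len(special_tokens) + (0 if unk_token in special_tokens else 1)
    target = vocab_size - reserved

    # seed_pieces: list of (token, score) pairs, highest score = most useful.
    pieces = sorted(seed_pieces, key=lambda p: p[1], reverse=True)
    while len(pieces) > target:
        # One "estimate + prune" round: shrink the inventory by shrink_factor,
        # but never below the fixed target computed above.
        keep = max(target, int(len(pieces) * shrink_factor))
        pieces = pieces[:keep]

    # Final vocabulary: special tokens, the unk token, then the learned pieces.
    vocab = list(special_tokens)
    if unk_token not in vocab:
        vocab.append(unk_token)
    return vocab + [tok for tok, _score in pieces]

With this shape, len(train_toy_unigram(pieces, ["[PAD]", "[SEP]", "[CLS]"], "[UNK]", 100)) comes out to exactly 100, which is the behavior the new test below asserts.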
@@ -238,6 +238,28 @@ class TestUnigram:
             "[SEP]",
         ]
 
+        tokenizer = Tokenizer(models.Unigram())
+        trainer = trainers.UnigramTrainer(
+            show_progress=False,
+            special_tokens=["[PAD]", "[SEP]", "[CLS]"],
+            unk_token="[UNK]",
+            vocab_size=100,
+        )
+        tokenizer.train([filename], trainer=trainer)
+
+        assert tokenizer.get_vocab_size() == 100
+
+        tokenizer = Tokenizer(models.Unigram())
+        trainer = trainers.UnigramTrainer(
+            show_progress=False,
+            special_tokens=["[PAD]", "[SEP]", "[CLS]", "[UNK]"],
+            unk_token="[UNK]",
+            vocab_size=100,
+        )
+        tokenizer.train([filename], trainer=trainer)
+
+        assert tokenizer.get_vocab_size() == 100
+
     def test_cannot_train_different_model(self):
         tokenizer = Tokenizer(models.BPE())
         trainer = trainers.UnigramTrainer(show_progress=False)
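For reference, the two cases the new test exercises can also be run as a standalone script. This is a minimal sketch assuming the tokenizers Python bindings and a hypothetical small plain-text training file at corpus.txt; it checks that the trained vocabulary ends up at exactly the requested vocab_size whether or not the unk_token is also listed in special_tokens:

from tokenizers import Tokenizer, models, trainers

filename = "corpus.txt"  # hypothetical small plain-text training file

for special_tokens in (
    ["[PAD]", "[SEP]", "[CLS]"],           # unk_token not among the special tokens
    ["[PAD]", "[SEP]", "[CLS]", "[UNK]"],  # unk_token also listed as a special token
):
    tokenizer = Tokenizer(models.Unigram())
    trainer = trainers.UnigramTrainer(
        show_progress=False,
        special_tokens=special_tokens,
        unk_token="[UNK]",
        vocab_size=100,
    )
    tokenizer.train([filename], trainer=trainer)

    # With the fix, the trained model honors the requested vocab_size exactly.
    assert tokenizer.get_vocab_size() == 100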