Fixing the vocab size of the trained Unigram model (#952)

* Fixing the vocab size of the trained Unigram model

* add test for the vocab size of the trained Unigram model

* Revert "add test for the vocab size of the trained Unigram model"

This reverts commit fb8955c831b357d1037548ceaa8789734d544646.

* Fixing the vocab size of the trained Unigram model

* Format code

* Move the vocab-size calculation out of the loop (see the sketch below)
Author: Kaito Sugimoto
Date: 2022-03-19 02:13:17 +09:00
Committed by: GitHub
Parent: daa4dd2288
Commit: 1bb9884f45
2 changed files with 42 additions and 12 deletions
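The gist of the change: the Unigram trainer's final vocabulary includes the special tokens, so the pruning target has to account for them, and that target should be computed once, before the pruning loop, rather than recomputed inside it. What follows is a minimal Python sketch of that idea only, not the actual Rust implementation in the tokenizers crate; names such as Piece and prune_to_target are hypothetical.

from dataclasses import dataclass


@dataclass
class Piece:
    token: str
    score: float


def prune_to_target(pieces, special_tokens, vocab_size, shrink_factor=0.75):
    # The special tokens occupy part of the final vocabulary, so only
    # vocab_size - len(special_tokens) learned pieces may survive.
    # Computing this target once, before the loop, keeps it stable
    # across pruning rounds.
    target = vocab_size - len(special_tokens)

    while len(pieces) > target:
        # Shrink gradually, but never drop below the target.
        keep = max(target, int(len(pieces) * shrink_factor))
        pieces = sorted(pieces, key=lambda p: p.score, reverse=True)[:keep]

    return special_tokens + [p.token for p in pieces]


# With 3 special tokens and vocab_size=100, exactly 100 entries come out,
# matching what the new test asserts via tokenizer.get_vocab_size().
vocab = prune_to_target(
    [Piece(f"tok{i}", -float(i)) for i in range(500)],
    ["[PAD]", "[SEP]", "[CLS]"],
    vocab_size=100,
)
assert len(vocab) == 100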

@@ -238,6 +238,28 @@ class TestUnigram:
            "[SEP]",
        ]

        tokenizer = Tokenizer(models.Unigram())
        trainer = trainers.UnigramTrainer(
            show_progress=False,
            special_tokens=["[PAD]", "[SEP]", "[CLS]"],
            unk_token="[UNK]",
            vocab_size=100,
        )
        tokenizer.train([filename], trainer=trainer)
        assert tokenizer.get_vocab_size() == 100

        tokenizer = Tokenizer(models.Unigram())
        trainer = trainers.UnigramTrainer(
            show_progress=False,
            special_tokens=["[PAD]", "[SEP]", "[CLS]", "[UNK]"],
            unk_token="[UNK]",
            vocab_size=100,
        )
        tokenizer.train([filename], trainer=trainer)
        assert tokenizer.get_vocab_size() == 100

    def test_cannot_train_different_model(self):
        tokenizer = Tokenizer(models.BPE())
        trainer = trainers.UnigramTrainer(show_progress=False)