Fixing the vocab size of the trained Unigram model (#952)

* Fixing the vocab size of the trained Unigram model

* add test for the vocab size of the trained Unigram model

* Revert "add test for the vocab size of the trained Unigram model"

This reverts commit fb8955c831b357d1037548ceaa8789734d544646.

* Fixing the vocab size of the trained Unigram model

* Format code

* Move the vocab-size calculation out of the loop (see the sketch below)
Author: Kaito Sugimoto
Date: 2022-03-19 02:13:17 +09:00
Committed by: GitHub
Parent: daa4dd2288
Commit: 1bb9884f45
2 changed files with 42 additions and 12 deletions
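The gist of the change: the Unigram trainer's final vocabulary includes the special tokens, so the pruning target has to account for them, and that target should be computed once, before the pruning loop, rather than recomputed inside it. What follows is a minimal Python sketch of that idea only, not the actual Rust implementation in the tokenizers crate; names such as Piece and prune_to_target are hypothetical.

from dataclasses import dataclass


@dataclass
class Piece:
    token: str
    score: float


def prune_to_target(pieces, special_tokens, vocab_size, shrink_factor=0.75):
    # The special tokens occupy part of the final vocabulary, so only
    # vocab_size - len(special_tokens) learned pieces may survive.
    # Computing this target once, before the loop, keeps it stable
    # across pruning rounds.
    target = vocab_size - len(special_tokens)

    while len(pieces) > target:
        # Shrink gradually, but never drop below the target.
        keep = max(target, int(len(pieces) * shrink_factor))
        pieces = sorted(pieces, key=lambda p: p.score, reverse=True)[:keep]

    return special_tokens + [p.token for p in pieces]


# With 3 special tokens and vocab_size=100, exactly 100 entries come out,
# matching what the new test asserts via tokenizer.get_vocab_size().
vocab = prune_to_target(
    [Piece(f"tok{i}", -float(i)) for i in range(500)],
    ["[PAD]", "[SEP]", "[CLS]"],
    vocab_size=100,
)
assert len(vocab) == 100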

@@ -238,6 +238,28 @@ class TestUnigram:
            "[SEP]",
        ]

        tokenizer = Tokenizer(models.Unigram())
        trainer = trainers.UnigramTrainer(
            show_progress=False,
            special_tokens=["[PAD]", "[SEP]", "[CLS]"],
            unk_token="[UNK]",
            vocab_size=100,
        )
        tokenizer.train([filename], trainer=trainer)
        assert tokenizer.get_vocab_size() == 100

        tokenizer = Tokenizer(models.Unigram())
        trainer = trainers.UnigramTrainer(
            show_progress=False,
            special_tokens=["[PAD]", "[SEP]", "[CLS]", "[UNK]"],
            unk_token="[UNK]",
            vocab_size=100,
        )
        tokenizer.train([filename], trainer=trainer)
        assert tokenizer.get_vocab_size() == 100

    def test_cannot_train_different_model(self):
        tokenizer = Tokenizer(models.BPE())
        trainer = trainers.UnigramTrainer(show_progress=False)