Update `get_vocab_size` to compute the actual length of the vocabulary returned by `get_vocab`

This commit is contained in:
Arthur Zucker
2023-09-05 15:19:50 +00:00
parent f1da83f358
commit 531b06f6db

View File

@ -668,11 +668,10 @@ where
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
// TODO ArthurZ THIS IS WRONG! We need to measure the length of the `set` because
// now some tokens can be both in the added_tokens_encoder and in the vocab
self.model.get_vocab_size()
+ if with_added_tokens {
self.added_vocabulary.len()
if with_added_tokens {
self.get_vocab(with_added_tokens).len()
} else {
0
self.model.get_vocab_size()
}
}