Update `get_vocab_size` to compute the actual length of the vocabulary returned by `get_vocab`

This commit is contained in:
Arthur Zucker
2023-09-05 15:19:50 +00:00
parent f1da83f358
commit 531b06f6db

View File

@@ -668,12 +668,11 @@ where
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize { pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
// TODO ArthurZ THIS IS WRONG! We need to measure the length of the `set` because // TODO ArthurZ THIS IS WRONG! We need to measure the length of the `set` because
// now some tokens can be both in the added_tokens_encoder and in the vocab // now some tokens can be both in the added_tokens_encoder and in the vocab
self.model.get_vocab_size() if with_added_tokens {
+ if with_added_tokens { self.get_vocab(with_added_tokens).len()
self.added_vocabulary.len() } else {
} else { self.model.get_vocab_size()
0 }
}
} }
/// Converts a token in the corresponding id. /// Converts a token in the corresponding id.