Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)
update the get_vocab_size to compute the actual length of the vocabulary returned by get_vocab
@@ -668,11 +668,10 @@ where
     pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
         // TODO ArthurZ THIS IS WRONG! We need to measure the length of the `set` because
         // now some tokens can be both in the added_tokens_encoder and in the vocab
-        self.model.get_vocab_size()
-            + if with_added_tokens {
-                self.added_vocabulary.len()
-            } else {
-                0
-            }
+        if with_added_tokens {
+            self.get_vocab(with_added_tokens).len()
+        } else {
+            self.model.get_vocab_size()
+        }
     }
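Why this matters (a sketch of the rationale, not part of the commit): when the same token appears both in the added-tokens map and in the model vocabulary, summing model.get_vocab_size() and added_vocabulary.len() counts it twice, whereas taking the length of the merged map returned by get_vocab counts each unique token once. The following minimal, self-contained Rust sketch uses hypothetical plain HashMaps rather than the tokenizers API to illustrate the difference:

use std::collections::HashMap;

fn main() {
    // Hypothetical base model vocabulary (token -> id), simplified for illustration.
    let model_vocab: HashMap<&str, u32> =
        HashMap::from([("hello", 0), ("world", 1), ("<unk>", 2)]);

    // Hypothetical added tokens; note that "<unk>" is also present in the base vocab.
    let added_vocab: HashMap<&str, u32> =
        HashMap::from([("<unk>", 2), ("<mask>", 3)]);

    // Old approach: summing the two sizes counts "<unk>" twice.
    let summed = model_vocab.len() + added_vocab.len();

    // New approach: merge the maps (analogous to what get_vocab returns) and
    // measure the result, so each unique token is counted exactly once.
    let mut merged = model_vocab.clone();
    merged.extend(added_vocab.iter().map(|(k, v)| (*k, *v)));
    let actual = merged.len();

    assert_eq!(summed, 5); // over-counts: 3 + 2
    assert_eq!(actual, 4); // unique tokens: hello, world, <unk>, <mask>
    println!("summed = {summed}, actual = {actual}");
}

With overlapping tokens the summed size reports 5 while the merged vocabulary holds only 4 entries, which is the discrepancy the new get_vocab_size implementation avoids.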