Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
Update get_vocab_size to compute the actual length of get_vocab
@@ -668,12 +668,11 @@ where
     pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
         // TODO ArthurZ THIS IS WRONG! We need to measure the length of the `set` because
         // now some tokens can be both in the added_tokens_encoder and in the vocab
-        self.model.get_vocab_size()
-            + if with_added_tokens {
-                self.added_vocabulary.len()
-            } else {
-                0
-            }
+        if with_added_tokens {
+            self.get_vocab(with_added_tokens).len()
+        } else {
+            self.model.get_vocab_size()
+        }
     }
 
     /// Converts a token in the corresponding id.
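For illustration only, a minimal standalone Rust sketch of why the old arithmetic can overcount: it does not use the tokenizers API, and the names base_vocab, added_tokens, and merged are hypothetical. When an added token (here "<unk>") already exists in the base vocabulary, summing the two sizes counts it twice, while measuring the length of the merged vocabulary map (which is effectively what get_vocab(with_added_tokens).len() does) counts each distinct token once.

use std::collections::HashMap;

fn main() {
    // Hypothetical base model vocabulary (token -> id).
    let base_vocab: HashMap<&str, u32> =
        HashMap::from([("hello", 0), ("world", 1), ("<unk>", 2)]);

    // Hypothetical added tokens; note that "<unk>" is also in the base vocab.
    let added_tokens = ["<unk>", "<custom>"];

    // Old-style count: base size plus number of added tokens.
    // The duplicate "<unk>" is counted twice, so this reports 5.
    let old_count = base_vocab.len() + added_tokens.len();

    // New-style count: merge everything into one map, then measure its length.
    // The duplicate collapses, so this reports 4.
    let mut merged = base_vocab.clone();
    for (i, tok) in added_tokens.iter().copied().enumerate() {
        merged.entry(tok).or_insert((base_vocab.len() + i) as u32);
    }
    let new_count = merged.len();

    println!("old count = {old_count}, new count = {new_count}");
    assert_eq!(old_count, 5);
    assert_eq!(new_count, 4);
}

In this sketch the old sum reports 5 even though only 4 distinct tokens exist, which is the double-counting the commit message and the TODO comment refer to.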