Mirror of https://github.com/mii443/tokenizers.git
Handle vocab size with added tokens
@@ -36,9 +36,8 @@ impl Tokenizer {
         }
     }
 
-    #[getter]
-    fn get_vocab_size(&self) -> usize {
-        self.tokenizer.get_vocab_size()
+    fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
+        self.tokenizer.get_vocab_size(with_added_tokens)
     }
 
     fn with_model(&mut self, model: &mut Model) -> PyResult<()> {
@@ -210,8 +210,13 @@ impl Tokenizer {
     }
 
     /// Get the size of the vocabulary
-    pub fn get_vocab_size(&self) -> usize {
-        self.model.get_vocab_size()
+    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
+        self.model.get_vocab_size()
+            + if with_added_tokens {
+                self.added_tokens.len()
+            } else {
+                0
+            }
     }
 
     /// Converts a token in the corresponding id.
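The effect of this library-side change, restated as a standalone sketch (Model, added_tokens, and the example values below are simplified stand-ins, not the crate's real types): the reported size is the model vocabulary plus, only when requested, the count of added tokens.

// Self-contained sketch of the new counting rule; types are stand-ins.
struct Model {
    vocab: Vec<String>,
}

impl Model {
    fn get_vocab_size(&self) -> usize {
        self.vocab.len()
    }
}

struct Tokenizer {
    model: Model,
    added_tokens: Vec<String>,
}

impl Tokenizer {
    // Mirrors the patched method: model vocab, plus added tokens on demand.
    pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
        self.model.get_vocab_size()
            + if with_added_tokens {
                self.added_tokens.len()
            } else {
                0
            }
    }
}

fn main() {
    let tokenizer = Tokenizer {
        model: Model { vocab: vec!["hello".into(), "world".into()] },
        added_tokens: vec!["[PAD]".into(), "[CLS]".into()],
    };
    assert_eq!(tokenizer.get_vocab_size(false), 2); // model vocab only
    assert_eq!(tokenizer.get_vocab_size(true), 4);  // model vocab + added tokens
}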