Handle vocab size with added tokens

This commit is contained in:
Anthony MOI
2019-12-19 20:19:56 -05:00
parent b7040e0412
commit f2b9c30ad9
2 changed files with 8 additions and 4 deletions

View File

@ -36,9 +36,8 @@ impl Tokenizer {
}
}
#[getter]
fn get_vocab_size(&self) -> usize {
self.tokenizer.get_vocab_size()
fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
self.tokenizer.get_vocab_size(with_added_tokens)
}
fn with_model(&mut self, model: &mut Model) -> PyResult<()> {

View File

@ -210,8 +210,13 @@ impl Tokenizer {
}
/// Get the size of the vocabulary
pub fn get_vocab_size(&self) -> usize {
pub fn get_vocab_size(&self, with_added_tokens: bool) -> usize {
self.model.get_vocab_size()
+ if with_added_tokens {
self.added_tokens.len()
} else {
0
}
}
/// Converts a token into the corresponding id.