Expose vocabulary size

This commit is contained in:
Anthony MOI
2019-12-10 16:20:31 -05:00
parent 6c294c60b0
commit b4b31d73cd
4 changed files with 19 additions and 0 deletions

View File

@ -30,6 +30,11 @@ impl Tokenizer {
}
}
/// Accessor for the vocabulary size.
///
/// Returns the value reported by the wrapped `self.tokenizer.get_vocab_size()`.
// NOTE(review): `#[getter]` looks like the PyO3 attribute, which would expose
// this as a `vocab_size` property on the Python side — confirm against the
// enclosing `#[pymethods]` block, which is not visible in this hunk.
#[getter]
fn get_vocab_size(&self) -> usize {
self.tokenizer.get_vocab_size()
}
fn with_model(&mut self, model: &mut Model) -> PyResult<()> {
if let Some(model) = model.model.to_pointer() {
self.tokenizer.with_model(model);

View File

@ -96,6 +96,10 @@ impl BPE {
}
impl Model for BPE {
/// Returns the vocabulary size, i.e. the number of entries held in `self.vocab`.
fn get_vocab_size(&self) -> usize {
self.vocab.len()
}
fn tokenize(&self, sentence: Vec<String>) -> Vec<Token> {
if sentence.len() == 0 {
return vec![];

View File

@ -48,6 +48,10 @@ impl WordPiece {
}
impl Model for WordPiece {
/// Returns the vocabulary size, i.e. the number of entries held in `self.vocab`.
fn get_vocab_size(&self) -> usize {
self.vocab.len()
}
fn tokenize(&self, sentence: Vec<String>) -> Vec<Token> {
let mut output_tokens = vec![];

View File

@ -37,6 +37,7 @@ pub trait Model {
fn decode(&self, ids: Vec<u32>) -> Vec<String>;
fn token_to_id(&self, token: &str) -> Option<u32>;
fn id_to_token(&self, id: u32) -> Option<String>;
/// Returns the size of the model's vocabulary.
///
/// The `BPE` and `WordPiece` implementations both return `self.vocab.len()`.
fn get_vocab_size(&self) -> usize;
}
/// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
@ -166,6 +167,11 @@ impl Tokenizer {
self
}
/// Get the size of the vocabulary.
///
/// Delegates to the current model's `get_vocab_size()`, so the value is
/// whatever `self.model` reports.
pub fn get_vocab_size(&self) -> usize {
self.model.get_vocab_size()
}
/// Converts a token in the corresponding id.
pub fn token_to_id(&self, token: &str) -> Option<u32> {
self.model.token_to_id(token)