Add Tokenizer token_to_id/id_to_token

This commit is contained in:
Anthony MOI
2019-11-20 17:28:28 -05:00
parent 8b3d7d1aa0
commit 3ec26b332c
3 changed files with 28 additions and 0 deletions

View File

@ -153,6 +153,14 @@ impl Tokenizer {
})
.collect()
}
/// Looks up the id of `token` by forwarding the call to the wrapped `self.tokenizer`.
///
/// The `Option<u32>` produced by that call is returned unchanged.
fn token_to_id(&self, token: &str) -> Option<u32> {
self.tokenizer.token_to_id(token)
}
/// Looks up the token for `id` by forwarding the call to the wrapped `self.tokenizer`.
///
/// The `Option<String>` produced by that call is returned unchanged.
fn id_to_token(&self, id: u32) -> Option<String> {
self.tokenizer.id_to_token(id)
}
}
#[pymodule]

View File

@ -150,4 +150,12 @@ impl Model for BPE {
encoded
}
/// Returns the id stored for `token` in `self.vocab`, or `None` when the
/// token has no entry there.
fn token_to_id(&self, token: &str) -> Option<u32> {
    // `get` hands back a reference into the map; `copied` turns it into an owned id.
    self.vocab.get(token).copied()
}
/// Returns an owned copy of the token stored for `id` in `self.vocab_r`, or
/// `None` when the id has no entry there.
fn id_to_token(&self, id: u32) -> Option<String> {
    // `get` hands back a reference into the map; `cloned` produces the owned `String`
    // required by the return type.
    self.vocab_r.get(&id).cloned()
}
}

View File

@ -27,6 +27,8 @@ pub trait PreTokenizer {
/// Represents a `Model` used during Tokenization (like BPE, Word, or Unigram).
pub trait Model {
/// Turns the given strings into a sequence of `Token`s.
fn tokenize(&self, tokens: Vec<String>) -> Vec<Token>;
/// Converts a token to its corresponding id, if the model has one for it.
fn token_to_id(&self, token: &str) -> Option<u32>;
/// Converts an id to its corresponding token, if the model has one for it.
fn id_to_token(&self, id: u32) -> Option<String>;
}
/// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
@ -99,6 +101,16 @@ impl Tokenizer {
self
}
/// Converts a token to its corresponding id.
///
/// Delegates to `self.model` and returns its `Option<u32>` result unchanged.
pub fn token_to_id(&self, token: &str) -> Option<u32> {
self.model.token_to_id(token)
}
/// Converts an id to its corresponding token.
///
/// Delegates to `self.model` and returns its `Option<String>` result unchanged.
pub fn id_to_token(&self, id: u32) -> Option<String> {
self.model.id_to_token(id)
}
/// Encode the given sentence
pub fn encode(&self, sentence: &str) -> Vec<Token> {
let pre_tokenized = match &self.pre_tokenizer {