diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs
index cc898501..20bc44ac 100644
--- a/bindings/python/src/lib.rs
+++ b/bindings/python/src/lib.rs
@@ -153,6 +153,14 @@ impl Tokenizer {
             })
             .collect()
     }
+
+    fn token_to_id(&self, token: &str) -> Option<u32> {
+        self.tokenizer.token_to_id(token)
+    }
+
+    fn id_to_token(&self, id: u32) -> Option<String> {
+        self.tokenizer.id_to_token(id)
+    }
 }
 
 #[pymodule]
diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs
index 3ab37a2a..15e51aa5 100644
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -150,4 +150,12 @@ impl Model for BPE {
 
         encoded
     }
+
+    fn token_to_id(&self, token: &str) -> Option<u32> {
+        self.vocab.get(token).map(|id| *id)
+    }
+
+    fn id_to_token(&self, id: u32) -> Option<String> {
+        self.vocab_r.get(&id).map(|token| token.clone())
+    }
 }
diff --git a/tokenizers/src/tokenizer.rs b/tokenizers/src/tokenizer.rs
index d1625bf9..6e98456b 100644
--- a/tokenizers/src/tokenizer.rs
+++ b/tokenizers/src/tokenizer.rs
@@ -27,6 +27,8 @@ pub trait PreTokenizer {
 /// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram)
 pub trait Model {
     fn tokenize(&self, tokens: Vec<String>) -> Vec<Token>;
+    fn token_to_id(&self, token: &str) -> Option<u32>;
+    fn id_to_token(&self, id: u32) -> Option<String>;
 }
 
 /// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
@@ -99,6 +101,16 @@ impl Tokenizer {
         self
     }
 
+    /// Converts a token into the corresponding id.
+    pub fn token_to_id(&self, token: &str) -> Option<u32> {
+        self.model.token_to_id(token)
+    }
+
+    /// Converts an id to the corresponding token.
+    pub fn id_to_token(&self, id: u32) -> Option<String> {
+        self.model.id_to_token(id)
+    }
+
     /// Encode the given sentence
     pub fn encode(&self, sentence: &str) -> Vec<Token> {
        let pre_tokenized = match &self.pre_tokenizer {
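
The two new methods form an inverse pair over the model's vocabulary, with the Tokenizer simply delegating to its Model. A minimal Rust usage sketch follows; the BPE::from_files constructor, the Tokenizer::new(Box::new(...)) setup, and the vocab/merges file names are assumptions used for illustration and are not part of this patch.

use tokenizers::models::bpe::BPE;
use tokenizers::tokenizer::Tokenizer;

fn main() {
    // Hypothetical files; substitute a real vocab.json / merges.txt pair.
    let bpe = BPE::from_files("vocab.json", "merges.txt").unwrap();
    let tokenizer = Tokenizer::new(Box::new(bpe));

    // token_to_id returns None for tokens missing from the vocabulary.
    if let Some(id) = tokenizer.token_to_id("hello") {
        // id_to_token performs the reverse lookup, from id back to token.
        assert_eq!(tokenizer.id_to_token(id), Some("hello".to_string()));
    }
}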