Add Tokenizer token_to_id/id_to_token

2025-08-22 16:25:30 +00:00 · 2019-11-20 17:28:28 -05:00
parent 8b3d7d1aa0
commit 3ec26b332c
3 changed files with 28 additions and 0 deletions
--- a/bindings/python/src/lib.rs
+++ b/bindings/python/src/lib.rs
@ -153,6 +153,14 @@ impl Tokenizer {
            })
            .collect()
    }
+
+    /// Looks up the id of `token` by delegating to the wrapped tokenizer.
+    ///
+    /// The `Option` produced by the wrapped tokenizer is returned unchanged.
+    fn token_to_id(&self, token: &str) -> Option<u32> {
+        self.tokenizer.token_to_id(token)
+    }
+
+    /// Looks up the token for `id` by delegating to the wrapped tokenizer.
+    ///
+    /// The `Option` produced by the wrapped tokenizer is returned unchanged.
+    fn id_to_token(&self, id: u32) -> Option<String> {
+        self.tokenizer.id_to_token(id)
+    }
 }

 #[pymodule]
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@ -150,4 +150,12 @@ impl Model for BPE {

        encoded
    }
+
+    /// Returns the id stored in `vocab` for `token`, or `None` when the lookup
+    /// finds no entry.
+    fn token_to_id(&self, token: &str) -> Option<u32> {
+        // `copied()` converts the borrowed id from the lookup into an owned `u32`.
+        self.vocab.get(token).copied()
+    }
+
+    /// Returns an owned copy of the token stored in `vocab_r` for `id`, or `None`
+    /// when the lookup finds no entry.
+    fn id_to_token(&self, id: u32) -> Option<String> {
+        // The lookup yields a borrowed token; `cloned()` hands the caller its own `String`.
+        self.vocab_r.get(&id).cloned()
+    }
 }
--- a/tokenizers/src/tokenizer.rs
+++ b/tokenizers/src/tokenizer.rs
@ -27,6 +27,8 @@ pub trait PreTokenizer {
 /// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram)
 pub trait Model {
+    /// Produces the `Token`s for the given strings.
+    ///
+    /// NOTE(review): `tokens` is presumably the pre-tokenized sentence — confirm
+    /// against the caller.
    fn tokenize(&self, tokens: Vec<String>) -> Vec<Token>;
+    /// Returns the id the model associates with `token`, if any.
+    fn token_to_id(&self, token: &str) -> Option<u32>;
+    /// Returns the token the model associates with `id`, if any.
+    fn id_to_token(&self, id: u32) -> Option<String>;
 }

 /// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
@ -99,6 +101,16 @@ impl Tokenizer {
        self
    }

+    /// Converts a token to its corresponding id.
+    ///
+    /// Delegates to the underlying model and returns its result unchanged.
+    pub fn token_to_id(&self, token: &str) -> Option<u32> {
+        self.model.token_to_id(token)
+    }
+
+    /// Converts an id to its corresponding token.
+    ///
+    /// Delegates to the underlying model and returns its result unchanged.
+    pub fn id_to_token(&self, id: u32) -> Option<String> {
+        self.model.id_to_token(id)
+    }
+
    /// Encode the given sentence
    pub fn encode(&self, sentence: &str) -> Vec<Token> {
        let pre_tokenized = match &self.pre_tokenizer {