mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Add Tokenizer token_to_id/id_to_token
This commit is contained in:
@ -153,6 +153,14 @@ impl Tokenizer {
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Converts a token string into its vocabulary id.
///
/// Returns `None` when the token is not part of the vocabulary.
/// NOTE(review): this lives in the hunk `impl Tokenizer` next to a
/// `#[pymodule]` — presumably a Python-binding wrapper delegating to the
/// inner Rust tokenizer; confirm against the surrounding `#[pyclass]`.
fn token_to_id(&self, token: &str) -> Option<u32> {
    self.tokenizer.token_to_id(token)
}
|
||||
|
||||
/// Converts a vocabulary id back into its token string.
///
/// Returns `None` when the id does not map to any known token.
/// NOTE(review): counterpart of `token_to_id` above — presumably a
/// Python-binding wrapper delegating to the inner Rust tokenizer; confirm.
fn id_to_token(&self, id: u32) -> Option<String> {
    self.tokenizer.id_to_token(id)
}
|
||||
}
|
||||
|
||||
#[pymodule]
|
||||
|
@ -150,4 +150,12 @@ impl Model for BPE {
|
||||
|
||||
encoded
|
||||
}
|
||||
|
||||
fn token_to_id(&self, token: &str) -> Option<u32> {
|
||||
self.vocab.get(token).map(|id| *id)
|
||||
}
|
||||
|
||||
fn id_to_token(&self, id: u32) -> Option<String> {
|
||||
self.vocab_r.get(&id).map(|token| token.clone())
|
||||
}
|
||||
}
|
||||
|
@ -27,6 +27,8 @@ pub trait PreTokenizer {
|
||||
/// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram)
pub trait Model {
    /// Converts the pre-tokenized strings into `Token`s according to the model.
    fn tokenize(&self, tokens: Vec<String>) -> Vec<Token>;
    /// Returns the id associated with `token`, or `None` if the token is
    /// not part of the model's vocabulary.
    fn token_to_id(&self, token: &str) -> Option<u32>;
    /// Returns the token string associated with `id`, or `None` if the id
    /// is out of the model's vocabulary.
    fn id_to_token(&self, id: u32) -> Option<String>;
}
|
||||
|
||||
/// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
|
||||
@ -99,6 +101,16 @@ impl Tokenizer {
|
||||
self
|
||||
}
|
||||
|
||||
/// Converts a token into the corresponding id.
///
/// Delegates to the configured [`Model`]; returns `None` when the token
/// is not part of the model's vocabulary.
pub fn token_to_id(&self, token: &str) -> Option<u32> {
    self.model.token_to_id(token)
}
|
||||
|
||||
/// Converts an id to the corresponding token.
///
/// Delegates to the configured [`Model`]; returns `None` when the id is
/// out of the model's vocabulary.
pub fn id_to_token(&self, id: u32) -> Option<String> {
    self.model.id_to_token(id)
}
|
||||
|
||||
/// Encode the given sentence
|
||||
pub fn encode(&self, sentence: &str) -> Vec<Token> {
|
||||
let pre_tokenized = match &self.pre_tokenizer {
|
||||
|
Reference in New Issue
Block a user