Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Ability to decode with added tokens
@@ -184,15 +184,6 @@ impl Model for BPE {
         Ok(encoded)
     }
 
-    fn decode(&self, ids: Vec<u32>) -> Result<Vec<String>> {
-        Ok(ids
-            .into_iter()
-            .map(|id| self.vocab_r.get(&id))
-            .filter(|token| token.is_some())
-            .map(|id| id.unwrap().clone())
-            .collect())
-    }
-
     fn token_to_id(&self, token: &str) -> Option<u32> {
         self.vocab.get(token).copied()
     }
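The decode body removed above (WordPiece drops an identical copy in the next hunk) was a plain reverse-vocabulary lookup that silently skipped any id without an entry in vocab_r, which is why ids belonging to tokens added on top of the model never made it back out. A minimal standalone sketch of that lookup, assuming nothing but a bare HashMap in place of the BPE model (the function and variable names are illustrative, not the crate's API):

use std::collections::HashMap;

// Sketch of the removed per-model decode: resolve each id through the
// reverse vocabulary and drop ids that have no entry.
fn decode_with_reverse_vocab(vocab_r: &HashMap<u32, String>, ids: Vec<u32>) -> Vec<String> {
    ids.into_iter()
        .filter_map(|id| vocab_r.get(&id).cloned())
        .collect()
}

fn main() {
    let vocab_r: HashMap<u32, String> =
        [(0, "hello".to_string()), (1, "world".to_string())].into();
    // Id 42 has no entry in the model vocabulary (for example an added token),
    // so this style of decode simply loses it.
    assert_eq!(decode_with_reverse_vocab(&vocab_r, vec![0, 42, 1]), ["hello", "world"]);
}

The .filter(|token| token.is_some()) / .map(|id| id.unwrap()) pair in the original collapses into filter_map here; the observable behaviour is the same.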
@@ -141,15 +141,6 @@ impl Model for WordPiece {
         Ok(output_tokens)
     }
 
-    fn decode(&self, ids: Vec<u32>) -> Result<Vec<String>> {
-        Ok(ids
-            .into_iter()
-            .map(|id| self.vocab_r.get(&id))
-            .filter(|token| token.is_some())
-            .map(|id| id.unwrap().clone())
-            .collect())
-    }
-
     fn token_to_id(&self, token: &str) -> Option<u32> {
         self.vocab.get(token).copied()
     }
@@ -38,7 +38,6 @@ pub trait PreTokenizer {
 /// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram)
 pub trait Model {
     fn tokenize(&self, tokens: Vec<String>) -> Result<Vec<Token>>;
-    fn decode(&self, ids: Vec<u32>) -> Result<Vec<String>>;
     fn token_to_id(&self, token: &str) -> Option<u32>;
     fn id_to_token(&self, id: u32) -> Option<String>;
     fn get_vocab_size(&self) -> usize;
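With decode removed from the trait, id_to_token is the only per-id lookup a Model still has to provide, so the Vec<u32> -> Vec<String> loop can be written once against the trait instead of once per model. A simplified sketch of that idea (the trait and types below are illustrative stand-ins, not the crate's real definitions):

use std::collections::HashMap;

// Reduced trait: per-id lookups only, no decode.
trait Model {
    fn token_to_id(&self, token: &str) -> Option<u32>;
    fn id_to_token(&self, id: u32) -> Option<String>;
}

// Decoding derived from id_to_token alone, shared by every implementor.
fn decode_ids<M: Model>(model: &M, ids: Vec<u32>) -> Vec<String> {
    ids.into_iter().filter_map(|id| model.id_to_token(id)).collect()
}

// A toy word-level model is enough to satisfy the reduced trait.
struct Toy {
    vocab: HashMap<String, u32>,
    vocab_r: HashMap<u32, String>,
}

impl Model for Toy {
    fn token_to_id(&self, token: &str) -> Option<u32> {
        self.vocab.get(token).copied()
    }
    fn id_to_token(&self, id: u32) -> Option<String> {
        self.vocab_r.get(&id).cloned()
    }
}

fn main() {
    let vocab_r: HashMap<u32, String> = [(7, "bpe".to_string())].into();
    let vocab = vocab_r.iter().map(|(id, tok)| (tok.clone(), *id)).collect();
    let toy = Toy { vocab, vocab_r };
    assert_eq!(toy.token_to_id("bpe"), Some(7));
    assert_eq!(decode_ids(&toy, vec![7]), ["bpe"]);
}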
@@ -292,7 +291,18 @@ impl Tokenizer {
 
     /// Decode the given ids, back to a String
     pub fn decode(&self, ids: Vec<u32>) -> Result<String> {
-        let tokens = self.model.decode(ids)?;
+        let tokens = ids
+            .into_iter()
+            .map(|id| {
+                if let Some(token) = self.added_tokens_r.get(&id) {
+                    Some(token.content.to_owned())
+                } else {
+                    self.model.id_to_token(id)
+                }
+            })
+            .filter(|token| token.is_some())
+            .map(|id| id.unwrap())
+            .collect::<Vec<_>>();
 
         if let Some(decoder) = &self.decoder {
             decoder.decode(tokens)
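The replacement body resolves each id through added_tokens_r first and only falls back to the model's vocabulary via id_to_token, which is the point of the commit: tokens added on top of the model now survive decoding instead of being filtered out. A toy reconstruction of that flow, with illustrative struct and field names standing in for the real Tokenizer, and a plain join where the real code hands the tokens to an optional Decoder:

use std::collections::HashMap;

// Toy stand-in for the Tokenizer: `added_tokens_r` follows the field name in
// the diff, everything else here is illustrative.
struct ToyTokenizer {
    model_vocab_r: HashMap<u32, String>,  // stands in for `self.model.id_to_token`
    added_tokens_r: HashMap<u32, String>, // id -> content of tokens added on top of the model
}

impl ToyTokenizer {
    fn decode(&self, ids: Vec<u32>) -> String {
        let tokens: Vec<String> = ids
            .into_iter()
            .filter_map(|id| {
                // Added tokens take priority, then the model vocabulary.
                if let Some(content) = self.added_tokens_r.get(&id) {
                    Some(content.clone())
                } else {
                    self.model_vocab_r.get(&id).cloned()
                }
            })
            .collect();
        // The real decode passes `tokens` to `self.decoder` when one is set;
        // a space join stands in for that step here.
        tokens.join(" ")
    }
}

fn main() {
    let tokenizer = ToyTokenizer {
        model_vocab_r: [(0, "hello".to_string()), (1, "world".to_string())].into(),
        added_tokens_r: [(2, "[NEW]".to_string())].into(),
    };
    // Id 2 exists only in `added_tokens_r`; before this change it would have
    // been dropped, now its content comes back.
    assert_eq!(tokenizer.decode(vec![0, 2, 1]), "hello [NEW] world");
}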