mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Revert "[BREAKING CHANGE] Ignore added_tokens (both special and not) … (#1569)
* Revert "[BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder (#1513)"
This reverts commit 25aee8b88c.
* don't remove audit
* deprecate id_to_token
* use simple id to token
* don't break id_to_token since we are deprecating anyways?
This commit is contained in:
@@ -847,35 +847,23 @@ where
|
|||||||
|
|
||||||
/// Decode the given ids, back to a String
|
/// Decode the given ids, back to a String
|
||||||
pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> {
|
pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> {
|
||||||
let mut result = String::with_capacity(ids.len());
|
let tokens = ids
|
||||||
let mut chunks = Vec::with_capacity(ids.len());
|
.iter()
|
||||||
for id in ids {
|
.filter_map(|id| {
|
||||||
if let Some(added_token) = self.added_vocabulary.simple_id_to_token(*id) {
|
self.added_vocabulary
|
||||||
if skip_special_tokens && self.added_vocabulary.is_special_token(&added_token) {
|
.simple_id_to_token(*id)
|
||||||
continue;
|
.or_else(|| self.model.id_to_token(*id))
|
||||||
}
|
.filter(|token| {
|
||||||
let text_chunk = if let Some(decoder) = &self.decoder {
|
!skip_special_tokens || !self.added_vocabulary.is_special_token(token)
|
||||||
decoder.decode(chunks.clone())?
|
})
|
||||||
|
})
|
||||||
|
.collect::<Vec<_>>();
|
||||||
|
|
||||||
|
if let Some(decoder) = &self.decoder {
|
||||||
|
decoder.decode(tokens)
|
||||||
} else {
|
} else {
|
||||||
chunks.join(" ")
|
Ok(tokens.join(" "))
|
||||||
};
|
|
||||||
result.push_str(&text_chunk);
|
|
||||||
if !result.is_empty() && self.decoder.is_none() {
|
|
||||||
result.push(' ');
|
|
||||||
}
|
}
|
||||||
result.push_str(&added_token);
|
|
||||||
chunks.clear();
|
|
||||||
} else if let Some(token) = self.model.id_to_token(*id) {
|
|
||||||
chunks.push(token);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
let text_chunk = if let Some(decoder) = &self.decoder {
|
|
||||||
decoder.decode(chunks.clone())?
|
|
||||||
} else {
|
|
||||||
chunks.join(" ")
|
|
||||||
};
|
|
||||||
result.push_str(&text_chunk);
|
|
||||||
Ok(result)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Reference in New Issue
Block a user