Revert "[BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder (#1513)" (#1569)

* Revert "[BREAKING CHANGE] Ignore added_tokens (both special and not) in the decoder (#1513)"

This reverts commit 25aee8b88c.

* don't remove audit

* deprecate id_to_token

* use simple id to token

* don't break id_to_token since we are deprecating anyways?
This commit is contained in:
Arthur
2024-07-12 07:29:40 +02:00
committed by GitHub
parent fdd26ba9a3
commit f2a44dc5d1

View File

@@ -847,35 +847,23 @@ where
/// Decode the given ids, back to a String /// Decode the given ids, back to a String
pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> { pub fn decode(&self, ids: &[u32], skip_special_tokens: bool) -> Result<String> {
let mut result = String::with_capacity(ids.len()); let tokens = ids
let mut chunks = Vec::with_capacity(ids.len()); .iter()
for id in ids { .filter_map(|id| {
if let Some(added_token) = self.added_vocabulary.simple_id_to_token(*id) { self.added_vocabulary
if skip_special_tokens && self.added_vocabulary.is_special_token(&added_token) { .simple_id_to_token(*id)
continue; .or_else(|| self.model.id_to_token(*id))
} .filter(|token| {
let text_chunk = if let Some(decoder) = &self.decoder { !skip_special_tokens || !self.added_vocabulary.is_special_token(token)
decoder.decode(chunks.clone())? })
})
.collect::<Vec<_>>();
if let Some(decoder) = &self.decoder {
decoder.decode(tokens)
} else { } else {
chunks.join(" ") Ok(tokens.join(" "))
};
result.push_str(&text_chunk);
if !result.is_empty() && self.decoder.is_none() {
result.push(' ');
} }
result.push_str(&added_token);
chunks.clear();
} else if let Some(token) = self.model.id_to_token(*id) {
chunks.push(token);
}
}
let text_chunk = if let Some(decoder) = &self.decoder {
decoder.decode(chunks.clone())?
} else {
chunks.join(" ")
};
result.push_str(&text_chunk);
Ok(result)
} }
} }