diff --git a/bindings/node/native/src/tokenizer.rs b/bindings/node/native/src/tokenizer.rs
index f4a49867..d034de9e 100644
--- a/bindings/node/native/src/tokenizer.rs
+++ b/bindings/node/native/src/tokenizer.rs
@@ -502,11 +502,7 @@ declare_types! {
             let guard = cx.lock();
             let borrowed = this.borrow(&guard);
             let normalizer = borrowed.tokenizer.get_normalizer();
-            if let Some(normalizer) = normalizer {
-                Some(Container::from_ref(normalizer))
-            } else {
-                None
-            }
+            normalizer.map(|normalizer| { Container::from_ref(normalizer) })
         };
 
         if let Some(normalizer) = normalizer {
@@ -561,11 +557,7 @@ declare_types! {
             let guard = cx.lock();
             let borrowed = this.borrow(&guard);
             let pretok = borrowed.tokenizer.get_pre_tokenizer();
-            if let Some(pretok) = pretok {
-                Some(Container::from_ref(pretok))
-            } else {
-                None
-            }
+            pretok.map(|pretok| { Container::from_ref(pretok) })
         };
 
         if let Some(pretok) = pretok {
@@ -620,11 +612,7 @@ declare_types! {
             let guard = cx.lock();
             let borrowed = this.borrow(&guard);
             let processor = borrowed.tokenizer.get_post_processor();
-            if let Some(processor) = processor {
-                Some(Container::from_ref(processor))
-            } else {
-                None
-            }
+            processor.map(|processor| { Container::from_ref(processor) })
         };
 
         if let Some(processor) = processor {
@@ -679,11 +667,7 @@ declare_types! {
             let guard = cx.lock();
             let borrowed = this.borrow(&guard);
             let decoder = borrowed.tokenizer.get_decoder();
-            if let Some(decoder) = decoder {
-                Some(Container::from_ref(decoder))
-            } else {
-                None
-            }
+            decoder.map(|decoder| { Container::from_ref(decoder) })
        };
 
         if let Some(decoder) = decoder {
diff --git a/bindings/python/src/error.rs b/bindings/python/src/error.rs
index 104cd12d..d7f27e29 100644
--- a/bindings/python/src/error.rs
+++ b/bindings/python/src/error.rs
@@ -23,9 +23,6 @@ impl std::error::Error for PyError {}
 pub struct ToPyResult<T>(pub Result<T>);
 impl<T> std::convert::Into<PyResult<T>> for ToPyResult<T> {
     fn into(self) -> PyResult<T> {
-        match self.0 {
-            Ok(o) => Ok(o),
-            Err(e) => Err(exceptions::Exception::py_err(format!("{}", e))),
-        }
+        self.0.map_err(|e| { exceptions::Exception::py_err(format!("{}", e)) })
     }
 }
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index e7dab0ff..4be1f423 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -40,7 +40,7 @@ serde_json = "1.0"
 clap = "2.33.0"
 unicode-normalization-alignments = "0.1.12"
 unicode_categories = "0.1.1"
-indicatif = "0.13.0"
+indicatif = "0.14.0"
 
 [dev-dependencies]
 criterion = "0.3.0"
diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs
index 77931b4e..36613e64 100644
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -153,10 +153,7 @@ impl Clone for BPE {
     // `Clone` can't be derive because it's not implemented for `Cache`.
     // To keep things simple when we clone, the new BPE will start with a fresh cache.
     fn clone(&self) -> Self {
-        let fresh_cache = match self.cache {
-            Some(ref cache) => Some(cache.fresh()),
-            None => None,
-        };
+        let fresh_cache = self.cache.as_ref().map(|cache| cache.fresh());
         Self {
             vocab: self.vocab.clone(),
             vocab_r: self.vocab_r.clone(),
@@ -359,10 +356,10 @@ impl Model for BPE {
         let mut encoded: Vec<Token> = Vec::with_capacity(sentence.len());
 
         let mut cached_words = match self.dropout {
-            None => match self.cache {
-                Some(ref cache) => cache.get_values(sentence.iter().map(|(s, _)| s.clone())),
-                None => None,
-            },
+            None => self
+                .cache
+                .as_ref()
+                .and_then(|cache| cache.get_values(sentence.iter().map(|(s, _)| s.clone()))),
             Some(_) => None, // If using dropout we don't want to use the cache.
         };
         let mut should_update_cache = false;
@@ -446,10 +443,9 @@ impl Model for BPE {
         merges_file.write_all(
             &merges
                 .into_iter()
-                .map(|(pair, _)| {
+                .flat_map(|(pair, _)| {
                     format!("{} {}\n", self.vocab_r[&pair.0], self.vocab_r[&pair.1]).into_bytes()
                 })
-                .flatten()
                 .collect::<Vec<_>>()[..],
         )?;
 
diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs
index 272db4f0..935749bc 100644
--- a/tokenizers/src/models/wordpiece/mod.rs
+++ b/tokenizers/src/models/wordpiece/mod.rs
@@ -261,8 +261,7 @@ impl Model for WordPiece {
         vocab_file.write_all(
             &vocab
                 .into_iter()
-                .map(|(token, _)| format!("{}\n", token).as_bytes().to_owned())
-                .flatten()
+                .flat_map(|(token, _)| format!("{}\n", token).as_bytes().to_owned())
                 .collect::<Vec<_>>()[..],
         )?;
 
diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index fd3e18b4..3faf8619 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -194,9 +194,8 @@ mod tests {
         for sample in samples {
             let pre_tokenized = bl.pre_tokenize(&sample).unwrap();
             let separated_tokens = pre_tokenized
-                .into_iter()
-                .map(|(token, _)| token.split("").map(|t| t.into()).collect::<Vec<String>>())
-                .flatten()
+                .iter()
+                .flat_map(|(token, _)| token.split("").map(|t| t.into()))
                 .collect::<Vec<String>>();
             assert_eq!(sample, bl.decode(separated_tokens).unwrap());
         }
diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs
index c0d7a89b..9b8a3a03 100644
--- a/tokenizers/src/pre_tokenizers/metaspace.rs
+++ b/tokenizers/src/pre_tokenizers/metaspace.rs
@@ -57,9 +57,8 @@ impl PreTokenizer for Metaspace {
 impl Decoder for Metaspace {
     fn decode(&self, tokens: Vec<String>) -> Result<String> {
         Ok(tokens
-            .into_iter()
-            .map(|t| t.chars().collect::<Vec<_>>())
-            .flatten()
+            .iter()
+            .flat_map(|t| t.chars())
             .enumerate()
             .map(|(i, c)| {
                 if c == self.replacement {
diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index 0e05301f..d723f821 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -9,7 +9,7 @@ impl PreTokenizer for Whitespace {
         }
         Ok(RE
             .captures_iter(s)
-            .map(|captures| {
+            .flat_map(|captures| {
                 captures
                     .iter()
                     .map(|m| {
@@ -21,7 +21,6 @@ impl PreTokenizer for Whitespace {
                     })
                     .collect::<Vec<_>>()
             })
-            .flatten()
             .collect())
     }
 }
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 90b6b486..607d7cdd 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -679,7 +679,7 @@ impl Tokenizer {
         let mut start_offset = 0;
         let mut splits = splits
            .into_iter()
-            .map(|(start, end)| {
+            .flat_map(|(start, end)| {
                 let mut splits = vec![];
                 if start_offset < start {
                     splits.push((start_offset, start));
                 }
@@ -689,7 +689,6 @@ impl Tokenizer {
 
                 splits
             })
-            .flatten()
             .collect::<Vec<_>>();
         if let Some((_, end)) = splits.iter().last().copied() {
             if end < sentence.len() {
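
Taken together, the diff applies two idiomatic rewrites throughout the codebase: an explicit `match`/`if let` over an `Option` becomes a combinator call (`Option::map`, `and_then`, `map_err`), and `.map(..).flatten()` chains become `.flat_map(..)`. The snippet below is a minimal illustrative sketch only, using made-up values rather than types from this repository, to show that the two forms are equivalent.

```rust
fn main() {
    // Hypothetical cache value, standing in for fields like `BPE::cache`.
    let cache: Option<Vec<u32>> = Some(vec![1, 2, 3]);

    // Before: explicit match on the Option, as in the removed code.
    let len_verbose = match cache {
        Some(ref c) => Some(c.len()),
        None => None,
    };

    // After: Option::map expresses the same transformation directly.
    let len_concise = cache.as_ref().map(|c| c.len());
    assert_eq!(len_verbose, len_concise);

    // .map(..).flatten() over an iterator collapses into .flat_map(..).
    let words = vec!["ab", "cd"];
    let chars_verbose: Vec<char> = words.iter().map(|w| w.chars()).flatten().collect();
    let chars_concise: Vec<char> = words.iter().flat_map(|w| w.chars()).collect();
    assert_eq!(chars_verbose, chars_concise);
}
```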