From 6b1028d5509bcac32ad1a52c2a80b230e649f04f Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Fri, 13 Dec 2019 17:52:31 -0500
Subject: [PATCH] Add clippy warnings + fix all of them

---
 tokenizers/src/lib.rs                       |  2 ++
 tokenizers/src/models/bpe/cache.rs          |  5 ++---
 tokenizers/src/models/bpe/model.rs          | 14 +++++++-------
 tokenizers/src/models/bpe/trainer.rs        |  6 ++++--
 tokenizers/src/models/bpe/word.rs           |  2 +-
 tokenizers/src/models/wordpiece/mod.rs      |  4 ++--
 tokenizers/src/pre_tokenizers/whitespace.rs |  2 +-
 tokenizers/src/processors/bert.rs           | 10 +++++-----
 tokenizers/src/tokenizer/encoding.rs        |  9 +++++----
 tokenizers/src/tokenizer/mod.rs             |  4 ++--
 tokenizers/src/utils.rs                     |  6 +++---
 11 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs
index 04d7d50c..5af84a15 100644
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -1,3 +1,5 @@
+#![warn(clippy::all)]
+
 #[macro_use]
 extern crate lazy_static;
 
diff --git a/tokenizers/src/models/bpe/cache.rs b/tokenizers/src/models/bpe/cache.rs
index db318300..18c76406 100644
--- a/tokenizers/src/models/bpe/cache.rs
+++ b/tokenizers/src/models/bpe/cache.rs
@@ -10,6 +10,7 @@ use std::sync::Mutex;
 /// The goal is clearly not the accuracy of the content, both get and set
 /// are not guaranteed to actually get or set.
 ///
+#[derive(Default)]
 pub struct Cache<K, V>
 where
     K: Eq + Hash + Clone,
@@ -32,9 +33,7 @@ where
     pub fn get_values(&self, keys: &[K]) -> Vec<Option<V>> {
         let mut lock = self.map.try_lock();
         if let Ok(ref mut cache) = lock {
-            keys.iter()
-                .map(|k| cache.get(k).map(|v| v.clone()))
-                .collect()
+            keys.iter().map(|k| cache.get(k).cloned()).collect()
         } else {
             keys.iter().map(|_| None).collect()
         }
diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs
index 6cb34fe1..835e6eea 100644
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -69,14 +69,14 @@ impl BPE {
                 continue;
             }
 
-            let parts = line.split(" ").collect::<Vec<_>>();
+            let parts = line.split(' ').collect::<Vec<_>>();
             let a = vocab
                 .get(parts[0])
-                .ok_or(Error::MergeTokenOutOfVocabulary(parts[0].to_owned()))?;
+                .ok_or_else(|| Error::MergeTokenOutOfVocabulary(parts[0].to_owned()))?;
             let b = vocab
                 .get(parts[1])
-                .ok_or(Error::MergeTokenOutOfVocabulary(parts[1].to_owned()))?;
+                .ok_or_else(|| Error::MergeTokenOutOfVocabulary(parts[1].to_owned()))?;
             let pair = (*a, *b);
             let new_token = format!("{}{}", parts[0], parts[1]);
             let new_id = vocab
@@ -101,7 +101,7 @@ impl Model for BPE {
     }
 
     fn tokenize(&self, sentence: Vec<String>) -> Result<Vec<Token>> {
-        if sentence.len() == 0 {
+        if sentence.is_empty() {
             return Ok(vec![]);
         }
 
@@ -109,7 +109,7 @@ impl Model for BPE {
         let mut cached_words = self.cache.get_values(&sentence);
 
         for (i, w) in sentence.iter().enumerate() {
-            if let None = cached_words[i] {
+            if cached_words[i].is_none() {
                 let mut word = Word::new();
                 for c in w.chars() {
                     match self.vocab.get(&c.to_string()) {
@@ -194,10 +194,10 @@ impl Model for BPE {
     }
 
     fn token_to_id(&self, token: &str) -> Option<u32> {
-        self.vocab.get(token).map(|id| *id)
+        self.vocab.get(token).copied()
     }
 
     fn id_to_token(&self, id: u32) -> Option<String> {
-        self.vocab_r.get(&id).map(|token| token.clone())
+        self.vocab_r.get(&id).cloned()
     }
 }
diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs
index c0e4d777..88769606 100644
--- a/tokenizers/src/models/bpe/trainer.rs
+++ b/tokenizers/src/models/bpe/trainer.rs
@@ -3,6 +3,8 @@
 //!
 //! In charge of training a BPE model
 //!
+#![allow(clippy::map_entry)]
+
 use super::{Pair, Word, BPE};
 use crate::tokenizer::{Model, Result, Trainer};
 use std::{
@@ -87,7 +89,7 @@ impl Trainer for BpeTrainer {
 
                 // Initialize pair_counts and where_to_update for this pair if we just saw it
                 if !pair_counts.contains_key(&cur_pair) {
-                    let pair = (0, cur_pair.clone());
+                    let pair = (0, cur_pair);
                     pair_counts.insert(cur_pair, pair);
                     if !where_to_update.contains_key(&cur_pair) {
                         where_to_update.insert(cur_pair, HashSet::new());
@@ -125,7 +127,7 @@ impl Trainer for BpeTrainer {
             // Find the best pair
             let mut best_count = 0;
             let mut best_pair = (std::u32::MAX, std::u32::MAX);
-            for (_, x) in &pair_counts {
+            for x in pair_counts.values() {
                 if x.0 > best_count {
                     best_count = x.0;
                     best_pair = x.1;
diff --git a/tokenizers/src/models/bpe/word.rs b/tokenizers/src/models/bpe/word.rs
index 7f91e6bf..541ddac5 100644
--- a/tokenizers/src/models/bpe/word.rs
+++ b/tokenizers/src/models/bpe/word.rs
@@ -1,7 +1,7 @@
 use super::Pair;
 
 // TODO: Add tests
-#[derive(Clone)]
+#[derive(Clone, Default)]
 pub struct Word {
     chars: Vec<u32>,
     sizes: Vec<usize>,
diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs
index 339d6ff1..cd62a104 100644
--- a/tokenizers/src/models/wordpiece/mod.rs
+++ b/tokenizers/src/models/wordpiece/mod.rs
@@ -151,11 +151,11 @@ impl Model for WordPiece {
     }
 
     fn token_to_id(&self, token: &str) -> Option<u32> {
-        self.vocab.get(token).map(|id| *id)
+        self.vocab.get(token).copied()
     }
 
     fn id_to_token(&self, id: u32) -> Option<String> {
-        self.vocab_r.get(&id).map(|token| token.clone())
+        self.vocab_r.get(&id).cloned()
     }
 }
 
diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index a3ed1232..832e0aa0 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -14,7 +14,7 @@ impl PreTokenizer for Whitespace {
             .iter()
             .map(|m| {
                 m.map(|capture| s[capture.start()..capture.end()].to_owned())
-                    .unwrap_or(String::from(""))
+                    .unwrap_or_else(|| String::from(""))
             })
             .collect()
         })
diff --git a/tokenizers/src/processors/bert.rs b/tokenizers/src/processors/bert.rs
index 21a9e9e9..1d97f109 100644
--- a/tokenizers/src/processors/bert.rs
+++ b/tokenizers/src/processors/bert.rs
@@ -83,13 +83,13 @@ impl PostProcessor for BertProcessing {
                     .map(|e| e.get_normalized())
                     .unwrap_or("")
             ),
-            [&ids[..], &pair_ids.unwrap_or(vec![])[..]].concat(),
-            [&type_ids[..], &pair_type_ids.unwrap_or(vec![])[..]].concat(),
-            [&tokens[..], &pair_tokens.unwrap_or(vec![])[..]].concat(),
-            [&offsets[..], &pair_offsets.unwrap_or(vec![])[..]].concat(),
+            [&ids[..], &pair_ids.unwrap_or_else(|| vec![])[..]].concat(),
+            [&type_ids[..], &pair_type_ids.unwrap_or_else(|| vec![])[..]].concat(),
+            [&tokens[..], &pair_tokens.unwrap_or_else(|| vec![])[..]].concat(),
+            [&offsets[..], &pair_offsets.unwrap_or_else(|| vec![])[..]].concat(),
             [
                 &special_tokens[..],
-                &pair_special_tokens.unwrap_or(vec![])[..],
+                &pair_special_tokens.unwrap_or_else(|| vec![])[..],
             ]
             .concat(),
             attention_mask,
diff --git a/tokenizers/src/tokenizer/encoding.rs b/tokenizers/src/tokenizer/encoding.rs
index a6e632a4..0cca9dbb 100644
--- a/tokenizers/src/tokenizer/encoding.rs
+++ b/tokenizers/src/tokenizer/encoding.rs
@@ -12,6 +12,7 @@ pub struct Encoding {
     overflowing: Option<Box<Encoding>>,
 }
 impl Encoding {
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         original: String,
         normalized: String,
@@ -118,8 +119,8 @@ impl Encoding {
     }
 
     pub fn merge_with(&mut self, pair: Encoding) {
-        self.original.extend(pair.original.chars());
-        self.normalized.extend(pair.normalized.chars());
+        self.original.push_str(&pair.original);
+        self.normalized.push_str(&pair.normalized);
         self.ids.extend(pair.ids);
         self.type_ids.extend(pair.type_ids);
         self.tokens.extend(pair.tokens);
@@ -142,12 +143,12 @@ impl Encoding {
 
 /// Prepend the `stride` last elements of the `previous` Vec to the current Vec
 // A new Vec is instantiated though.
-fn prepend_stride<T: Clone>(previous: &Vec<T>, current: Vec<T>, stride: usize) -> Vec<T> {
+fn prepend_stride<T: Clone>(previous: &[T], current: Vec<T>, stride: usize) -> Vec<T> {
     let prev = previous
         .iter()
         .rev()
         .take(stride)
-        .map(|v| v.clone())
+        .cloned()
         .rev()
         .collect::<Vec<_>>();
 
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 32fcf029..afc77318 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -236,7 +236,7 @@ impl Tokenizer {
     }
 
     /// Train a model and replace our current Model, using the given Trainer
-    pub fn train(&mut self, trainer: &Box<dyn Trainer>, files: Vec<String>) -> Result<()> {
+    pub fn train(&mut self, trainer: &dyn Trainer, files: Vec<String>) -> Result<()> {
         let results = files
             .par_iter()
             .map(|file| -> Result<HashMap<String, u32>> {
@@ -284,7 +284,7 @@ impl Tokenizer {
         if let Some(normalizer) = &self.normalizer {
             normalizer.normalize(sentence)
         } else {
-            Ok(sentence.to_owned())
+            Ok(sentence)
         }
     }
 
diff --git a/tokenizers/src/utils.rs b/tokenizers/src/utils.rs
index aa5fb935..6f45ce5f 100644
--- a/tokenizers/src/utils.rs
+++ b/tokenizers/src/utils.rs
@@ -52,9 +52,9 @@ pub fn truncate_encodings(
             }
 
             encoding.truncate(encoding.get_ids().len() - n_first, stride);
-            pair_encoding
-                .as_mut()
-                .map(|encoding| encoding.truncate(encoding.get_ids().len() - n_second, stride));
+            if let Some(encoding) = pair_encoding.as_mut() {
+                encoding.truncate(encoding.get_ids().len() - n_second, stride);
+            }
         }
         TruncationStrategy::OnlyFirst | TruncationStrategy::OnlySecond => {
            let target = if strategy == TruncationStrategy::OnlyFirst {