From 6b1028d5509bcac32ad1a52c2a80b230e649f04f Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Fri, 13 Dec 2019 17:52:31 -0500
Subject: [PATCH] Add clippy warnings + fix all of them

---
 tokenizers/src/lib.rs                       |  2 ++
 tokenizers/src/models/bpe/cache.rs          |  5 ++---
 tokenizers/src/models/bpe/model.rs          | 14 +++++++-------
 tokenizers/src/models/bpe/trainer.rs        |  6 ++++--
 tokenizers/src/models/bpe/word.rs           |  2 +-
 tokenizers/src/models/wordpiece/mod.rs      |  4 ++--
 tokenizers/src/pre_tokenizers/whitespace.rs |  2 +-
 tokenizers/src/processors/bert.rs           | 10 +++++-----
 tokenizers/src/tokenizer/encoding.rs        |  9 +++++----
 tokenizers/src/tokenizer/mod.rs             |  4 ++--
 tokenizers/src/utils.rs                     |  6 +++---
 11 files changed, 34 insertions(+), 30 deletions(-)

diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs
index 04d7d50c..5af84a15 100644
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -1,3 +1,5 @@
+#![warn(clippy::all)]
+
 #[macro_use]
 extern crate lazy_static;
 
diff --git a/tokenizers/src/models/bpe/cache.rs b/tokenizers/src/models/bpe/cache.rs
index db318300..18c76406 100644
--- a/tokenizers/src/models/bpe/cache.rs
+++ b/tokenizers/src/models/bpe/cache.rs
@@ -10,6 +10,7 @@ use std::sync::Mutex;
 /// The goal is clearly not the accuracy of the content, both get and set
 /// are not guaranteed to actually get or set.
 ///
+#[derive(Default)]
 pub struct Cache<K, V>
 where
     K: Eq + Hash + Clone,
@@ -32,9 +33,7 @@ where
     pub fn get_values(&self, keys: &[K]) -> Vec<Option<V>> {
         let mut lock = self.map.try_lock();
         if let Ok(ref mut cache) = lock {
-            keys.iter()
-                .map(|k| cache.get(k).map(|v| v.clone()))
-                .collect()
+            keys.iter().map(|k| cache.get(k).cloned()).collect()
         } else {
             keys.iter().map(|_| None).collect()
         }
diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs
index 6cb34fe1..835e6eea 100644
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -69,14 +69,14 @@ impl BPE {
                 continue;
             }
 
-            let parts = line.split(" ").collect::<Vec<_>>();
+            let parts = line.split(' ').collect::<Vec<_>>();
             let a = vocab
                 .get(parts[0])
-                .ok_or(Error::MergeTokenOutOfVocabulary(parts[0].to_owned()))?;
+                .ok_or_else(|| Error::MergeTokenOutOfVocabulary(parts[0].to_owned()))?;
             let b = vocab
                 .get(parts[1])
-                .ok_or(Error::MergeTokenOutOfVocabulary(parts[1].to_owned()))?;
+                .ok_or_else(|| Error::MergeTokenOutOfVocabulary(parts[1].to_owned()))?;
             let pair = (*a, *b);
             let new_token = format!("{}{}", parts[0], parts[1]);
             let new_id = vocab
@@ -101,7 +101,7 @@ impl Model for BPE {
     }
 
     fn tokenize(&self, sentence: Vec<String>) -> Result<Vec<Token>> {
-        if sentence.len() == 0 {
+        if sentence.is_empty() {
             return Ok(vec![]);
         }
 
@@ -109,7 +109,7 @@ impl Model for BPE {
         let mut cached_words = self.cache.get_values(&sentence);
 
         for (i, w) in sentence.iter().enumerate() {
-            if let None = cached_words[i] {
+            if cached_words[i].is_none() {
                 let mut word = Word::new();
                 for c in w.chars() {
                     match self.vocab.get(&c.to_string()) {
@@ -194,10 +194,10 @@ impl Model for BPE {
     }
 
     fn token_to_id(&self, token: &str) -> Option<u32> {
-        self.vocab.get(token).map(|id| *id)
+        self.vocab.get(token).copied()
     }
 
     fn id_to_token(&self, id: u32) -> Option<String> {
-        self.vocab_r.get(&id).map(|token| token.clone())
+        self.vocab_r.get(&id).cloned()
     }
 }
diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs
index c0e4d777..88769606 100644
--- a/tokenizers/src/models/bpe/trainer.rs
+++ b/tokenizers/src/models/bpe/trainer.rs
@@ -3,6 +3,8 @@
 //!
 //! In charge of training a BPE model
 //!
+#![allow(clippy::map_entry)]
+
 use super::{Pair, Word, BPE};
 use crate::tokenizer::{Model, Result, Trainer};
 use std::{
@@ -87,7 +89,7 @@ impl Trainer for BpeTrainer {
 
                 // Initialize pair_counts and where_to_update for this pair if we just saw it
                 if !pair_counts.contains_key(&cur_pair) {
-                    let pair = (0, cur_pair.clone());
+                    let pair = (0, cur_pair);
                     pair_counts.insert(cur_pair, pair);
                     if !where_to_update.contains_key(&cur_pair) {
                         where_to_update.insert(cur_pair, HashSet::new());
@@ -125,7 +127,7 @@ impl Trainer for BpeTrainer {
             // Find the best pair
             let mut best_count = 0;
             let mut best_pair = (std::u32::MAX, std::u32::MAX);
-            for (_, x) in &pair_counts {
+            for x in pair_counts.values() {
                 if x.0 > best_count {
                     best_count = x.0;
                     best_pair = x.1;
diff --git a/tokenizers/src/models/bpe/word.rs b/tokenizers/src/models/bpe/word.rs
index 7f91e6bf..541ddac5 100644
--- a/tokenizers/src/models/bpe/word.rs
+++ b/tokenizers/src/models/bpe/word.rs
@@ -1,7 +1,7 @@
 use super::Pair;
 
 // TODO: Add tests
-#[derive(Clone)]
+#[derive(Clone, Default)]
 pub struct Word {
     chars: Vec<u32>,
     sizes: Vec<usize>,
diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs
index 339d6ff1..cd62a104 100644
--- a/tokenizers/src/models/wordpiece/mod.rs
+++ b/tokenizers/src/models/wordpiece/mod.rs
@@ -151,11 +151,11 @@ impl Model for WordPiece {
     }
 
     fn token_to_id(&self, token: &str) -> Option<u32> {
-        self.vocab.get(token).map(|id| *id)
+        self.vocab.get(token).copied()
     }
 
     fn id_to_token(&self, id: u32) -> Option<String> {
-        self.vocab_r.get(&id).map(|token| token.clone())
+        self.vocab_r.get(&id).cloned()
     }
 }
 
diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index a3ed1232..832e0aa0 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -14,7 +14,7 @@ impl PreTokenizer for Whitespace {
             .iter()
             .map(|m| {
                 m.map(|capture| s[capture.start()..capture.end()].to_owned())
-                    .unwrap_or(String::from(""))
+                    .unwrap_or_else(|| String::from(""))
             })
             .collect()
         })
diff --git a/tokenizers/src/processors/bert.rs b/tokenizers/src/processors/bert.rs
index 21a9e9e9..1d97f109 100644
--- a/tokenizers/src/processors/bert.rs
+++ b/tokenizers/src/processors/bert.rs
@@ -83,13 +83,13 @@ impl PostProcessor for BertProcessing {
                     .map(|e| e.get_normalized())
                     .unwrap_or("")
             ),
-            [&ids[..], &pair_ids.unwrap_or(vec![])[..]].concat(),
-            [&type_ids[..], &pair_type_ids.unwrap_or(vec![])[..]].concat(),
-            [&tokens[..], &pair_tokens.unwrap_or(vec![])[..]].concat(),
-            [&offsets[..], &pair_offsets.unwrap_or(vec![])[..]].concat(),
+            [&ids[..], &pair_ids.unwrap_or_else(|| vec![])[..]].concat(),
+            [&type_ids[..], &pair_type_ids.unwrap_or_else(|| vec![])[..]].concat(),
+            [&tokens[..], &pair_tokens.unwrap_or_else(|| vec![])[..]].concat(),
+            [&offsets[..], &pair_offsets.unwrap_or_else(|| vec![])[..]].concat(),
             [
                 &special_tokens[..],
-                &pair_special_tokens.unwrap_or(vec![])[..],
+                &pair_special_tokens.unwrap_or_else(|| vec![])[..],
             ]
             .concat(),
             attention_mask,
diff --git a/tokenizers/src/tokenizer/encoding.rs b/tokenizers/src/tokenizer/encoding.rs
index a6e632a4..0cca9dbb 100644
--- a/tokenizers/src/tokenizer/encoding.rs
+++ b/tokenizers/src/tokenizer/encoding.rs
@@ -12,6 +12,7 @@ pub struct Encoding {
     overflowing: Option<Box<Encoding>>,
 }
 impl Encoding {
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         original: String,
         normalized: String,
@@ -118,8 +119,8 @@ impl Encoding {
     }
 
     pub fn merge_with(&mut self, pair: Encoding) {
-        self.original.extend(pair.original.chars());
-        self.normalized.extend(pair.normalized.chars());
+        self.original.push_str(&pair.original);
+        self.normalized.push_str(&pair.normalized);
         self.ids.extend(pair.ids);
         self.type_ids.extend(pair.type_ids);
         self.tokens.extend(pair.tokens);
@@ -142,12 +143,12 @@ impl Encoding {
 
 /// Prepend the `stride` last elements of the `previous` Vec to the current Vec
 // A new Vec is instantiated though.
-fn prepend_stride<T: Clone>(previous: &Vec<T>, current: Vec<T>, stride: usize) -> Vec<T> {
+fn prepend_stride<T: Clone>(previous: &[T], current: Vec<T>, stride: usize) -> Vec<T> {
     let prev = previous
         .iter()
         .rev()
         .take(stride)
-        .map(|v| v.clone())
+        .cloned()
         .rev()
         .collect::<Vec<_>>();
 
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index 32fcf029..afc77318 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -236,7 +236,7 @@ impl Tokenizer {
     }
 
     /// Train a model and replace our current Model, using the given Trainer
-    pub fn train(&mut self, trainer: &Box<dyn Trainer>, files: Vec<String>) -> Result<()> {
+    pub fn train(&mut self, trainer: &dyn Trainer, files: Vec<String>) -> Result<()> {
         let results = files
             .par_iter()
             .map(|file| -> Result<HashMap<String, u32>> {
@@ -284,7 +284,7 @@ impl Tokenizer {
         if let Some(normalizer) = &self.normalizer {
             normalizer.normalize(sentence)
         } else {
-            Ok(sentence.to_owned())
+            Ok(sentence)
         }
     }
 
diff --git a/tokenizers/src/utils.rs b/tokenizers/src/utils.rs
index aa5fb935..6f45ce5f 100644
--- a/tokenizers/src/utils.rs
+++ b/tokenizers/src/utils.rs
@@ -52,9 +52,9 @@ pub fn truncate_encodings(
             }
 
             encoding.truncate(encoding.get_ids().len() - n_first, stride);
-            pair_encoding
-                .as_mut()
-                .map(|encoding| encoding.truncate(encoding.get_ids().len() - n_second, stride));
+            if let Some(encoding) = pair_encoding.as_mut() {
+                encoding.truncate(encoding.get_ids().len() - n_second, stride);
+            }
         }
         TruncationStrategy::OnlyFirst | TruncationStrategy::OnlySecond => {
            let target = if strategy == TruncationStrategy::OnlyFirst {