Add clippy warnings + fix all of them

Anthony MOI
2019-12-13 17:52:31 -05:00
parent 24139d7324
commit 6b1028d550
11 changed files with 34 additions and 30 deletions

View File

@@ -1,3 +1,5 @@
+#![warn(clippy::all)]
+
 #[macro_use]
 extern crate lazy_static;
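For context, `#![warn(clippy::all)]` is a crate-level inner attribute: it turns every lint in clippy's default groups (correctness, style, complexity, perf) into a warning across the whole crate. A minimal standalone sketch, not from this repo, of how such a blanket warning combines with local opt-outs:

    // Warn on all default clippy lints crate-wide...
    #![warn(clippy::all)]

    // ...but allow a specific lint where the pattern is deliberate.
    #[allow(clippy::needless_return)]
    fn answer() -> u32 {
        return 42;
    }

    fn main() {
        println!("{}", answer());
    }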

View File

@@ -10,6 +10,7 @@ use std::sync::Mutex;
 /// The goal is clearly not the accuracy of the content, both get and set
 /// are not guaranteed to actually get or set.
 ///
+#[derive(Default)]
 pub struct Cache<K, V>
 where
     K: Eq + Hash + Clone,
@@ -32,9 +33,7 @@ where
     pub fn get_values(&self, keys: &[K]) -> Vec<Option<V>> {
         let mut lock = self.map.try_lock();
         if let Ok(ref mut cache) = lock {
-            keys.iter()
-                .map(|k| cache.get(k).map(|v| v.clone()))
-                .collect()
+            keys.iter().map(|k| cache.get(k).cloned()).collect()
         } else {
             keys.iter().map(|_| None).collect()
         }
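The `get_values` change fixes clippy::map_clone: on an `Option<&V>`, `.cloned()` says directly what `.map(|v| v.clone())` spells out. A minimal sketch with hypothetical key/value types, not the crate's Cache:

    use std::collections::HashMap;

    fn main() {
        let mut cache: HashMap<&str, String> = HashMap::new();
        cache.insert("hello", String::from("world"));

        // Before: clone through a closure.
        let a: Option<String> = cache.get("hello").map(|v| v.clone());
        // After: same result, stated directly.
        let b: Option<String> = cache.get("hello").cloned();
        assert_eq!(a, b);
    }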

View File

@@ -69,14 +69,14 @@ impl BPE {
                 continue;
             }

-            let parts = line.split(" ").collect::<Vec<_>>();
+            let parts = line.split(' ').collect::<Vec<_>>();
             let a = vocab
                 .get(parts[0])
-                .ok_or(Error::MergeTokenOutOfVocabulary(parts[0].to_owned()))?;
+                .ok_or_else(|| Error::MergeTokenOutOfVocabulary(parts[0].to_owned()))?;
             let b = vocab
                 .get(parts[1])
-                .ok_or(Error::MergeTokenOutOfVocabulary(parts[1].to_owned()))?;
+                .ok_or_else(|| Error::MergeTokenOutOfVocabulary(parts[1].to_owned()))?;

             let pair = (*a, *b);
             let new_token = format!("{}{}", parts[0], parts[1]);
             let new_id = vocab
@@ -101,7 +101,7 @@ impl Model for BPE {
     }

     fn tokenize(&self, sentence: Vec<String>) -> Result<Vec<Token>> {
-        if sentence.len() == 0 {
+        if sentence.is_empty() {
             return Ok(vec![]);
         }
@@ -109,7 +109,7 @@ impl Model for BPE {
         let mut cached_words = self.cache.get_values(&sentence);

         for (i, w) in sentence.iter().enumerate() {
-            if let None = cached_words[i] {
+            if cached_words[i].is_none() {
                 let mut word = Word::new();
                 for c in w.chars() {
                     match self.vocab.get(&c.to_string()) {
@@ -194,10 +194,10 @@ impl Model for BPE {
     }

     fn token_to_id(&self, token: &str) -> Option<u32> {
-        self.vocab.get(token).map(|id| *id)
+        self.vocab.get(token).copied()
     }

     fn id_to_token(&self, id: u32) -> Option<String> {
-        self.vocab_r.get(&id).map(|token| token.clone())
+        self.vocab_r.get(&id).cloned()
     }
 }
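Several distinct lints are fixed in this file: clippy::single_char_pattern (`split(' ')`), clippy::or_fun_call (`ok_or_else`), clippy::len_zero (`is_empty`), clippy::redundant_pattern_matching (`is_none`), and clippy::map_clone (`copied`/`cloned`). The `ok_or_else` one carries the only real behavioral subtlety: `ok_or` builds its error value even on the success path. A standalone sketch with a hypothetical String error type, not the crate's Error:

    use std::collections::HashMap;

    // With ok_or, the error String would be constructed even when the token
    // exists; ok_or_else defers that work to the None path only.
    fn lookup(vocab: &HashMap<String, u32>, token: &str) -> Result<u32, String> {
        vocab
            .get(token)
            .copied() // the map_clone fix, for an Option<&u32>
            .ok_or_else(|| format!("token out of vocabulary: {}", token))
    }

    fn main() {
        let mut vocab = HashMap::new();
        vocab.insert(String::from("hello"), 0);
        assert_eq!(lookup(&vocab, "hello"), Ok(0));
        assert!(lookup(&vocab, "bye").is_err());
    }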

View File

@@ -3,6 +3,8 @@
 //!
 //! In charge of training a BPE model
 //!
+#![allow(clippy::map_entry)]
+
 use super::{Pair, Word, BPE};
 use crate::tokenizer::{Model, Result, Trainer};
 use std::{
@@ -87,7 +89,7 @@ impl Trainer for BpeTrainer {
             // Initialize pair_counts and where_to_update for this pair if we just saw it
             if !pair_counts.contains_key(&cur_pair) {
-                let pair = (0, cur_pair.clone());
+                let pair = (0, cur_pair);
                 pair_counts.insert(cur_pair, pair);
                 if !where_to_update.contains_key(&cur_pair) {
                     where_to_update.insert(cur_pair, HashSet::new());
@@ -125,7 +127,7 @@ impl Trainer for BpeTrainer {
             // Find the best pair
             let mut best_count = 0;
             let mut best_pair = (std::u32::MAX, std::u32::MAX);
-            for (_, x) in &pair_counts {
+            for x in pair_counts.values() {
                 if x.0 > best_count {
                     best_count = x.0;
                     best_pair = x.1;
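The `#![allow(clippy::map_entry)]` added at the top of this file opts out of the lint that would otherwise flag the contains_key/insert pairs above; the commit keeps the original form rather than restructuring. For reference, a sketch of the entry-API form the lint suggests, using the same types as the hunk but standalone:

    use std::collections::{HashMap, HashSet};

    fn main() {
        let mut where_to_update: HashMap<(u32, u32), HashSet<usize>> = HashMap::new();
        let cur_pair = (1, 2);

        // The flagged pattern: two hash lookups.
        if !where_to_update.contains_key(&cur_pair) {
            where_to_update.insert(cur_pair, HashSet::new());
        }

        // What clippy::map_entry suggests: one lookup via the entry API.
        where_to_update.entry(cur_pair).or_insert_with(HashSet::new);
    }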

View File

@@ -1,7 +1,7 @@
 use super::Pair;

 // TODO: Add tests
-#[derive(Clone)]
+#[derive(Clone, Default)]
 pub struct Word {
     chars: Vec<u32>,
     sizes: Vec<usize>,
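Deriving Default works here because both fields are Vecs, which are themselves Default; most likely this silences clippy::new_without_default for `Word::new()`. A sketch with the struct trimmed to the two fields visible in this hunk:

    #[derive(Clone, Default)]
    pub struct Word {
        chars: Vec<u32>,
        sizes: Vec<usize>,
    }

    fn main() {
        // Default gives the same empty value a hand-written new() would.
        let w = Word::default();
        assert!(w.chars.is_empty() && w.sizes.is_empty());
    }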

View File

@@ -151,11 +151,11 @@ impl Model for WordPiece {
     }

     fn token_to_id(&self, token: &str) -> Option<u32> {
-        self.vocab.get(token).map(|id| *id)
+        self.vocab.get(token).copied()
     }

     fn id_to_token(&self, id: u32) -> Option<String> {
-        self.vocab_r.get(&id).map(|token| token.clone())
+        self.vocab_r.get(&id).cloned()
     }
 }

View File

@@ -14,7 +14,7 @@ impl PreTokenizer for Whitespace {
            .iter()
            .map(|m| {
                m.map(|capture| s[capture.start()..capture.end()].to_owned())
-                    .unwrap_or(String::from(""))
+                    .unwrap_or_else(|| String::from(""))
            })
            .collect()
        })
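Same or_fun_call fix as in the BPE model: the fallback String is now only constructed when the match is None. A minimal sketch; note that `unwrap_or_default()` would be an equivalent, even shorter spelling here:

    fn main() {
        let m: Option<&str> = None;

        let eager = m.map(|s| s.to_owned()).unwrap_or(String::from("")); // fallback built up front
        let lazy = m.map(|s| s.to_owned()).unwrap_or_else(String::new);  // built only on None
        let dflt = m.map(|s| s.to_owned()).unwrap_or_default();          // same, tersest

        assert_eq!(eager, lazy);
        assert_eq!(lazy, dflt);
    }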

View File

@@ -83,13 +83,13 @@ impl PostProcessor for BertProcessing {
                    .map(|e| e.get_normalized())
                    .unwrap_or("")
            ),
-            [&ids[..], &pair_ids.unwrap_or(vec![])[..]].concat(),
-            [&type_ids[..], &pair_type_ids.unwrap_or(vec![])[..]].concat(),
-            [&tokens[..], &pair_tokens.unwrap_or(vec![])[..]].concat(),
-            [&offsets[..], &pair_offsets.unwrap_or(vec![])[..]].concat(),
+            [&ids[..], &pair_ids.unwrap_or_else(|| vec![])[..]].concat(),
+            [&type_ids[..], &pair_type_ids.unwrap_or_else(|| vec![])[..]].concat(),
+            [&tokens[..], &pair_tokens.unwrap_or_else(|| vec![])[..]].concat(),
+            [&offsets[..], &pair_offsets.unwrap_or_else(|| vec![])[..]].concat(),
             [
                 &special_tokens[..],
-                &pair_special_tokens.unwrap_or(vec![])[..],
+                &pair_special_tokens.unwrap_or_else(|| vec![])[..],
             ]
             .concat(),
             attention_mask,
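The same lint again, this time for Vec fallbacks. Since `vec![]` is just `Vec::default()`, a sketch of an equivalent shorter form:

    fn main() {
        let pair_ids: Option<Vec<u32>> = None;

        let a: Vec<u32> = pair_ids.clone().unwrap_or_else(|| vec![]); // as in the commit
        let b: Vec<u32> = pair_ids.unwrap_or_default();               // equivalent spelling

        assert_eq!(a, b);
    }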

View File

@@ -12,6 +12,7 @@ pub struct Encoding {
     overflowing: Option<Box<Encoding>>,
 }

 impl Encoding {
+    #[allow(clippy::too_many_arguments)]
     pub fn new(
         original: String,
         normalized: String,
@@ -118,8 +119,8 @@ impl Encoding {
     }

     pub fn merge_with(&mut self, pair: Encoding) {
-        self.original.extend(pair.original.chars());
-        self.normalized.extend(pair.normalized.chars());
+        self.original.push_str(&pair.original);
+        self.normalized.push_str(&pair.normalized);
         self.ids.extend(pair.ids);
         self.type_ids.extend(pair.type_ids);
         self.tokens.extend(pair.tokens);
@@ -142,12 +143,12 @@ impl Encoding {
 /// Prepend the `stride` last elements of the `previous` Vec to the current Vec
 // A new Vec is instantiated though.
-fn prepend_stride<T: Clone>(previous: &Vec<T>, current: Vec<T>, stride: usize) -> Vec<T> {
+fn prepend_stride<T: Clone>(previous: &[T], current: Vec<T>, stride: usize) -> Vec<T> {
     let prev = previous
         .iter()
         .rev()
         .take(stride)
-        .map(|v| v.clone())
+        .cloned()
         .rev()
         .collect::<Vec<_>>();
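Two fixes here: `push_str` replaces the char-by-char `extend` (same result, appends the whole string at once), and the `&Vec<T>` parameter becomes `&[T]`, which is what clippy::ptr_arg asks for. A sketch of why the slice signature is strictly more general:

    // A &[T] parameter accepts borrows of Vecs, arrays, and slices alike;
    // &Vec<T> would force every caller to have an actual Vec.
    fn head<T: Clone>(items: &[T], n: usize) -> Vec<T> {
        items.iter().take(n).cloned().collect()
    }

    fn main() {
        let v = vec![1, 2, 3, 4];
        assert_eq!(head(&v, 2), vec![1, 2]);   // &Vec<i32> coerces to &[i32]
        assert_eq!(head(&[5, 6], 1), vec![5]); // an array borrow works too
    }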

View File

@@ -236,7 +236,7 @@ impl Tokenizer {
     }

     /// Train a model and replace our current Model, using the given Trainer
-    pub fn train(&mut self, trainer: &Box<dyn Trainer>, files: Vec<String>) -> Result<()> {
+    pub fn train(&mut self, trainer: &dyn Trainer, files: Vec<String>) -> Result<()> {
         let results = files
             .par_iter()
             .map(|file| -> Result<HashMap<String, u32>> {
@@ -284,7 +284,7 @@ impl Tokenizer {
         if let Some(normalizer) = &self.normalizer {
             normalizer.normalize(sentence)
         } else {
-            Ok(sentence.to_owned())
+            Ok(sentence)
         }
     }
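`&Box<dyn Trainer>` to `&dyn Trainer` is the clippy::borrowed_box fix: borrowing through the Box adds a pointless second indirection and forces callers to own a Box. The second hunk just drops a needless re-allocation of an already-owned String. A standalone sketch of the signature change, with a toy Trainer trait rather than the crate's:

    trait Trainer {
        fn name(&self) -> &str;
    }

    struct BpeTrainer;
    impl Trainer for BpeTrainer {
        fn name(&self) -> &str {
            "bpe"
        }
    }

    // Accepts a borrowed trait object, boxed or not.
    fn train(trainer: &dyn Trainer) {
        println!("training with {}", trainer.name());
    }

    fn main() {
        let boxed: Box<dyn Trainer> = Box::new(BpeTrainer);
        train(boxed.as_ref()); // from a Box, via as_ref() or &*boxed
        train(&BpeTrainer);    // or from a plain value
    }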

View File

@@ -52,9 +52,9 @@ pub fn truncate_encodings(
             }

             encoding.truncate(encoding.get_ids().len() - n_first, stride);
-            pair_encoding
-                .as_mut()
-                .map(|encoding| encoding.truncate(encoding.get_ids().len() - n_second, stride));
+            if let Some(encoding) = pair_encoding.as_mut() {
+                encoding.truncate(encoding.get_ids().len() - n_second, stride);
+            }
         }
         TruncationStrategy::OnlyFirst | TruncationStrategy::OnlySecond => {
             let target = if strategy == TruncationStrategy::OnlyFirst {
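This last fix replaces `Option::map` used purely for its side effect with `if let Some`, per clippy::option_map_unit_fn: `map` signals a value transformation, so a unit-returning closure inside it reads as a mistake. A minimal sketch:

    fn main() {
        let mut pair: Option<Vec<u32>> = Some(vec![1, 2, 3, 4]);

        // Flagged form: produces an Option<()> nobody reads.
        // pair.as_mut().map(|v| v.truncate(2));

        // Preferred form: the side effect is explicit.
        if let Some(v) = pair.as_mut() {
            v.truncate(2);
        }

        assert_eq!(pair, Some(vec![1, 2]));
    }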