Updating code according to clippy. (#1048)

- Adding `Eq` where possible
- Allowed (silenced) the `borrow_deref_ref` lint crate-wide, as it was spamming and the suggested fix was not really better.
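
For illustration, a minimal, self-contained sketch of the `Eq` pattern applied throughout this diff. The type and field names below are hypothetical; only the derive attributes mirror the actual changes.

```rust
use std::collections::HashMap;

// `Eq` can be derived alongside `PartialEq` whenever every field is itself `Eq`
// (integers, Strings, HashMaps of such types, ...). Clippy's
// `derive_partial_eq_without_eq` lint points at exactly these cases.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct VocabLike {
    // Hypothetical fields, mirroring the u32/String/HashMap fields touched in this commit.
    pub min_frequency: u32,
    pub vocab: HashMap<String, u32>,
}

fn main() {
    let a = VocabLike {
        min_frequency: 2,
        vocab: HashMap::new(),
    };
    // With `Eq`, equality on this type is a total equivalence relation
    // (no NaN-style "partially equal" values can exist).
    assert_eq!(a, a.clone());
}
```

Types holding an `f32`/`f64` (or any other non-`Eq` field) cannot take this derive, which is why `Eq` is only added "where possible".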
Nicolas Patry committed 2022-08-24 19:45:15 +02:00 (committed by GitHub)
parent 67c56adf68, commit b1c9bc68b5
25 changed files with 50 additions and 46 deletions

View File

@@ -847,18 +847,19 @@ dependencies = [
 [[package]]
 name = "macro_rules_attribute"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "641c64af6cd80b81cf9c2f2f6ee382b1050c71ce63e20800499971a4a4195005"
+checksum = "258c86475e1616d6f2d8f5227cfaabd3dae1f6d5388b9597df8a199d4497aba7"
 dependencies = [
  "macro_rules_attribute-proc_macro",
  "paste",
 ]
 [[package]]
 name = "macro_rules_attribute-proc_macro"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb246ada5a8c47b8b6e90c9f9a0f84f294939cdf558f1bc8d17fbb30f9706598"
+checksum = "f26a8d2502d5aa4d411ef494ba7470eb299f05725179ce3b5de77aa01a9ffdea"
 [[package]]
 name = "matches"
@@ -1182,9 +1183,9 @@ dependencies = [
 [[package]]
 name = "paste"
-version = "1.0.6"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
+checksum = "9423e2b32f7a043629287a536f21951e8c6a82482d0acb1eeebfc90bc2225b22"
 [[package]]
 name = "percent-encoding"

View File

@@ -836,18 +836,19 @@ dependencies = [
 [[package]]
 name = "macro_rules_attribute"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "641c64af6cd80b81cf9c2f2f6ee382b1050c71ce63e20800499971a4a4195005"
+checksum = "258c86475e1616d6f2d8f5227cfaabd3dae1f6d5388b9597df8a199d4497aba7"
 dependencies = [
  "macro_rules_attribute-proc_macro",
  "paste",
 ]
 [[package]]
 name = "macro_rules_attribute-proc_macro"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb246ada5a8c47b8b6e90c9f9a0f84f294939cdf558f1bc8d17fbb30f9706598"
+checksum = "f26a8d2502d5aa4d411ef494ba7470eb299f05725179ce3b5de77aa01a9ffdea"
 [[package]]
 name = "matches"
@@ -1154,9 +1155,9 @@ dependencies = [
 [[package]]
 name = "paste"
-version = "1.0.6"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
+checksum = "9423e2b32f7a043629287a536f21951e8c6a82482d0acb1eeebfc90bc2225b22"
 [[package]]
 name = "percent-encoding"

View File

@@ -1,5 +1,7 @@
 #![warn(clippy::all)]
 #![allow(clippy::upper_case_acronyms)]
+// Many false positives with pyo3 it seems &str, and &PyAny get flagged
+#![allow(clippy::borrow_deref_ref)]
 extern crate tokenizers as tk;
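
For context, a hedged illustration (not taken from this repository) of the kind of re-borrow `clippy::borrow_deref_ref` flags. Per the comment above, `&str` and `&PyAny` arguments in the pyo3 bindings apparently trigger many such false positives, so the lint is allowed once at the crate root rather than "fixed" at every site.

```rust
#![allow(clippy::borrow_deref_ref)] // same crate-level allow as in the diff above

// Hypothetical example: `&*text` re-borrows through a deref even though `text`
// is already a `&str`. Without the allow, clippy suggests using `text` directly;
// in macro-generated binding code this pattern shows up repeatedly.
fn count_chars(text: &str) -> usize {
    let reborrowed: &str = &*text;
    reborrowed.chars().count()
}

fn main() {
    assert_eq!(count_chars("tokenizers"), 10);
}
```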

View File

@@ -164,7 +164,7 @@ impl BpeTrainerBuilder {
 /// let special_tokens = trainer.train(&mut model).unwrap();
 /// ```
 #[non_exhaustive]
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub struct BpeTrainer {
     /// The minimum frequency a pair must have to produce a merge operation
     pub min_frequency: u32,

View File

@@ -94,7 +94,7 @@ impl WordLevelBuilder {
     }
 }
-#[derive(PartialEq, Clone)]
+#[derive(PartialEq, Clone, Eq)]
 pub struct WordLevel {
     vocab: HashMap<String, u32>,
     vocab_r: HashMap<u32, String>,

View File

@@ -119,7 +119,7 @@ impl WordPieceBuilder {
 /// A
 /// [WordPiece](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/37842.pdf)
 /// model.
-#[derive(Clone, PartialEq)]
+#[derive(Clone, PartialEq, Eq)]
 pub struct WordPiece {
     vocab: Vocab,
     vocab_r: VocabR,

View File

@@ -3,7 +3,7 @@ use crate::utils::SysRegex;
 use serde::{Deserialize, Serialize};
 /// Represents the different patterns that `Replace` can use
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum ReplacePattern {
     String(String),
     Regex(String),

View File

@@ -6,7 +6,7 @@ fn is_bert_punc(x: char) -> bool {
     char::is_ascii_punctuation(&x) || x.is_punctuation()
 }
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct BertPreTokenizer;

View File

@@ -42,7 +42,7 @@ lazy_static! {
         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
 }
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
 /// of all the required processing steps to transform a UTF-8 string as needed before and after the
 /// BPE model does its job.

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 use crate::utils::macro_rules_attribute;
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[non_exhaustive]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct CharDelimiterSplit {

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 use crate::utils::macro_rules_attribute;
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 /// Pre tokenizes the numbers into single tokens. If individual_digits is set
 /// to true, then all digits are splitted into individual tokens.
 #[non_exhaustive]

View File

@@ -2,7 +2,7 @@ use serde::{Deserialize, Deserializer, Serialize};
 use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
-#[derive(Debug, Clone, PartialEq, Serialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Eq)]
 /// Replaces all the whitespaces by the provided meta character and then
 /// splits on this character
 #[serde(tag = "type")]

View File

@@ -8,7 +8,7 @@ fn is_punc(x: char) -> bool {
     char::is_ascii_punctuation(&x) || x.is_punctuation()
 }
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct Punctuation {
     #[serde(default = "default_split")]

View File

@@ -6,7 +6,7 @@ use crate::tokenizer::{
 };
 /// Represents the different patterns that `Split` can use
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum SplitPattern {
     String(String),
     Regex(String),

View File

@@ -2,7 +2,7 @@ use crate::pre_tokenizers::unicode_scripts::scripts::{get_script, Script};
 use crate::tokenizer::{normalizer::Range, PreTokenizedString, PreTokenizer, Result};
 use crate::utils::macro_rules_attribute;
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct UnicodeScripts;

View File

@@ -2,7 +2,7 @@
 // Unicode scripts : https://gist.github.com/Narsil/07556f26dc84a6baeff4d499e68d3cd2
 // Rust adaptation : https://gist.github.com/Narsil/1df9fbbf5296a8d4d62de55dcb2fe700
-#[derive(PartialEq, Debug, Clone, Copy)]
+#[derive(PartialEq, Debug, Clone, Copy, Eq)]
 pub enum Script {
     Any,
     Adlam,

View File

@@ -5,7 +5,7 @@ use crate::tokenizer::{
 };
 use crate::utils::macro_rules_attribute;
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct Whitespace;
@@ -28,7 +28,7 @@ impl PreTokenizer for Whitespace {
     }
 }
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct WhitespaceSplit;

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::iter::FromIterator;
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
 #[serde(tag = "type")]
 pub struct BertProcessing {
     sep: (String, u32),

View File

@@ -13,7 +13,7 @@ use crate::processors::roberta::RobertaProcessing;
 use crate::processors::template::TemplateProcessing;
 use crate::{Encoding, PostProcessor, Result};
-#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)]
+#[derive(Serialize, Deserialize, PartialEq, Debug, Clone, Eq)]
 #[serde(untagged)]
 pub enum PostProcessorWrapper {
     // Roberta must be before Bert for deserialization (serde does not validate tags)

View File

@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::iter::FromIterator;
-#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
 #[serde(tag = "type")]
 pub struct RobertaProcessing {
     sep: (String, u32),

View File

@@ -63,7 +63,7 @@ use std::convert::{TryFrom, TryInto};
 use std::result::Result as StdResult;
 /// Represents both sequences received as input of the PostProcessor
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum Sequence {
     /// This is the first sequence, the one that is always specified
     A,
@@ -91,7 +91,7 @@ pub enum Sequence {
 ///
 /// [`SpecialToken`]: struct.SpecialToken.html
 ///
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum Piece {
     Sequence { id: Sequence, type_id: u32 },
     SpecialToken { id: String, type_id: u32 },
@@ -188,7 +188,7 @@ impl TryFrom<&str> for Piece {
 /// vec!["A".into(), "complex".into(), "special".into(), "token".into(), ":".into()]
 /// ).unwrap();
 /// ```
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub struct SpecialToken {
     /// A unique id used to identify this SpecialToken in the template
     id: String,
@@ -249,7 +249,7 @@ impl SpecialToken {
 ///
 /// [`Piece`]: enum.Piece.html
 ///
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 #[serde(transparent)]
 pub struct Template(Vec<Piece>);
@@ -289,7 +289,7 @@ impl TryFrom<&str> for Template {
 /// from a HashMap or a Vec<[`SpecialToken`]>.
 ///
 /// [`SpecialToken`]: struct.SpecialToken.html
-#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize, Eq)]
 #[serde(transparent)]
 pub struct Tokens(
     #[serde(serialize_with = "crate::utils::ordered_map")] pub HashMap<String, SpecialToken>,
@@ -332,7 +332,7 @@ impl From<HashMap<String, SpecialToken>> for Tokens {
 /// .unwrap();
 /// ```
 ///
-#[derive(Debug, Clone, PartialEq, Builder, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Builder, Serialize, Deserialize, Eq)]
 #[serde(tag = "type", from = "TemplateProcessingDeserializer")]
 #[builder(build_fn(validate = "Self::validate"))]
 pub struct TemplateProcessing {

View File

@@ -146,7 +146,7 @@ pub trait Trainer {
         F: Fn(&str) -> Result<Vec<String>> + Sync;
 }
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Token {
     pub id: u32,
     pub value: String,

View File

@@ -19,7 +19,7 @@ macro_rules! apply_signed {
 }
 /// The possible offsets referential
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum OffsetReferential {
     Original,
     Normalized,
@@ -27,7 +27,7 @@ pub enum OffsetReferential {
 /// Represents a Range usable by the NormalizedString to index its content.
 /// A Range can use indices relative to either the `Original` or the `Normalized` string
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub enum Range<T: RangeBounds<usize> + Clone> {
     Original(T),
     Normalized(T),
@@ -91,7 +91,7 @@ where
 /// - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]`
 /// - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]`
 /// - Contiguous => `[ "the", "-", "final", "--", "countdown" ]`
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum SplitDelimiterBehavior {
     Removed,
     Isolated,
@@ -108,7 +108,7 @@ pub enum SplitDelimiterBehavior {
 /// It is possible to retrieve a part of the original string, by indexing it with
 /// offsets from the normalized one, and the other way around too. It is also
 /// possible to convert offsets from one referential to the other one easily.
-#[derive(Default, Debug, Clone, PartialEq)]
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
 pub struct NormalizedString {
     /// The original version of the string, before any modification
     original: String,

View File

@@ -4,7 +4,7 @@ use crate::{
 use std::collections::HashMap;
 /// Various possible types of offsets
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum OffsetType {
     Byte,
     Char,
@@ -15,7 +15,7 @@ pub enum OffsetType {
 /// This Split contains the underlying `NormalizedString` as well as its offsets
 /// in the original string. These offsets are in the `original` referential.
 /// It also contains any `Token` associated to the current split
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Split {
     /// The underlying `NormalizedString`. Each SubString is represented by a `NormalizedString`
     /// and in the end we might be carrying a lot of SubString representing various parts of the
@@ -49,7 +49,7 @@ impl From<(NormalizedString, Option<Vec<Token>>)> for Split {
 /// Once everything has been normalized and tokenized, the `PreTokenizedString` is able
 /// to build an `Encoding` with all the relevant offsets and word ids, relative to the
 /// original string.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct PreTokenizedString {
     original: String,
     splits: Vec<Split>,

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::cmp;
 use std::mem;
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum TruncationDirection {
     Left,
     Right,
@@ -53,7 +53,7 @@ pub enum TruncationError {
     SequenceTooShort,
 }
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum TruncationStrategy {
     LongestFirst,
     OnlyFirst,