Updating code according to clippy. (#1048)

- Adding `Eq` where possible (the pattern is sketched below, after the file
  stats)
- Allowing the `borrow_deref_ref` lint rather than fixing each warning, as it
  was spamming and the suggested fix was not really better (sketched below,
  after the `#![allow]` diff)
Nicolas Patry
2022-08-24 19:45:15 +02:00
committed by GitHub
parent 67c56adf68
commit b1c9bc68b5
25 changed files with 50 additions and 46 deletions
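
The `Eq` additions throughout this diff follow clippy's
`derive_partial_eq_without_eq` lint: when every field of a type already has
total equality, `Eq` can be derived next to `PartialEq` at no extra cost. A
minimal sketch of the pattern: the first struct mirrors the `Token` fields
visible in this diff, while `Scored` is a hypothetical counter-example showing
why the commit says "where possible".

    // All fields (u32, String) implement Eq, so Eq can be derived.
    // This is exactly what clippy's derive_partial_eq_without_eq suggests.
    #[derive(Debug, Clone, PartialEq, Eq)]
    pub struct Token {
        pub id: u32,
        pub value: String,
    }

    // Hypothetical counter-example: f64 is only PartialEq (NaN != NaN),
    // so Eq cannot be derived here and clippy stays silent.
    #[derive(Debug, Clone, PartialEq)]
    pub struct Scored {
        pub score: f64,
    }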

View File

@@ -847,18 +847,19 @@ dependencies = [
 
 [[package]]
 name = "macro_rules_attribute"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "641c64af6cd80b81cf9c2f2f6ee382b1050c71ce63e20800499971a4a4195005"
+checksum = "258c86475e1616d6f2d8f5227cfaabd3dae1f6d5388b9597df8a199d4497aba7"
 dependencies = [
  "macro_rules_attribute-proc_macro",
+ "paste",
 ]
 
 [[package]]
 name = "macro_rules_attribute-proc_macro"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb246ada5a8c47b8b6e90c9f9a0f84f294939cdf558f1bc8d17fbb30f9706598"
+checksum = "f26a8d2502d5aa4d411ef494ba7470eb299f05725179ce3b5de77aa01a9ffdea"
 
 [[package]]
 name = "matches"
@@ -1182,9 +1183,9 @@ dependencies = [
 
 [[package]]
 name = "paste"
-version = "1.0.6"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
+checksum = "9423e2b32f7a043629287a536f21951e8c6a82482d0acb1eeebfc90bc2225b22"
 
 [[package]]
 name = "percent-encoding"

View File

@@ -836,18 +836,19 @@ dependencies = [
 
 [[package]]
 name = "macro_rules_attribute"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "641c64af6cd80b81cf9c2f2f6ee382b1050c71ce63e20800499971a4a4195005"
+checksum = "258c86475e1616d6f2d8f5227cfaabd3dae1f6d5388b9597df8a199d4497aba7"
 dependencies = [
  "macro_rules_attribute-proc_macro",
+ "paste",
 ]
 
 [[package]]
 name = "macro_rules_attribute-proc_macro"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb246ada5a8c47b8b6e90c9f9a0f84f294939cdf558f1bc8d17fbb30f9706598"
+checksum = "f26a8d2502d5aa4d411ef494ba7470eb299f05725179ce3b5de77aa01a9ffdea"
 
 [[package]]
 name = "matches"
@@ -1154,9 +1155,9 @@ dependencies = [
 
 [[package]]
 name = "paste"
-version = "1.0.6"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
+checksum = "9423e2b32f7a043629287a536f21951e8c6a82482d0acb1eeebfc90bc2225b22"
 
 [[package]]
 name = "percent-encoding"

View File

@@ -1,5 +1,7 @@
 #![warn(clippy::all)]
 #![allow(clippy::upper_case_acronyms)]
+// Many false positives with pyo3 it seems &str, and &PyAny get flagged
+#![allow(clippy::borrow_deref_ref)]
 
 extern crate tokenizers as tk;
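
For context on the new `allow`: `clippy::borrow_deref_ref` flags `&*expr`
when `expr` is already a reference, since the deref-then-reborrow is usually
a no-op. Per the comment above, pyo3's macro-generated handling of `&str` and
`&PyAny` arguments triggered many false positives where the suggested removal
does not apply, so the lint is silenced crate-wide instead of patched per
site. A minimal sketch of what the lint normally targets (names are
illustrative, not from the repository):

    #![allow(clippy::borrow_deref_ref)] // mirrors the attribute added above

    fn main() {
        let owned = String::from("tokenizers");
        let r: &String = &owned;
        // Without the allow, clippy flags `&*r`: dereferencing and
        // immediately reborrowing yields the same `&String`, so plain `r`
        // would do. pyo3's generated glue contains similar-looking reborrows
        // that are not removable, hence the blanket allow.
        let reborrowed: &String = &*r;
        println!("{}", reborrowed.len());
    }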

View File

@@ -164,7 +164,7 @@ impl BpeTrainerBuilder {
 /// let special_tokens = trainer.train(&mut model).unwrap();
 /// ```
 #[non_exhaustive]
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub struct BpeTrainer {
     /// The minimum frequency a pair must have to produce a merge operation
     pub min_frequency: u32,

View File

@@ -94,7 +94,7 @@ impl WordLevelBuilder {
     }
 }
 
-#[derive(PartialEq, Clone)]
+#[derive(PartialEq, Clone, Eq)]
 pub struct WordLevel {
     vocab: HashMap<String, u32>,
     vocab_r: HashMap<u32, String>,

View File

@@ -119,7 +119,7 @@ impl WordPieceBuilder {
 /// A
 /// [WordPiece](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/37842.pdf)
 /// model.
-#[derive(Clone, PartialEq)]
+#[derive(Clone, PartialEq, Eq)]
 pub struct WordPiece {
     vocab: Vocab,
     vocab_r: VocabR,

View File

@@ -3,7 +3,7 @@ use crate::utils::SysRegex;
 use serde::{Deserialize, Serialize};
 
 /// Represents the different patterns that `Replace` can use
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum ReplacePattern {
     String(String),
     Regex(String),

View File

@@ -6,7 +6,7 @@ fn is_bert_punc(x: char) -> bool {
     char::is_ascii_punctuation(&x) || x.is_punctuation()
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct BertPreTokenizer;

View File

@@ -42,7 +42,7 @@ lazy_static! {
         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
 /// of all the required processing steps to transform a UTF-8 string as needed before and after the
 /// BPE model does its job.

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 use crate::utils::macro_rules_attribute;
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[non_exhaustive]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct CharDelimiterSplit {

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 use crate::utils::macro_rules_attribute;
 
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 /// Pre tokenizes the numbers into single tokens. If individual_digits is set
 /// to true, then all digits are splitted into individual tokens.
 #[non_exhaustive]

View File

@@ -2,7 +2,7 @@ use serde::{Deserialize, Deserializer, Serialize};
 
 use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 
-#[derive(Debug, Clone, PartialEq, Serialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Eq)]
 /// Replaces all the whitespaces by the provided meta character and then
 /// splits on this character
 #[serde(tag = "type")]

View File

@@ -8,7 +8,7 @@ fn is_punc(x: char) -> bool {
     char::is_ascii_punctuation(&x) || x.is_punctuation()
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct Punctuation {
     #[serde(default = "default_split")]

View File

@@ -6,7 +6,7 @@ use crate::tokenizer::{
 };
 
 /// Represents the different patterns that `Split` can use
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum SplitPattern {
     String(String),
     Regex(String),

View File

@@ -2,7 +2,7 @@ use crate::pre_tokenizers::unicode_scripts::scripts::{get_script, Script};
 use crate::tokenizer::{normalizer::Range, PreTokenizedString, PreTokenizer, Result};
 use crate::utils::macro_rules_attribute;
 
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct UnicodeScripts;

View File

@@ -2,7 +2,7 @@
 // Unicode scripts : https://gist.github.com/Narsil/07556f26dc84a6baeff4d499e68d3cd2
 // Rust adaptation : https://gist.github.com/Narsil/1df9fbbf5296a8d4d62de55dcb2fe700
 
-#[derive(PartialEq, Debug, Clone, Copy)]
+#[derive(PartialEq, Debug, Clone, Copy, Eq)]
 pub enum Script {
     Any,
     Adlam,

View File

@@ -5,7 +5,7 @@ use crate::tokenizer::{
 };
 use crate::utils::macro_rules_attribute;
 
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct Whitespace;
@@ -28,7 +28,7 @@ impl PreTokenizer for Whitespace {
     }
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct WhitespaceSplit;

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::iter::FromIterator;
 
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
 #[serde(tag = "type")]
 pub struct BertProcessing {
     sep: (String, u32),

View File

@@ -13,7 +13,7 @@ use crate::processors::roberta::RobertaProcessing;
 use crate::processors::template::TemplateProcessing;
 use crate::{Encoding, PostProcessor, Result};
 
-#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)]
+#[derive(Serialize, Deserialize, PartialEq, Debug, Clone, Eq)]
 #[serde(untagged)]
 pub enum PostProcessorWrapper {
     // Roberta must be before Bert for deserialization (serde does not validate tags)

View File

@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::iter::FromIterator;
 
-#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
 #[serde(tag = "type")]
 pub struct RobertaProcessing {
     sep: (String, u32),

View File

@@ -63,7 +63,7 @@ use std::convert::{TryFrom, TryInto};
 use std::result::Result as StdResult;
 
 /// Represents both sequences received as input of the PostProcessor
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum Sequence {
     /// This is the first sequence, the one that is always specified
     A,
@@ -91,7 +91,7 @@ pub enum Sequence {
 ///
 /// [`SpecialToken`]: struct.SpecialToken.html
 ///
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum Piece {
     Sequence { id: Sequence, type_id: u32 },
     SpecialToken { id: String, type_id: u32 },
@@ -188,7 +188,7 @@ impl TryFrom<&str> for Piece {
 ///     vec!["A".into(), "complex".into(), "special".into(), "token".into(), ":".into()]
 /// ).unwrap();
 /// ```
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub struct SpecialToken {
     /// A unique id used to identify this SpecialToken in the template
     id: String,
@@ -249,7 +249,7 @@ impl SpecialToken {
 ///
 /// [`Piece`]: enum.Piece.html
 ///
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 #[serde(transparent)]
 pub struct Template(Vec<Piece>);
@@ -289,7 +289,7 @@ impl TryFrom<&str> for Template {
 /// from a HashMap or a Vec<[`SpecialToken`]>.
 ///
 /// [`SpecialToken`]: struct.SpecialToken.html
-#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize, Eq)]
 #[serde(transparent)]
 pub struct Tokens(
     #[serde(serialize_with = "crate::utils::ordered_map")] pub HashMap<String, SpecialToken>,
@@ -332,7 +332,7 @@ impl From<HashMap<String, SpecialToken>> for Tokens {
 ///     .unwrap();
 /// ```
 ///
-#[derive(Debug, Clone, PartialEq, Builder, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Builder, Serialize, Deserialize, Eq)]
 #[serde(tag = "type", from = "TemplateProcessingDeserializer")]
 #[builder(build_fn(validate = "Self::validate"))]
 pub struct TemplateProcessing {

View File

@@ -146,7 +146,7 @@ pub trait Trainer {
         F: Fn(&str) -> Result<Vec<String>> + Sync;
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Token {
     pub id: u32,
     pub value: String,

View File

@@ -19,7 +19,7 @@ macro_rules! apply_signed {
 }
 
 /// The possible offsets referential
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum OffsetReferential {
     Original,
     Normalized,
@@ -27,7 +27,7 @@ pub enum OffsetReferential {
 
 /// Represents a Range usable by the NormalizedString to index its content.
 /// A Range can use indices relative to either the `Original` or the `Normalized` string
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub enum Range<T: RangeBounds<usize> + Clone> {
     Original(T),
     Normalized(T),
@@ -91,7 +91,7 @@ where
 /// - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]`
 /// - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]`
 /// - Contiguous => `[ "the", "-", "final", "--", "countdown" ]`
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum SplitDelimiterBehavior {
     Removed,
     Isolated,
@@ -108,7 +108,7 @@ pub enum SplitDelimiterBehavior {
 /// It is possible to retrieve a part of the original string, by indexing it with
 /// offsets from the normalized one, and the other way around too. It is also
 /// possible to convert offsets from one referential to the other one easily.
-#[derive(Default, Debug, Clone, PartialEq)]
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
 pub struct NormalizedString {
     /// The original version of the string, before any modification
     original: String,

View File

@@ -4,7 +4,7 @@ use crate::{
 use std::collections::HashMap;
 
 /// Various possible types of offsets
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum OffsetType {
     Byte,
     Char,
@@ -15,7 +15,7 @@ pub enum OffsetType {
 /// This Split contains the underlying `NormalizedString` as well as its offsets
 /// in the original string. These offsets are in the `original` referential.
 /// It also contains any `Token` associated to the current split
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Split {
     /// The underlying `NormalizedString`. Each SubString is represented by a `NormalizedString`
     /// and in the end we might be carrying a lot of SubString representing various parts of the
@@ -49,7 +49,7 @@ impl From<(NormalizedString, Option<Vec<Token>>)> for Split {
 /// Once everything has been normalized and tokenized, the `PreTokenizedString` is able
 /// to build an `Encoding` with all the relevant offsets and word ids, relative to the
 /// original string.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct PreTokenizedString {
     original: String,
     splits: Vec<Split>,

View File

@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::cmp;
 use std::mem;
 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum TruncationDirection {
     Left,
     Right,
@@ -53,7 +53,7 @@ pub enum TruncationError {
     SequenceTooShort,
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum TruncationStrategy {
     LongestFirst,
     OnlyFirst,