diff --git a/bindings/node/native/Cargo.lock b/bindings/node/native/Cargo.lock
index 4fb98e5f..f68951ca 100644
--- a/bindings/node/native/Cargo.lock
+++ b/bindings/node/native/Cargo.lock
@@ -847,18 +847,19 @@ dependencies = [
 
 [[package]]
 name = "macro_rules_attribute"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "641c64af6cd80b81cf9c2f2f6ee382b1050c71ce63e20800499971a4a4195005"
+checksum = "258c86475e1616d6f2d8f5227cfaabd3dae1f6d5388b9597df8a199d4497aba7"
 dependencies = [
  "macro_rules_attribute-proc_macro",
+ "paste",
 ]
 
 [[package]]
 name = "macro_rules_attribute-proc_macro"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb246ada5a8c47b8b6e90c9f9a0f84f294939cdf558f1bc8d17fbb30f9706598"
+checksum = "f26a8d2502d5aa4d411ef494ba7470eb299f05725179ce3b5de77aa01a9ffdea"
 
 [[package]]
 name = "matches"
@@ -1182,9 +1183,9 @@ dependencies = [
 
 [[package]]
 name = "paste"
-version = "1.0.6"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
+checksum = "9423e2b32f7a043629287a536f21951e8c6a82482d0acb1eeebfc90bc2225b22"
 
 [[package]]
 name = "percent-encoding"
diff --git a/bindings/python/Cargo.lock b/bindings/python/Cargo.lock
index f40694d7..1867a9ed 100644
--- a/bindings/python/Cargo.lock
+++ b/bindings/python/Cargo.lock
@@ -836,18 +836,19 @@ dependencies = [
 
 [[package]]
 name = "macro_rules_attribute"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "641c64af6cd80b81cf9c2f2f6ee382b1050c71ce63e20800499971a4a4195005"
+checksum = "258c86475e1616d6f2d8f5227cfaabd3dae1f6d5388b9597df8a199d4497aba7"
 dependencies = [
  "macro_rules_attribute-proc_macro",
+ "paste",
 ]
 
 [[package]]
 name = "macro_rules_attribute-proc_macro"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb246ada5a8c47b8b6e90c9f9a0f84f294939cdf558f1bc8d17fbb30f9706598"
+checksum = "f26a8d2502d5aa4d411ef494ba7470eb299f05725179ce3b5de77aa01a9ffdea"
 
 [[package]]
 name = "matches"
@@ -1154,9 +1155,9 @@ dependencies = [
 
 [[package]]
 name = "paste"
-version = "1.0.6"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
+checksum = "9423e2b32f7a043629287a536f21951e8c6a82482d0acb1eeebfc90bc2225b22"
 
 [[package]]
 name = "percent-encoding"
diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs
index 42dd6b7c..458b0256 100644
--- a/bindings/python/src/lib.rs
+++ b/bindings/python/src/lib.rs
@@ -1,5 +1,7 @@
 #![warn(clippy::all)]
 #![allow(clippy::upper_case_acronyms)]
+// Many false positives with pyo3 it seems &str, and &PyAny get flagged
+#![allow(clippy::borrow_deref_ref)]
 
 extern crate tokenizers as tk;
 
diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs
index 709fae30..43ab8488 100644
--- a/tokenizers/src/models/bpe/trainer.rs
+++ b/tokenizers/src/models/bpe/trainer.rs
@@ -164,7 +164,7 @@ impl BpeTrainerBuilder {
 /// let special_tokens = trainer.train(&mut model).unwrap();
 /// ```
 #[non_exhaustive]
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub struct BpeTrainer {
     /// The minimum frequency a pair must have to produce a merge operation
     pub min_frequency: u32,
diff --git a/tokenizers/src/models/wordlevel/mod.rs b/tokenizers/src/models/wordlevel/mod.rs
index 1b08b1d2..3482ffee 100644
--- a/tokenizers/src/models/wordlevel/mod.rs
+++ b/tokenizers/src/models/wordlevel/mod.rs
@@ -94,7 +94,7 @@ impl WordLevelBuilder {
     }
 }
 
-#[derive(PartialEq, Clone)]
+#[derive(PartialEq, Clone, Eq)]
 pub struct WordLevel {
     vocab: HashMap<String, u32>,
     vocab_r: HashMap<u32, String>,
diff --git a/tokenizers/src/models/wordpiece/mod.rs b/tokenizers/src/models/wordpiece/mod.rs
index 38cb6ec9..9baf2458 100644
--- a/tokenizers/src/models/wordpiece/mod.rs
+++ b/tokenizers/src/models/wordpiece/mod.rs
@@ -119,7 +119,7 @@ impl WordPieceBuilder {
 /// A
 /// [WordPiece](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/37842.pdf)
 /// model.
-#[derive(Clone, PartialEq)]
+#[derive(Clone, PartialEq, Eq)]
 pub struct WordPiece {
     vocab: Vocab,
     vocab_r: VocabR,
diff --git a/tokenizers/src/normalizers/replace.rs b/tokenizers/src/normalizers/replace.rs
index c000e9e8..fb42222f 100644
--- a/tokenizers/src/normalizers/replace.rs
+++ b/tokenizers/src/normalizers/replace.rs
@@ -3,7 +3,7 @@ use crate::utils::SysRegex;
 use serde::{Deserialize, Serialize};
 
 /// Represents the different patterns that `Replace` can use
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum ReplacePattern {
     String(String),
     Regex(String),
diff --git a/tokenizers/src/pre_tokenizers/bert.rs b/tokenizers/src/pre_tokenizers/bert.rs
index fa473ccc..93fdd05c 100644
--- a/tokenizers/src/pre_tokenizers/bert.rs
+++ b/tokenizers/src/pre_tokenizers/bert.rs
@@ -6,7 +6,7 @@ fn is_bert_punc(x: char) -> bool {
     char::is_ascii_punctuation(&x) || x.is_punctuation()
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct BertPreTokenizer;
 
diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs
index 77ff3976..afa3d372 100644
--- a/tokenizers/src/pre_tokenizers/byte_level.rs
+++ b/tokenizers/src/pre_tokenizers/byte_level.rs
@@ -42,7 +42,7 @@ lazy_static! {
         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
 /// of all the required processing steps to transform a UTF-8 string as needed before and after the
 /// BPE model does its job.
diff --git a/tokenizers/src/pre_tokenizers/delimiter.rs b/tokenizers/src/pre_tokenizers/delimiter.rs
index 15b91e0a..64ef63cc 100644
--- a/tokenizers/src/pre_tokenizers/delimiter.rs
+++ b/tokenizers/src/pre_tokenizers/delimiter.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 use crate::utils::macro_rules_attribute;
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[non_exhaustive]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct CharDelimiterSplit {
diff --git a/tokenizers/src/pre_tokenizers/digits.rs b/tokenizers/src/pre_tokenizers/digits.rs
index a64bab7c..942e2521 100644
--- a/tokenizers/src/pre_tokenizers/digits.rs
+++ b/tokenizers/src/pre_tokenizers/digits.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 use crate::utils::macro_rules_attribute;
 
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 /// Pre tokenizes the numbers into single tokens. If individual_digits is set
 /// to true, then all digits are splitted into individual tokens.
 #[non_exhaustive]
diff --git a/tokenizers/src/pre_tokenizers/metaspace.rs b/tokenizers/src/pre_tokenizers/metaspace.rs
index 07472b04..ad4df5af 100644
--- a/tokenizers/src/pre_tokenizers/metaspace.rs
+++ b/tokenizers/src/pre_tokenizers/metaspace.rs
@@ -2,7 +2,7 @@ use serde::{Deserialize, Deserializer, Serialize};
 
 use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 
-#[derive(Debug, Clone, PartialEq, Serialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Eq)]
 /// Replaces all the whitespaces by the provided meta character and then
 /// splits on this character
 #[serde(tag = "type")]
diff --git a/tokenizers/src/pre_tokenizers/punctuation.rs b/tokenizers/src/pre_tokenizers/punctuation.rs
index 43e9bc28..0ba7d602 100644
--- a/tokenizers/src/pre_tokenizers/punctuation.rs
+++ b/tokenizers/src/pre_tokenizers/punctuation.rs
@@ -8,7 +8,7 @@ fn is_punc(x: char) -> bool {
     char::is_ascii_punctuation(&x) || x.is_punctuation()
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct Punctuation {
     #[serde(default = "default_split")]
diff --git a/tokenizers/src/pre_tokenizers/split.rs b/tokenizers/src/pre_tokenizers/split.rs
index 1d8fe1af..0e2a9023 100644
--- a/tokenizers/src/pre_tokenizers/split.rs
+++ b/tokenizers/src/pre_tokenizers/split.rs
@@ -6,7 +6,7 @@ use crate::tokenizer::{
 };
 
 /// Represents the different patterns that `Split` can use
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum SplitPattern {
     String(String),
     Regex(String),
diff --git a/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs b/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs
index 82117268..2de48b73 100644
--- a/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs
+++ b/tokenizers/src/pre_tokenizers/unicode_scripts/pre_tokenizer.rs
@@ -2,7 +2,7 @@ use crate::pre_tokenizers::unicode_scripts::scripts::{get_script, Script};
 use crate::tokenizer::{normalizer::Range, PreTokenizedString, PreTokenizer, Result};
 use crate::utils::macro_rules_attribute;
 
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct UnicodeScripts;
 
diff --git a/tokenizers/src/pre_tokenizers/unicode_scripts/scripts.rs b/tokenizers/src/pre_tokenizers/unicode_scripts/scripts.rs
index 16dc0298..2f4588aa 100644
--- a/tokenizers/src/pre_tokenizers/unicode_scripts/scripts.rs
+++ b/tokenizers/src/pre_tokenizers/unicode_scripts/scripts.rs
@@ -2,7 +2,7 @@
 // Unicode scripts : https://gist.github.com/Narsil/07556f26dc84a6baeff4d499e68d3cd2
 // Rust adaptation : https://gist.github.com/Narsil/1df9fbbf5296a8d4d62de55dcb2fe700
 
-#[derive(PartialEq, Debug, Clone, Copy)]
+#[derive(PartialEq, Debug, Clone, Copy, Eq)]
 pub enum Script {
     Any,
     Adlam,
diff --git a/tokenizers/src/pre_tokenizers/whitespace.rs b/tokenizers/src/pre_tokenizers/whitespace.rs
index 73b012c1..63ea4429 100644
--- a/tokenizers/src/pre_tokenizers/whitespace.rs
+++ b/tokenizers/src/pre_tokenizers/whitespace.rs
@@ -5,7 +5,7 @@ use crate::tokenizer::{
 };
 use crate::utils::macro_rules_attribute;
 
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct Whitespace;
 
@@ -28,7 +28,7 @@ impl PreTokenizer for Whitespace {
     }
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct WhitespaceSplit;
 
diff --git a/tokenizers/src/processors/bert.rs b/tokenizers/src/processors/bert.rs
index 57cc52a4..bbfb4577 100644
--- a/tokenizers/src/processors/bert.rs
+++ b/tokenizers/src/processors/bert.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::iter::FromIterator;
 
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
 #[serde(tag = "type")]
 pub struct BertProcessing {
     sep: (String, u32),
diff --git a/tokenizers/src/processors/mod.rs b/tokenizers/src/processors/mod.rs
index e29a0534..a74eec75 100644
--- a/tokenizers/src/processors/mod.rs
+++ b/tokenizers/src/processors/mod.rs
@@ -13,7 +13,7 @@ use crate::processors::roberta::RobertaProcessing;
 use crate::processors::template::TemplateProcessing;
 use crate::{Encoding, PostProcessor, Result};
 
-#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)]
+#[derive(Serialize, Deserialize, PartialEq, Debug, Clone, Eq)]
 #[serde(untagged)]
 pub enum PostProcessorWrapper {
     // Roberta must be before Bert for deserialization (serde does not validate tags)
diff --git a/tokenizers/src/processors/roberta.rs b/tokenizers/src/processors/roberta.rs
index 2985a79a..a8384306 100644
--- a/tokenizers/src/processors/roberta.rs
+++ b/tokenizers/src/processors/roberta.rs
@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::iter::FromIterator;
 
-#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
 #[serde(tag = "type")]
 pub struct RobertaProcessing {
     sep: (String, u32),
diff --git a/tokenizers/src/processors/template.rs b/tokenizers/src/processors/template.rs
index 57c6d202..f2463a80 100644
--- a/tokenizers/src/processors/template.rs
+++ b/tokenizers/src/processors/template.rs
@@ -63,7 +63,7 @@ use std::convert::{TryFrom, TryInto};
 use std::result::Result as StdResult;
 
 /// Represents both sequences received as input of the PostProcessor
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum Sequence {
     /// This is the first sequence, the one that is always specified
     A,
@@ -91,7 +91,7 @@ pub enum Sequence {
 ///
 /// [`SpecialToken`]: struct.SpecialToken.html
 ///
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum Piece {
     Sequence { id: Sequence, type_id: u32 },
     SpecialToken { id: String, type_id: u32 },
@@ -188,7 +188,7 @@ impl TryFrom<&str> for Piece {
 /// vec!["A".into(), "complex".into(), "special".into(), "token".into(), ":".into()]
 /// ).unwrap();
 /// ```
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub struct SpecialToken {
     /// A unique id used to identify this SpecialToken in the template
     id: String,
@@ -249,7 +249,7 @@ impl SpecialToken {
 ///
 /// [`Piece`]: enum.Piece.html
 ///
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 #[serde(transparent)]
 pub struct Template(Vec<Piece>);
 
@@ -289,7 +289,7 @@ impl TryFrom<&str> for Template {
 /// from a HashMap or a Vec<[`SpecialToken`]>.
 ///
 /// [`SpecialToken`]: struct.SpecialToken.html
-#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize, Eq)]
 #[serde(transparent)]
 pub struct Tokens(
     #[serde(serialize_with = "crate::utils::ordered_map")] pub HashMap<String, SpecialToken>,
@@ -332,7 +332,7 @@ impl From<Vec<SpecialToken>> for Tokens {
 /// .unwrap();
 /// ```
 ///
-#[derive(Debug, Clone, PartialEq, Builder, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Builder, Serialize, Deserialize, Eq)]
 #[serde(tag = "type", from = "TemplateProcessingDeserializer")]
 #[builder(build_fn(validate = "Self::validate"))]
 pub struct TemplateProcessing {
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index bff0e2a2..4a706135 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -146,7 +146,7 @@ pub trait Trainer {
         F: Fn(&str) -> Result<Vec<String>> + Sync;
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Token {
     pub id: u32,
     pub value: String,
diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs
index 34df29b7..10089c24 100644
--- a/tokenizers/src/tokenizer/normalizer.rs
+++ b/tokenizers/src/tokenizer/normalizer.rs
@@ -19,7 +19,7 @@ macro_rules! apply_signed {
 }
 
 /// The possible offsets referential
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum OffsetReferential {
     Original,
     Normalized,
@@ -27,7 +27,7 @@ pub enum OffsetReferential {
 
 /// Represents a Range usable by the NormalizedString to index its content.
 /// A Range can use indices relative to either the `Original` or the `Normalized` string
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub enum Range<T: RangeBounds<usize> + Clone> {
     Original(T),
     Normalized(T),
@@ -91,7 +91,7 @@ where
 /// - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]`
 /// - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]`
 /// - Contiguous => `[ "the", "-", "final", "--", "countdown" ]`
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum SplitDelimiterBehavior {
     Removed,
     Isolated,
@@ -108,7 +108,7 @@ pub enum SplitDelimiterBehavior {
 /// It is possible to retrieve a part of the original string, by indexing it with
 /// offsets from the normalized one, and the other way around too. It is also
 /// possible to convert offsets from one referential to the other one easily.
-#[derive(Default, Debug, Clone, PartialEq)]
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
 pub struct NormalizedString {
     /// The original version of the string, before any modification
     original: String,
diff --git a/tokenizers/src/tokenizer/pre_tokenizer.rs b/tokenizers/src/tokenizer/pre_tokenizer.rs
index cf0ff8be..54e24f76 100644
--- a/tokenizers/src/tokenizer/pre_tokenizer.rs
+++ b/tokenizers/src/tokenizer/pre_tokenizer.rs
@@ -4,7 +4,7 @@ use crate::{
 use std::collections::HashMap;
 
 /// Various possible types of offsets
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum OffsetType {
     Byte,
     Char,
@@ -15,7 +15,7 @@ pub enum OffsetType {
 /// This Split contains the underlying `NormalizedString` as well as its offsets
 /// in the original string. These offsets are in the `original` referential.
 /// It also contains any `Token` associated to the current split
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Split {
     /// The underlying `NormalizedString`. Each SubString is represented by a `NormalizedString`
     /// and in the end we might be carrying a lot of SubString representing various parts of the
@@ -49,7 +49,7 @@ impl From<(NormalizedString, Option<Vec<Token>>)> for Split {
 /// Once everything has been normalized and tokenized, the `PreTokenizedString` is able
 /// to build an `Encoding` with all the relevant offsets and word ids, relative to the
 /// original string.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct PreTokenizedString {
     original: String,
     splits: Vec<Split>,
diff --git a/tokenizers/src/utils/truncation.rs b/tokenizers/src/utils/truncation.rs
index 7b1a5fa4..64814787 100644
--- a/tokenizers/src/utils/truncation.rs
+++ b/tokenizers/src/utils/truncation.rs
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::cmp;
 use std::mem;
 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum TruncationDirection {
     Left,
     Right,
@@ -53,7 +53,7 @@ pub enum TruncationError {
     SequenceTooShort,
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum TruncationStrategy {
     LongestFirst,
     OnlyFirst,
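
Note on the recurring change: every hunk above simply adds Eq next to an existing PartialEq derive, which is the pattern newer clippy versions nudge toward (the derive_partial_eq_without_eq lint) and is likely why this cleanup accompanies the dependency bumps and the new clippy allow. As a minimal sketch only — the type below is invented for illustration and is not part of this diff, and Hash is added purely to show one payoff — deriving Eq is free whenever every field is itself Eq, and it lets the type be used where full equivalence is required, such as a HashSet element or HashMap key:

use std::collections::HashSet;

// Hypothetical example type, not from the tokenizers crate.
// Every field (u32, String) implements Eq, so Eq can be derived
// alongside PartialEq at no runtime cost.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct ExampleToken {
    id: u32,
    value: String,
}

fn main() {
    let a = ExampleToken { id: 1, value: "hello".into() };
    let b = ExampleToken { id: 1, value: "hello".into() };
    assert_eq!(a, b); // PartialEq provides ==
    // Eq (plus Hash) allows the type to be used as a set element / map key.
    let set: HashSet<ExampleToken> = vec![a, b].into_iter().collect();
    assert_eq!(set.len(), 1); // the two equal values collapse into one entry
}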