Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Updating code according to clippy. (#1048)
- Adding `Eq` where possible.
- Denied the ref deref warnings, as they were spamming and the suggested solution was not really better.
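The `Eq` additions below all answer clippy's `derive_partial_eq_without_eq` lint: when every field of a `PartialEq` type is itself `Eq`, the type can assert total equality by deriving `Eq` as well. A minimal sketch of the pattern, using the `Token` struct this commit touches (sketch only, not the full definition):

// Before: #[derive(Debug, Clone, PartialEq)]
// After: `Eq` is derived too, since u32 and String are both `Eq`.
// `Eq` has no methods; it only asserts that `==` is a total equivalence
// relation (no NaN-like partial cases), which e.g. HashMap keys rely on.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Token {
    pub id: u32,
    pub value: String,
}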
bindings/node/native/Cargo.lock (generated, 13 changed lines)
@@ -847,18 +847,19 @@ dependencies = [
 
 [[package]]
 name = "macro_rules_attribute"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "641c64af6cd80b81cf9c2f2f6ee382b1050c71ce63e20800499971a4a4195005"
+checksum = "258c86475e1616d6f2d8f5227cfaabd3dae1f6d5388b9597df8a199d4497aba7"
 dependencies = [
  "macro_rules_attribute-proc_macro",
+ "paste",
 ]
 
 [[package]]
 name = "macro_rules_attribute-proc_macro"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb246ada5a8c47b8b6e90c9f9a0f84f294939cdf558f1bc8d17fbb30f9706598"
+checksum = "f26a8d2502d5aa4d411ef494ba7470eb299f05725179ce3b5de77aa01a9ffdea"
 
 [[package]]
 name = "matches"
@@ -1182,9 +1183,9 @@ dependencies = [
 
 [[package]]
 name = "paste"
-version = "1.0.6"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
+checksum = "9423e2b32f7a043629287a536f21951e8c6a82482d0acb1eeebfc90bc2225b22"
 
 [[package]]
 name = "percent-encoding"
bindings/python/Cargo.lock (generated, 13 changed lines)
@@ -836,18 +836,19 @@ dependencies = [
 
 [[package]]
 name = "macro_rules_attribute"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "641c64af6cd80b81cf9c2f2f6ee382b1050c71ce63e20800499971a4a4195005"
+checksum = "258c86475e1616d6f2d8f5227cfaabd3dae1f6d5388b9597df8a199d4497aba7"
 dependencies = [
  "macro_rules_attribute-proc_macro",
+ "paste",
 ]
 
 [[package]]
 name = "macro_rules_attribute-proc_macro"
-version = "0.0.2"
+version = "0.1.2"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "eb246ada5a8c47b8b6e90c9f9a0f84f294939cdf558f1bc8d17fbb30f9706598"
+checksum = "f26a8d2502d5aa4d411ef494ba7470eb299f05725179ce3b5de77aa01a9ffdea"
 
 [[package]]
 name = "matches"
@@ -1154,9 +1155,9 @@ dependencies = [
 
 [[package]]
 name = "paste"
-version = "1.0.6"
+version = "1.0.8"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "0744126afe1a6dd7f394cb50a716dbe086cb06e255e53d8d0185d82828358fb5"
+checksum = "9423e2b32f7a043629287a536f21951e8c6a82482d0acb1eeebfc90bc2225b22"
 
 [[package]]
 name = "percent-encoding"
@@ -1,5 +1,7 @@
 #![warn(clippy::all)]
 #![allow(clippy::upper_case_acronyms)]
+// Many false positives with pyo3 it seems &str, and &PyAny get flagged
+#![allow(clippy::borrow_deref_ref)]
 
 extern crate tokenizers as tk;
 
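For context, `clippy::borrow_deref_ref` fires on a `&*expr` re-borrow of something that is already a reference. A hypothetical example of the shape that gets flagged; the actual triggers here sit in pyo3's generated argument handling for `&str` and `&PyAny`, which is why the commit allows the lint crate-wide instead of patching generated code:

fn length(s: &str) -> usize {
    s.len()
}

fn main() {
    let owned = String::from("tokenizer");
    let r: &String = &owned;
    // clippy::borrow_deref_ref: `&*r` is a redundant re-borrow; plain `r` works.
    let n = length(&*r);
    assert_eq!(n, 9);
}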
@@ -164,7 +164,7 @@ impl BpeTrainerBuilder {
 /// let special_tokens = trainer.train(&mut model).unwrap();
 /// ```
 #[non_exhaustive]
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub struct BpeTrainer {
     /// The minimum frequency a pair must have to produce a merge operation
     pub min_frequency: u32,
@@ -94,7 +94,7 @@ impl WordLevelBuilder {
     }
 }
 
-#[derive(PartialEq, Clone)]
+#[derive(PartialEq, Clone, Eq)]
 pub struct WordLevel {
     vocab: HashMap<String, u32>,
     vocab_r: HashMap<u32, String>,
@@ -119,7 +119,7 @@ impl WordPieceBuilder {
 /// A
 /// [WordPiece](https://static.googleusercontent.com/media/research.google.com/en//pubs/archive/37842.pdf)
 /// model.
-#[derive(Clone, PartialEq)]
+#[derive(Clone, PartialEq, Eq)]
 pub struct WordPiece {
     vocab: Vocab,
     vocab_r: VocabR,
@@ -3,7 +3,7 @@ use crate::utils::SysRegex;
 use serde::{Deserialize, Serialize};
 
 /// Represents the different patterns that `Replace` can use
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum ReplacePattern {
     String(String),
     Regex(String),
@@ -6,7 +6,7 @@ fn is_bert_punc(x: char) -> bool {
     char::is_ascii_punctuation(&x) || x.is_punctuation()
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct BertPreTokenizer;
 
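Several hunks here and below attach `#[macro_rules_attribute(impl_serde_type!)]` to unit structs. That attribute comes from the `macro_rules_attribute` crate bumped to 0.1.2 in the lockfiles above: it lets a plain `macro_rules!` macro be applied as an attribute, receiving the annotated item as input. A sketch of the mechanism, with an illustrative macro standing in for the crate's real `impl_serde_type!`:

use macro_rules_attribute::macro_rules_attribute;

// Illustrative stand-in for `impl_serde_type!`: re-emit the item and
// generate an extra impl for it.
macro_rules! impl_name {
    ($(#[$attr:meta])* $vis:vis struct $name:ident;) => {
        $(#[$attr])*
        $vis struct $name;

        impl $name {
            pub fn name() -> &'static str {
                stringify!($name)
            }
        }
    };
}

#[macro_rules_attribute(impl_name!)]
#[derive(Copy, Clone, Debug, PartialEq, Eq)]
pub struct BertPreTokenizer;

fn main() {
    assert_eq!(BertPreTokenizer::name(), "BertPreTokenizer");
}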
@@ -42,7 +42,7 @@ lazy_static! {
         bytes_char().into_iter().map(|(c, b)| (b, c)).collect();
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 /// Provides all the necessary steps to handle the BPE tokenization at the byte-level. Takes care
 /// of all the required processing steps to transform a UTF-8 string as needed before and after the
 /// BPE model does its job.
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 use crate::utils::macro_rules_attribute;
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[non_exhaustive]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct CharDelimiterSplit {
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use crate::tokenizer::{PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 use crate::utils::macro_rules_attribute;
 
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 /// Pre tokenizes the numbers into single tokens. If individual_digits is set
 /// to true, then all digits are splitted into individual tokens.
 #[non_exhaustive]
@@ -2,7 +2,7 @@ use serde::{Deserialize, Deserializer, Serialize};
 
 use crate::tokenizer::{Decoder, PreTokenizedString, PreTokenizer, Result, SplitDelimiterBehavior};
 
-#[derive(Debug, Clone, PartialEq, Serialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Eq)]
 /// Replaces all the whitespaces by the provided meta character and then
 /// splits on this character
 #[serde(tag = "type")]
@@ -8,7 +8,7 @@ fn is_punc(x: char) -> bool {
     char::is_ascii_punctuation(&x) || x.is_punctuation()
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct Punctuation {
     #[serde(default = "default_split")]
@@ -6,7 +6,7 @@ use crate::tokenizer::{
 };
 
 /// Represents the different patterns that `Split` can use
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum SplitPattern {
     String(String),
     Regex(String),
@@ -2,7 +2,7 @@ use crate::pre_tokenizers::unicode_scripts::scripts::{get_script, Script};
 use crate::tokenizer::{normalizer::Range, PreTokenizedString, PreTokenizer, Result};
 use crate::utils::macro_rules_attribute;
 
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct UnicodeScripts;
 
@@ -2,7 +2,7 @@
 // Unicode scripts : https://gist.github.com/Narsil/07556f26dc84a6baeff4d499e68d3cd2
 // Rust adaptation : https://gist.github.com/Narsil/1df9fbbf5296a8d4d62de55dcb2fe700
 
-#[derive(PartialEq, Debug, Clone, Copy)]
+#[derive(PartialEq, Debug, Clone, Copy, Eq)]
 pub enum Script {
     Any,
     Adlam,
@@ -5,7 +5,7 @@ use crate::tokenizer::{
 };
 use crate::utils::macro_rules_attribute;
 
-#[derive(Clone, Debug, PartialEq)]
+#[derive(Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct Whitespace;
 
@@ -28,7 +28,7 @@ impl PreTokenizer for Whitespace {
     }
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, Eq)]
 #[macro_rules_attribute(impl_serde_type!)]
 pub struct WhitespaceSplit;
 
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::iter::FromIterator;
 
-#[derive(Serialize, Deserialize, Clone, Debug, PartialEq)]
+#[derive(Serialize, Deserialize, Clone, Debug, PartialEq, Eq)]
 #[serde(tag = "type")]
 pub struct BertProcessing {
     sep: (String, u32),
@@ -13,7 +13,7 @@ use crate::processors::roberta::RobertaProcessing;
 use crate::processors::template::TemplateProcessing;
 use crate::{Encoding, PostProcessor, Result};
 
-#[derive(Serialize, Deserialize, PartialEq, Debug, Clone)]
+#[derive(Serialize, Deserialize, PartialEq, Debug, Clone, Eq)]
 #[serde(untagged)]
 pub enum PostProcessorWrapper {
     // Roberta must be before Bert for deserialization (serde does not validate tags)
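The comment about Roberta preceding Bert is worth unpacking: with `#[serde(untagged)]`, serde tries the variants in declaration order and returns the first one that deserializes successfully, and since unknown fields are ignored by default, a less specific variant listed first would swallow payloads meant for a more specific one. A simplified, self-contained illustration (field layout invented for the example):

use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct Roberta {
    sep: (String, u32),
    trim_offsets: bool,
}

#[derive(Deserialize, Debug)]
struct Bert {
    sep: (String, u32),
}

#[derive(Deserialize, Debug)]
#[serde(untagged)]
enum Wrapper {
    Roberta(Roberta), // the more specific variant must come first
    Bert(Bert),
}

fn main() {
    let json = r#"{ "sep": ["</s>", 2], "trim_offsets": true }"#;
    let w: Wrapper = serde_json::from_str(json).unwrap();
    // Matches Roberta; with the variants flipped it would match Bert,
    // silently dropping `trim_offsets`.
    println!("{w:?}");
}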
@@ -4,7 +4,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::HashMap;
 use std::iter::FromIterator;
 
-#[derive(Serialize, Deserialize, Debug, Clone, PartialEq)]
+#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)]
 #[serde(tag = "type")]
 pub struct RobertaProcessing {
     sep: (String, u32),
@@ -63,7 +63,7 @@ use std::convert::{TryFrom, TryInto};
 use std::result::Result as StdResult;
 
 /// Represents both sequences received as input of the PostProcessor
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum Sequence {
     /// This is the first sequence, the one that is always specified
     A,
@@ -91,7 +91,7 @@ pub enum Sequence {
 ///
 /// [`SpecialToken`]: struct.SpecialToken.html
 ///
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub enum Piece {
     Sequence { id: Sequence, type_id: u32 },
     SpecialToken { id: String, type_id: u32 },
@@ -188,7 +188,7 @@ impl TryFrom<&str> for Piece {
 ///     vec!["A".into(), "complex".into(), "special".into(), "token".into(), ":".into()]
 /// ).unwrap();
 /// ```
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 pub struct SpecialToken {
     /// A unique id used to identify this SpecialToken in the template
     id: String,
@@ -249,7 +249,7 @@ impl SpecialToken {
 ///
 /// [`Piece`]: enum.Piece.html
 ///
-#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Serialize, Deserialize, Eq)]
 #[serde(transparent)]
 pub struct Template(Vec<Piece>);
 
@@ -289,7 +289,7 @@ impl TryFrom<&str> for Template {
 /// from a HashMap or a Vec<[`SpecialToken`]>.
 ///
 /// [`SpecialToken`]: struct.SpecialToken.html
-#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Default, Serialize, Deserialize, Eq)]
 #[serde(transparent)]
 pub struct Tokens(
     #[serde(serialize_with = "crate::utils::ordered_map")] pub HashMap<String, SpecialToken>,
@@ -332,7 +332,7 @@ impl From<HashMap<String, SpecialToken>> for Tokens {
 /// .unwrap();
 /// ```
 ///
-#[derive(Debug, Clone, PartialEq, Builder, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Builder, Serialize, Deserialize, Eq)]
 #[serde(tag = "type", from = "TemplateProcessingDeserializer")]
 #[builder(build_fn(validate = "Self::validate"))]
 pub struct TemplateProcessing {
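`TemplateProcessing` also shows the `derive_builder` pattern used with validation: `build_fn(validate = "Self::validate")` makes the generated `build()` call a checking function on the builder before constructing the value. A minimal sketch under the assumption of the documented `derive_builder` API (field and error message invented for the example):

use derive_builder::Builder;

#[derive(Debug, Clone, PartialEq, Eq, Builder)]
#[builder(build_fn(validate = "Self::validate"))]
pub struct Template {
    pieces: Vec<String>,
}

impl TemplateBuilder {
    // Runs before `build()` constructs a `Template`.
    fn validate(&self) -> Result<(), String> {
        match &self.pieces {
            Some(p) if p.is_empty() => Err("template cannot be empty".into()),
            _ => Ok(()),
        }
    }
}

fn main() {
    let err = TemplateBuilder::default().pieces(vec![]).build();
    assert!(err.is_err());
}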
@@ -146,7 +146,7 @@ pub trait Trainer {
         F: Fn(&str) -> Result<Vec<String>> + Sync;
 }
 
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Token {
     pub id: u32,
     pub value: String,
@@ -19,7 +19,7 @@ macro_rules! apply_signed {
 }
 
 /// The possible offsets referential
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum OffsetReferential {
     Original,
     Normalized,
@@ -27,7 +27,7 @@ pub enum OffsetReferential {
 
 /// Represents a Range usable by the NormalizedString to index its content.
 /// A Range can use indices relative to either the `Original` or the `Normalized` string
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub enum Range<T: RangeBounds<usize> + Clone> {
     Original(T),
     Normalized(T),
@@ -91,7 +91,7 @@ where
 /// - MergedWithPrevious => `[ "the-", "final-", "-", "countdown" ]`
 /// - MergedWithNext => `[ "the", "-final", "-", "-countdown" ]`
 /// - Contiguous => `[ "the", "-", "final", "--", "countdown" ]`
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum SplitDelimiterBehavior {
     Removed,
     Isolated,
@@ -108,7 +108,7 @@ pub enum SplitDelimiterBehavior {
 /// It is possible to retrieve a part of the original string, by indexing it with
 /// offsets from the normalized one, and the other way around too. It is also
 /// possible to convert offsets from one referential to the other one easily.
-#[derive(Default, Debug, Clone, PartialEq)]
+#[derive(Default, Debug, Clone, PartialEq, Eq)]
 pub struct NormalizedString {
     /// The original version of the string, before any modification
     original: String,
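The `NormalizedString` doc comment above describes two-way offset conversion. A toy sketch of the underlying idea (not the crate's actual types): keep one original-byte range per normalized byte, so a normalized range can be mapped back by combining the ranges at its endpoints:

struct Aligned {
    original: String,
    normalized: String,
    // alignments[i] = byte range in `original` that normalized byte i came from
    alignments: Vec<(usize, usize)>,
}

impl Aligned {
    // Convert a normalized byte range to the corresponding original range.
    fn to_original(&self, start: usize, end: usize) -> Option<(usize, usize)> {
        let first = self.alignments.get(start)?;
        let last = self.alignments.get(end.checked_sub(1)?)?;
        Some((first.0, last.1))
    }
}

fn main() {
    // "É" lowercased to "é": both are 2 bytes in UTF-8, and each normalized
    // byte maps back to the full 2-byte original character.
    let a = Aligned {
        original: "É".into(),
        normalized: "é".into(),
        alignments: vec![(0, 2), (0, 2)],
    };
    println!("{} -> {}", a.original, a.normalized);
    assert_eq!(a.to_original(0, 2), Some((0, 2)));
}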
@@ -4,7 +4,7 @@ use crate::{
 use std::collections::HashMap;
 
 /// Various possible types of offsets
-#[derive(Debug, Clone, Copy, PartialEq)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
 pub enum OffsetType {
     Byte,
     Char,
@@ -15,7 +15,7 @@ pub enum OffsetType {
 /// This Split contains the underlying `NormalizedString` as well as its offsets
 /// in the original string. These offsets are in the `original` referential.
 /// It also contains any `Token` associated to the current split
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Split {
     /// The underlying `NormalizedString`. Each SubString is represented by a `NormalizedString`
     /// and in the end we might be carrying a lot of SubString representing various parts of the
@@ -49,7 +49,7 @@ impl From<(NormalizedString, Option<Vec<Token>>)> for Split {
 /// Once everything has been normalized and tokenized, the `PreTokenizedString` is able
 /// to build an `Encoding` with all the relevant offsets and word ids, relative to the
 /// original string.
-#[derive(Debug, Clone, PartialEq)]
+#[derive(Debug, Clone, PartialEq, Eq)]
 pub struct PreTokenizedString {
     original: String,
     splits: Vec<Split>,
@@ -3,7 +3,7 @@ use serde::{Deserialize, Serialize};
 use std::cmp;
 use std::mem;
 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum TruncationDirection {
     Left,
     Right,
@@ -53,7 +53,7 @@ pub enum TruncationError {
     SequenceTooShort,
 }
 
-#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize, Eq)]
 pub enum TruncationStrategy {
     LongestFirst,
     OnlyFirst,