From 01f8bc834cf20d8b5125169a1daa2a6c7d4284fd Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Tue, 27 May 2025 11:30:32 +0200 Subject: [PATCH] clippy (#1781) * clippy * fmtr * rutc? * fix onig issue * up * decode stream default * jump a release for cargo audit ... * more cliippy stuff * clippy? * proper style * fmt --- .github/workflows/rust.yml | 2 +- bindings/python/Cargo.toml | 6 +++--- .../py_src/tokenizers/decoders/__init__.pyi | 2 +- tokenizers/benches/llama3.rs | 2 +- tokenizers/src/decoders/byte_fallback.rs | 6 +----- tokenizers/src/models/mod.rs | 4 ++-- tokenizers/src/models/unigram/model.rs | 4 ++-- tokenizers/src/normalizers/byte_level.rs | 2 +- tokenizers/src/pre_tokenizers/byte_level.rs | 2 +- tokenizers/src/processors/bert.rs | 14 ++++++++------ tokenizers/src/processors/roberta.rs | 16 ++++++++-------- tokenizers/src/processors/template.rs | 10 +++++----- tokenizers/src/tokenizer/added_vocabulary.rs | 2 +- tokenizers/src/tokenizer/encoding.rs | 2 +- tokenizers/src/tokenizer/normalizer.rs | 12 ++++-------- tokenizers/src/tokenizer/serialization.rs | 4 +--- tokenizers/src/utils/from_pretrained.rs | 8 +++----- tokenizers/src/utils/mod.rs | 7 ++----- 18 files changed, 46 insertions(+), 59 deletions(-) diff --git a/.github/workflows/rust.yml b/.github/workflows/rust.yml index 9b9dd8e4..cbb0f15f 100644 --- a/.github/workflows/rust.yml +++ b/.github/workflows/rust.yml @@ -10,7 +10,7 @@ jobs: build: runs-on: ${{ matrix.os }} env: - MACOSX_DEPLOYMENT_TARGET: 10.11 + MACOSX_DEPLOYMENT_TARGET: 10.12 strategy: matrix: os: [ubuntu-latest, windows-latest, macOS-latest] diff --git a/bindings/python/Cargo.toml b/bindings/python/Cargo.toml index 2c1daae8..d6e11712 100644 --- a/bindings/python/Cargo.toml +++ b/bindings/python/Cargo.toml @@ -14,8 +14,8 @@ serde = { version = "1.0", features = ["rc", "derive"] } serde_json = "1.0" libc = "0.2" env_logger = "0.11" -pyo3 = { version = "0.23", features = ["abi3", "abi3-py39", "py-clone"] } -numpy = "0.23" +pyo3 = { version = "0.24", features = ["abi3", "abi3-py39", "py-clone"] } +numpy = "0.24" ndarray = "0.16" itertools = "0.12" @@ -24,7 +24,7 @@ path = "../../tokenizers" [dev-dependencies] tempfile = "3.10" -pyo3 = { version = "0.23", features = ["auto-initialize"] } +pyo3 = { version = "0.24", features = ["auto-initialize"] } [features] default = ["pyo3/extension-module"] diff --git a/bindings/python/py_src/tokenizers/decoders/__init__.pyi b/bindings/python/py_src/tokenizers/decoders/__init__.pyi index 672aebb8..488dc770 100644 --- a/bindings/python/py_src/tokenizers/decoders/__init__.pyi +++ b/bindings/python/py_src/tokenizers/decoders/__init__.pyi @@ -33,7 +33,7 @@ class BPEDecoder(Decoder): Args: suffix (:obj:`str`, `optional`, defaults to :obj:``): - The suffix that was used to caracterize an end-of-word. This suffix will + The suffix that was used to characterize an end-of-word. 
This suffix will be replaced by whitespaces during the decoding """ def __init__(self, suffix=""): diff --git a/tokenizers/benches/llama3.rs b/tokenizers/benches/llama3.rs index 77af3bd6..cb6cae8e 100644 --- a/tokenizers/benches/llama3.rs +++ b/tokenizers/benches/llama3.rs @@ -7,7 +7,7 @@ use tokenizers::Tokenizer; pub fn llama3(c: &mut Criterion) { let data = std::fs::read_to_string("data/big.txt").unwrap(); let mut group = c.benchmark_group("llama3-encode"); - group.throughput(Throughput::Bytes(data.bytes().len() as u64)); + group.throughput(Throughput::Bytes(data.len() as u64)); group.bench_function("llama3-offsets", |b| { let tokenizer = Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap(); diff --git a/tokenizers/src/decoders/byte_fallback.rs b/tokenizers/src/decoders/byte_fallback.rs index b04b3db6..57b7b63c 100644 --- a/tokenizers/src/decoders/byte_fallback.rs +++ b/tokenizers/src/decoders/byte_fallback.rs @@ -28,11 +28,7 @@ impl Decoder for ByteFallback { for token in tokens { let bytes = if token.len() == 6 && token.starts_with("<0x") && token.ends_with('>') { - if let Ok(byte) = u8::from_str_radix(&token[3..5], 16) { - Some(byte) - } else { - None - } + u8::from_str_radix(&token[3..5], 16).ok() } else { None }; diff --git a/tokenizers/src/models/mod.rs b/tokenizers/src/models/mod.rs index 3a3a91ad..4e5419ba 100644 --- a/tokenizers/src/models/mod.rs +++ b/tokenizers/src/models/mod.rs @@ -35,7 +35,7 @@ impl Serialize for OrderedVocabIter<'_> { { // There could be holes so max + 1 is more correct than vocab_r.len() let mut holes = vec![]; - let result = if let Some(max) = self.vocab_r.iter().map(|(key, _)| key).max() { + let result = if let Some(max) = self.vocab_r.keys().max() { let iter = (0..*max + 1).filter_map(|i| { if let Some(token) = self.vocab_r.get(&i) { Some((token, i)) @@ -50,7 +50,7 @@ impl Serialize for OrderedVocabIter<'_> { }; if !holes.is_empty() { - warn!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes); + warn!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted !"); println!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted !"); } result diff --git a/tokenizers/src/models/unigram/model.rs b/tokenizers/src/models/unigram/model.rs index da4d631c..fd498c82 100644 --- a/tokenizers/src/models/unigram/model.rs +++ b/tokenizers/src/models/unigram/model.rs @@ -313,7 +313,7 @@ impl Unigram { && node.id == self.unk_id.ok_or(UnigramError::MissingUnkId)? 
{ token.push( - String::from_utf8(sentence[starts_at..ends_at].as_bytes().to_vec()).unwrap(), + String::from_utf8((sentence.as_bytes()[starts_at..ends_at]).to_vec()).unwrap(), ); } else { if !token.is_empty() { @@ -322,7 +322,7 @@ impl Unigram { token = vec![]; } results.push( - String::from_utf8(sentence[starts_at..ends_at].as_bytes().to_vec()).unwrap(), + String::from_utf8((sentence.as_bytes()[starts_at..ends_at]).to_vec()).unwrap(), ); } ends_at = starts_at; diff --git a/tokenizers/src/normalizers/byte_level.rs b/tokenizers/src/normalizers/byte_level.rs index ae47de5a..ae8fecfb 100644 --- a/tokenizers/src/normalizers/byte_level.rs +++ b/tokenizers/src/normalizers/byte_level.rs @@ -35,7 +35,7 @@ impl Normalizer for ByteLevel { let mut i = 0; for cur_char in s.chars() { let size = cur_char.len_utf8(); - let bytes = s[i..i + size].as_bytes(); + let bytes = &s.as_bytes()[i..i + size]; i += size; transformations.extend( bytes diff --git a/tokenizers/src/pre_tokenizers/byte_level.rs b/tokenizers/src/pre_tokenizers/byte_level.rs index e761cbc9..62115dba 100644 --- a/tokenizers/src/pre_tokenizers/byte_level.rs +++ b/tokenizers/src/pre_tokenizers/byte_level.rs @@ -135,7 +135,7 @@ impl PreTokenizer for ByteLevel { let mut i = 0; for cur_char in s.chars() { let size = cur_char.len_utf8(); - let bytes = s[i..i + size].as_bytes(); + let bytes = &s.as_bytes()[i..i + size]; i += size; transformations.extend( bytes diff --git a/tokenizers/src/processors/bert.rs b/tokenizers/src/processors/bert.rs index 17939112..3cd0cf38 100644 --- a/tokenizers/src/processors/bert.rs +++ b/tokenizers/src/processors/bert.rs @@ -65,9 +65,9 @@ impl PostProcessor for BertProcessing { let ids = [&[self.cls.1], encoding.get_ids(), &[self.sep.1]].concat(); let type_ids = [&[0], encoding.get_type_ids(), &[0]].concat(); let tokens = [ - &[self.cls.0.clone()], + std::slice::from_ref(&self.cls.0), encoding.get_tokens(), - &[self.sep.0.clone()], + std::slice::from_ref(&self.sep.0), ] .concat(); let words = [&[None], encoding.get_word_ids(), &[None]].concat(); @@ -95,9 +95,9 @@ impl PostProcessor for BertProcessing { [&[self.cls.1], encoding.get_ids(), &[self.sep.1]].concat(); let type_ids = [&[0], encoding.get_type_ids(), &[0]].concat(); let tokens = [ - &[self.cls.0.clone()], + std::slice::from_ref(&self.cls.0), encoding.get_tokens(), - &[self.sep.0.clone()], + std::slice::from_ref(&self.sep.0), ] .concat(); let words = [&[None], encoding.get_word_ids(), &[None]].concat(); @@ -130,7 +130,8 @@ impl PostProcessor for BertProcessing { } else { let pair_ids = [encoding.get_ids(), &[self.sep.1]].concat(); let pair_type_ids = [encoding.get_type_ids(), &[1]].concat(); - let pair_tokens = [encoding.get_tokens(), &[self.sep.0.clone()]].concat(); + let pair_tokens = + [encoding.get_tokens(), std::slice::from_ref(&self.sep.0)].concat(); let pair_words = [encoding.get_word_ids(), &[None]].concat(); let pair_offsets = [encoding.get_offsets(), &[(0, 0)]].concat(); let pair_special_tokens = @@ -155,7 +156,8 @@ impl PostProcessor for BertProcessing { let pair_ids = [encoding.get_ids(), &[self.sep.1]].concat(); let pair_type_ids = [encoding.get_type_ids(), &[1]].concat(); let pair_tokens = - [encoding.get_tokens(), &[self.sep.0.clone()]].concat(); + [encoding.get_tokens(), std::slice::from_ref(&self.sep.0)] + .concat(); let pair_words = [encoding.get_word_ids(), &[None]].concat(); let pair_offsets = [encoding.get_offsets(), &[(0, 0)]].concat(); let pair_special_tokens = diff --git a/tokenizers/src/processors/roberta.rs 
b/tokenizers/src/processors/roberta.rs index 5bbc4ea6..d870f415 100644 --- a/tokenizers/src/processors/roberta.rs +++ b/tokenizers/src/processors/roberta.rs @@ -95,9 +95,9 @@ impl PostProcessor for RobertaProcessing { let ids = [&[self.cls.1], encoding.get_ids(), &[self.sep.1]].concat(); let type_ids = [&[0], encoding.get_type_ids(), &[0]].concat(); let tokens = [ - &[self.cls.0.clone()], + std::slice::from_ref(&self.cls.0), encoding.get_tokens(), - &[self.sep.0.clone()], + std::slice::from_ref(&self.sep.0), ] .concat(); let words = [&[None], encoding.get_word_ids(), &[None]].concat(); @@ -125,9 +125,9 @@ impl PostProcessor for RobertaProcessing { [&[self.cls.1], encoding.get_ids(), &[self.sep.1]].concat(); let type_ids = vec![0; encoding.get_ids().len() + 2]; let tokens = [ - &[self.cls.0.clone()], + std::slice::from_ref(&self.cls.0), encoding.get_tokens(), - &[self.sep.0.clone()], + std::slice::from_ref(&self.sep.0), ] .concat(); let words = [&[None], encoding.get_word_ids(), &[None]].concat(); @@ -161,9 +161,9 @@ impl PostProcessor for RobertaProcessing { let pair_ids = [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat(); let pair_type_ids = vec![0; encoding.get_ids().len() + 2]; let pair_tokens = [ - &[self.sep.0.clone()], + std::slice::from_ref(&self.sep.0), encoding.get_tokens(), - &[self.sep.0.clone()], + std::slice::from_ref(&self.sep.0), ] .concat(); let pair_words = [&[None], encoding.get_word_ids(), &[None]].concat(); @@ -191,9 +191,9 @@ impl PostProcessor for RobertaProcessing { [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat(); let pair_type_ids = vec![0; encoding.get_ids().len() + 2]; let pair_tokens = [ - &[self.sep.0.clone()], + std::slice::from_ref(&self.sep.0), encoding.get_tokens(), - &[self.sep.0.clone()], + std::slice::from_ref(&self.sep.0), ] .concat(); let pair_words = diff --git a/tokenizers/src/processors/template.rs b/tokenizers/src/processors/template.rs index 6c9cf9a7..d119a6ff 100644 --- a/tokenizers/src/processors/template.rs +++ b/tokenizers/src/processors/template.rs @@ -565,16 +565,16 @@ impl TemplateProcessing { let encoding = Encoding::new( tok.ids.clone(), - std::iter::repeat(*type_id).take(len).collect(), + std::iter::repeat_n(*type_id, len).collect(), tok.tokens.clone(), // words - std::iter::repeat(None).take(len).collect(), + std::iter::repeat_n(None, len).collect(), // offsets - std::iter::repeat((0, 0)).take(len).collect(), + std::iter::repeat_n((0, 0), len).collect(), // special_tokens_mask - std::iter::repeat(1).take(len).collect(), + std::iter::repeat_n(1, len).collect(), // attention_mask - std::iter::repeat(1).take(len).collect(), + std::iter::repeat_n(1, len).collect(), // overflowing vec![], // sequence_range diff --git a/tokenizers/src/tokenizer/added_vocabulary.rs b/tokenizers/src/tokenizer/added_vocabulary.rs index f18b4529..a93d5328 100644 --- a/tokenizers/src/tokenizer/added_vocabulary.rs +++ b/tokenizers/src/tokenizer/added_vocabulary.rs @@ -668,7 +668,7 @@ mod tests { // Also adds tokens already covered by the model let added_token = AddedToken::from("test", false); assert_eq!( - vocab.add_tokens(&[added_token.clone()], &model, normalizer), + vocab.add_tokens(std::slice::from_ref(&added_token), &model, normalizer), 1 ); assert_eq!(vocab.len(), 3); diff --git a/tokenizers/src/tokenizer/encoding.rs b/tokenizers/src/tokenizer/encoding.rs index 1732686e..4449bcde 100644 --- a/tokenizers/src/tokenizer/encoding.rs +++ b/tokenizers/src/tokenizer/encoding.rs @@ -139,7 +139,7 @@ impl Encoding { for seq_id in 
0..self.n_sequences() { let range = self.sequence_range(seq_id); let seq_len = range.len(); - sequences.splice(range, std::iter::repeat(Some(seq_id)).take(seq_len)); + sequences.splice(range, std::iter::repeat_n(Some(seq_id), seq_len)); } sequences } diff --git a/tokenizers/src/tokenizer/normalizer.rs b/tokenizers/src/tokenizer/normalizer.rs index 7f50d9c9..0b8c519e 100644 --- a/tokenizers/src/tokenizer/normalizer.rs +++ b/tokenizers/src/tokenizer/normalizer.rs @@ -328,9 +328,7 @@ impl NormalizedString { }, }; trace!( - "===== transform_range call with {:?} (initial_offset: {}) =====", - n_range, - initial_offset + "===== transform_range call with {n_range:?} (initial_offset: {initial_offset}) =====" ); // Retrieve the original characters that are being replaced. This let us @@ -386,9 +384,7 @@ impl NormalizedString { let replaced_char_size_change = c.len_utf8() as isize - replaced_char_size as isize; if let Some(ref replaced_char) = replaced_char { trace!( - "Replacing char {:?} - with a change in size: {}", - replaced_char, - replaced_char_size_change + "Replacing char {replaced_char:?} - with a change in size: {replaced_char_size_change}" ); } @@ -401,12 +397,12 @@ impl NormalizedString { } else { 0 }; - trace!("Total bytes to remove: {}", total_bytes_to_remove); + trace!("Total bytes to remove: {total_bytes_to_remove}"); // Keep track of the changes for next offsets offset += replaced_char_size as isize; offset += total_bytes_to_remove as isize; - trace!("New offset: {}", offset); + trace!("New offset: {offset}"); trace!("New normalized alignment: {}x {:?}", c.len_utf8(), align); alignments.extend((0..c.len_utf8()).map(|_| align)); diff --git a/tokenizers/src/tokenizer/serialization.rs b/tokenizers/src/tokenizer/serialization.rs index 26d8344f..7075bed8 100644 --- a/tokenizers/src/tokenizer/serialization.rs +++ b/tokenizers/src/tokenizer/serialization.rs @@ -159,9 +159,7 @@ where if rid != token.id { warn!( "Warning: Token '{}' was expected to have ID '{}' but was given ID '{}'", - token.token.content, - token.id, - rid.to_string() + token.token.content, token.id, rid ); } } diff --git a/tokenizers/src/utils/from_pretrained.rs b/tokenizers/src/utils/from_pretrained.rs index 223fbbeb..d2601c9c 100644 --- a/tokenizers/src/utils/from_pretrained.rs +++ b/tokenizers/src/utils/from_pretrained.rs @@ -36,14 +36,13 @@ pub fn from_pretrained>( let valid_chars_stringified = valid_chars .iter() .fold(vec![], |mut buf, x| { - buf.push(format!("'{}'", x)); + buf.push(format!("'{x}'")); buf }) .join(", "); // "'/', '-', '_', '.'" if !valid { return Err(format!( - "Model \"{}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}", - identifier + "Model \"{identifier}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}" ) .into()); } @@ -53,8 +52,7 @@ pub fn from_pretrained>( let valid_revision = revision.chars().all(is_valid_char); if !valid_revision { return Err(format!( - "Revision \"{}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}", - revision + "Revision \"{revision}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}" ) .into()); } diff --git a/tokenizers/src/utils/mod.rs b/tokenizers/src/utils/mod.rs index 75d88734..915dcdd4 100644 --- a/tokenizers/src/utils/mod.rs +++ b/tokenizers/src/utils/mod.rs @@ -2,18 +2,15 @@ pub(crate) mod cache; #[cfg(feature = "http")] pub(crate) mod from_pretrained; -#[cfg(feature = "fancy-regex")] +#[cfg(all(feature = 
"fancy-regex", not(feature = "onig")))] mod fancy; -#[cfg(feature = "fancy-regex")] +#[cfg(all(feature = "fancy-regex", not(feature = "onig")))] pub use fancy::SysRegex; #[cfg(feature = "onig")] mod onig; #[cfg(feature = "onig")] pub use crate::utils::onig::SysRegex; -#[cfg(all(feature = "onig", feature = "fancy-regex"))] -compile_error!("Features `onig` and `fancy-regex` are mutually exclusive"); - #[cfg(not(any(feature = "onig", feature = "fancy-regex")))] compile_error!("One of the `onig`, or `fancy-regex` features must be enabled");