From bfa842e06357cb9fe2a91c75f73900d721428f5d Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Mon, 19 Dec 2022 13:50:48 +0100 Subject: [PATCH] Adding stale bot ? (#1123) * Adding stale bot ? * Clippy. --- .github/stale.yml | 17 +++++++++++++++++ tokenizers/src/models/unigram/serialization.rs | 2 +- tokenizers/src/models/unigram/trainer.rs | 2 +- tokenizers/src/processors/template.rs | 2 +- tokenizers/src/tokenizer/normalizer.rs | 14 +++----------- 5 files changed, 23 insertions(+), 14 deletions(-) create mode 100644 .github/stale.yml diff --git a/.github/stale.yml b/.github/stale.yml new file mode 100644 index 00000000..dc90e5a1 --- /dev/null +++ b/.github/stale.yml @@ -0,0 +1,17 @@ +# Number of days of inactivity before an issue becomes stale +daysUntilStale: 60 +# Number of days of inactivity before a stale issue is closed +daysUntilClose: 7 +# Issues with these labels will never be considered stale +exemptLabels: + - pinned + - security +# Label to use when marking an issue as stale +staleLabel: wontfix +# Comment to post when marking an issue as stale. Set to `false` to disable +markComment: > + This issue has been automatically marked as stale because it has not had + recent activity. It will be closed if no further activity occurs. Thank you + for your contributions. +# Comment to post when closing a stale issue. 
Set to `false` to disable +closeComment: false diff --git a/tokenizers/src/models/unigram/serialization.rs b/tokenizers/src/models/unigram/serialization.rs index 5ebc8154..a04fc17f 100644 --- a/tokenizers/src/models/unigram/serialization.rs +++ b/tokenizers/src/models/unigram/serialization.rs @@ -63,7 +63,7 @@ impl<'de> Visitor<'de> for UnigramVisitor { } match (vocab, unk_id) { (Some(vocab), unk_id) => Ok(Unigram::from(vocab, unk_id) - .map_err(|err| Error::custom(&format!("Unable to load vocab {:?}", err)))?), + .map_err(|err| Error::custom(format!("Unable to load vocab {:?}", err)))?), (None, _) => Err(Error::custom("Missing vocab")), } } diff --git a/tokenizers/src/models/unigram/trainer.rs b/tokenizers/src/models/unigram/trainer.rs index 444133fb..d4562b28 100644 --- a/tokenizers/src/models/unigram/trainer.rs +++ b/tokenizers/src/models/unigram/trainer.rs @@ -501,7 +501,7 @@ impl UnigramTrainer { let expected_loops = (((desired_vocab_size as f64).ln() - (pieces.len() as f64).ln()) / self.shrinking_factor.ln()) as usize + 1; - let expected_updates = expected_loops as usize * self.n_sub_iterations as usize; + let expected_updates = expected_loops * self.n_sub_iterations as usize; self.update_progress(&progress, expected_updates, "EM training"); let required_chars = self.required_chars(&sentences); let mut new_model = Unigram::from(pieces.clone(), Some(0))?; diff --git a/tokenizers/src/processors/template.rs b/tokenizers/src/processors/template.rs index 13bc91e5..313b4911 100644 --- a/tokenizers/src/processors/template.rs +++ b/tokenizers/src/processors/template.rs @@ -487,7 +487,7 @@ impl TemplateProcessing { .flat_map(|piece| { match piece { Piece::Sequence { id, type_id } => { - let i = if *id == Sequence::A { 0 } else { 1 }; + let i = usize::from(*id != Sequence::A); let encoding = &mut encodings[i]; encoding.set_type_ids(vec![*type_id; encoding.len()]); encoding.set_sequence_id(i); diff --git a/tokenizers/src/tokenizer/normalizer.rs 
b/tokenizers/src/tokenizer/normalizer.rs index ac16ce92..39c2c85d 100644 --- a/tokenizers/src/tokenizer/normalizer.rs +++ b/tokenizers/src/tokenizer/normalizer.rs @@ -507,7 +507,7 @@ impl NormalizedString { let transformations = s .chars() .enumerate() - .map(|(i, c)| (c, if i == 0 { 0 } else { 1 })) + .map(|(i, c)| (c, isize::from(i != 0))) .chain(std::iter::once((next, 1))); self.transform_range(Range::Normalized(0..next.len_utf8()), transformations, 0); @@ -853,16 +853,8 @@ pub fn get_range_of<T: RangeBounds<usize>>(s: &str, range: T) -> Option<&str> { } else if start >= len || end > len || start >= end { None } else { - let start_b = s - .char_indices() - .map(|(i, _)| i) - .nth(start as usize) - .unwrap_or(0); - let end_b = s - .char_indices() - .map(|(i, _)| i) - .nth(end as usize) - .unwrap_or(s.len()); + let start_b = s.char_indices().map(|(i, _)| i).nth(start).unwrap_or(0); + let end_b = s.char_indices().map(|(i, _)| i).nth(end).unwrap_or(s.len()); Some(&s[start_b..end_b]) } }