Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 08:15:49 +00:00
clippy (#1781)
* clippy
* fmt
* rustc?
* fix onig issue
* up
* decode stream default
* jump a release for cargo audit ...
* more clippy stuff
* clippy?
* proper style
* fmt
.github/workflows/rust.yml (vendored): 2 lines changed
@@ -10,7 +10,7 @@ jobs:
   build:
     runs-on: ${{ matrix.os }}
     env:
-      MACOSX_DEPLOYMENT_TARGET: 10.11
+      MACOSX_DEPLOYMENT_TARGET: 10.12
     strategy:
       matrix:
         os: [ubuntu-latest, windows-latest, macOS-latest]
@@ -14,8 +14,8 @@ serde = { version = "1.0", features = ["rc", "derive"] }
 serde_json = "1.0"
 libc = "0.2"
 env_logger = "0.11"
-pyo3 = { version = "0.23", features = ["abi3", "abi3-py39", "py-clone"] }
-numpy = "0.23"
+pyo3 = { version = "0.24", features = ["abi3", "abi3-py39", "py-clone"] }
+numpy = "0.24"
 ndarray = "0.16"
 itertools = "0.12"
@@ -24,7 +24,7 @@ path = "../../tokenizers"

 [dev-dependencies]
 tempfile = "3.10"
-pyo3 = { version = "0.23", features = ["auto-initialize"] }
+pyo3 = { version = "0.24", features = ["auto-initialize"] }

 [features]
 default = ["pyo3/extension-module"]
@@ -33,7 +33,7 @@ class BPEDecoder(Decoder):

     Args:
         suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-            The suffix that was used to caracterize an end-of-word. This suffix will
+            The suffix that was used to characterize an end-of-word. This suffix will
             be replaced by whitespaces during the decoding
     """
     def __init__(self, suffix="</w>"):
@@ -7,7 +7,7 @@ use tokenizers::Tokenizer;
 pub fn llama3(c: &mut Criterion) {
     let data = std::fs::read_to_string("data/big.txt").unwrap();
     let mut group = c.benchmark_group("llama3-encode");
-    group.throughput(Throughput::Bytes(data.bytes().len() as u64));
+    group.throughput(Throughput::Bytes(data.len() as u64));
     group.bench_function("llama3-offsets", |b| {
         let tokenizer =
             Tokenizer::from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", None).unwrap();
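
Note: `str::len()` already returns the length in bytes, so going through the `Bytes` iterator only to count it was redundant. A minimal standalone check (the example values are illustrative):

    fn main() {
        let data = String::from("héllo"); // 'é' takes 2 bytes in UTF-8
        // Both expressions measure the same thing; the second is direct.
        assert_eq!(data.bytes().len(), data.len());
        println!("throughput input: {} bytes", data.len());
    }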
@@ -28,11 +28,7 @@ impl Decoder for ByteFallback {

         for token in tokens {
             let bytes = if token.len() == 6 && token.starts_with("<0x") && token.ends_with('>') {
-                if let Ok(byte) = u8::from_str_radix(&token[3..5], 16) {
-                    Some(byte)
-                } else {
-                    None
-                }
+                u8::from_str_radix(&token[3..5], 16).ok()
             } else {
                 None
             };
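
Note: the rewrite above uses `Result::ok()` to collapse the manual `if let Ok(..) { Some(..) } else { None }` into one call. A sketch of the same idiom outside the crate (the `parse_byte_token` helper is hypothetical, not part of tokenizers):

    fn parse_byte_token(token: &str) -> Option<u8> {
        if token.len() == 6 && token.starts_with("<0x") && token.ends_with('>') {
            // `ok()` drops the error case and keeps the parsed byte.
            u8::from_str_radix(&token[3..5], 16).ok()
        } else {
            None
        }
    }

    fn main() {
        assert_eq!(parse_byte_token("<0x41>"), Some(0x41));
        assert_eq!(parse_byte_token("<0xZZ>"), None);
    }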
@@ -35,7 +35,7 @@ impl Serialize for OrderedVocabIter<'_> {
     {
         // There could be holes so max + 1 is more correct than vocab_r.len()
         let mut holes = vec![];
-        let result = if let Some(max) = self.vocab_r.iter().map(|(key, _)| key).max() {
+        let result = if let Some(max) = self.vocab_r.keys().max() {
             let iter = (0..*max + 1).filter_map(|i| {
                 if let Some(token) = self.vocab_r.get(&i) {
                     Some((token, i))
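
Note: `HashMap::keys()` states the intent directly and avoids the `(key, _)` destructuring that clippy flags (most likely the `iter_kv_map` lint). A small self-contained version:

    use std::collections::HashMap;

    fn main() {
        let vocab_r: HashMap<u32, String> =
            HashMap::from([(0, "a".into()), (3, "d".into())]);
        // Before: vocab_r.iter().map(|(key, _)| key).max()
        let max = vocab_r.keys().max();
        assert_eq!(max, Some(&3));
    }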
@@ -50,7 +50,7 @@ impl Serialize for OrderedVocabIter<'_> {
         };

         if !holes.is_empty() {
-            warn!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
-            println!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
+            warn!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted !");
+            println!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted !");
         }
         result
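
Note: this hunk and several below apply clippy's `uninlined_format_args`: named variables move directly into the braces of the format string. A minimal sketch:

    fn main() {
        let holes = vec![5, 9];
        // Before: println!("holes at {:?}", holes);
        println!("holes at {holes:?}");
    }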
@@ -313,7 +313,7 @@ impl Unigram {
                 && node.id == self.unk_id.ok_or(UnigramError::MissingUnkId)?
             {
                 token.push(
-                    String::from_utf8(sentence[starts_at..ends_at].as_bytes().to_vec()).unwrap(),
+                    String::from_utf8((sentence.as_bytes()[starts_at..ends_at]).to_vec()).unwrap(),
                 );
             } else {
                 if !token.is_empty() {
@@ -322,7 +322,7 @@ impl Unigram {
                     token = vec![];
                 }
                 results.push(
-                    String::from_utf8(sentence[starts_at..ends_at].as_bytes().to_vec()).unwrap(),
+                    String::from_utf8((sentence.as_bytes()[starts_at..ends_at]).to_vec()).unwrap(),
                 );
             }
             ends_at = starts_at;
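
Note: when the indices are known to fall on character boundaries (here they come from `char::len_utf8`), slicing `as_bytes()` first skips the UTF-8 boundary check that `&s[a..b]` performs. A standalone illustration:

    fn main() {
        let s = "héllo";
        let (a, b) = (1, 3); // the two bytes of 'é'
        // Same bytes either way; the second form only bounds-checks.
        assert_eq!(s[a..b].as_bytes(), &s.as_bytes()[a..b]);
    }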
@@ -35,7 +35,7 @@ impl Normalizer for ByteLevel {
         let mut i = 0;
         for cur_char in s.chars() {
             let size = cur_char.len_utf8();
-            let bytes = s[i..i + size].as_bytes();
+            let bytes = &s.as_bytes()[i..i + size];
             i += size;
             transformations.extend(
                 bytes
@@ -135,7 +135,7 @@ impl PreTokenizer for ByteLevel {
         let mut i = 0;
         for cur_char in s.chars() {
             let size = cur_char.len_utf8();
-            let bytes = s[i..i + size].as_bytes();
+            let bytes = &s.as_bytes()[i..i + size];
             i += size;
             transformations.extend(
                 bytes
@@ -65,9 +65,9 @@ impl PostProcessor for BertProcessing {
             let ids = [&[self.cls.1], encoding.get_ids(), &[self.sep.1]].concat();
             let type_ids = [&[0], encoding.get_type_ids(), &[0]].concat();
             let tokens = [
-                &[self.cls.0.clone()],
+                std::slice::from_ref(&self.cls.0),
                 encoding.get_tokens(),
-                &[self.sep.0.clone()],
+                std::slice::from_ref(&self.sep.0),
             ]
             .concat();
             let words = [&[None], encoding.get_word_ids(), &[None]].concat();
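
Note: `std::slice::from_ref` turns a `&T` into a one-element `&[T]` without cloning, which is all `concat` needs from its inputs. A sketch with made-up token values:

    fn main() {
        let cls = String::from("[CLS]");
        let middle = vec![String::from("hello"), String::from("world")];
        let sep = String::from("[SEP]");
        // Before: [&[cls.clone()], middle.as_slice(), &[sep.clone()]].concat()
        let tokens = [
            std::slice::from_ref(&cls),
            middle.as_slice(),
            std::slice::from_ref(&sep),
        ]
        .concat();
        assert_eq!(tokens, ["[CLS]", "hello", "world", "[SEP]"]);
    }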
@@ -95,9 +95,9 @@ impl PostProcessor for BertProcessing {
                 [&[self.cls.1], encoding.get_ids(), &[self.sep.1]].concat();
             let type_ids = [&[0], encoding.get_type_ids(), &[0]].concat();
             let tokens = [
-                &[self.cls.0.clone()],
+                std::slice::from_ref(&self.cls.0),
                 encoding.get_tokens(),
-                &[self.sep.0.clone()],
+                std::slice::from_ref(&self.sep.0),
             ]
             .concat();
             let words = [&[None], encoding.get_word_ids(), &[None]].concat();
@@ -130,7 +130,8 @@ impl PostProcessor for BertProcessing {
         } else {
             let pair_ids = [encoding.get_ids(), &[self.sep.1]].concat();
             let pair_type_ids = [encoding.get_type_ids(), &[1]].concat();
-            let pair_tokens = [encoding.get_tokens(), &[self.sep.0.clone()]].concat();
+            let pair_tokens =
+                [encoding.get_tokens(), std::slice::from_ref(&self.sep.0)].concat();
             let pair_words = [encoding.get_word_ids(), &[None]].concat();
             let pair_offsets = [encoding.get_offsets(), &[(0, 0)]].concat();
             let pair_special_tokens =
@@ -155,7 +156,8 @@ impl PostProcessor for BertProcessing {
                 let pair_ids = [encoding.get_ids(), &[self.sep.1]].concat();
                 let pair_type_ids = [encoding.get_type_ids(), &[1]].concat();
                 let pair_tokens =
-                    [encoding.get_tokens(), &[self.sep.0.clone()]].concat();
+                    [encoding.get_tokens(), std::slice::from_ref(&self.sep.0)]
+                        .concat();
                 let pair_words = [encoding.get_word_ids(), &[None]].concat();
                 let pair_offsets = [encoding.get_offsets(), &[(0, 0)]].concat();
                 let pair_special_tokens =
@@ -95,9 +95,9 @@ impl PostProcessor for RobertaProcessing {
             let ids = [&[self.cls.1], encoding.get_ids(), &[self.sep.1]].concat();
             let type_ids = [&[0], encoding.get_type_ids(), &[0]].concat();
             let tokens = [
-                &[self.cls.0.clone()],
+                std::slice::from_ref(&self.cls.0),
                 encoding.get_tokens(),
-                &[self.sep.0.clone()],
+                std::slice::from_ref(&self.sep.0),
             ]
             .concat();
             let words = [&[None], encoding.get_word_ids(), &[None]].concat();
@@ -125,9 +125,9 @@ impl PostProcessor for RobertaProcessing {
                 [&[self.cls.1], encoding.get_ids(), &[self.sep.1]].concat();
             let type_ids = vec![0; encoding.get_ids().len() + 2];
             let tokens = [
-                &[self.cls.0.clone()],
+                std::slice::from_ref(&self.cls.0),
                 encoding.get_tokens(),
-                &[self.sep.0.clone()],
+                std::slice::from_ref(&self.sep.0),
             ]
             .concat();
             let words = [&[None], encoding.get_word_ids(), &[None]].concat();
@@ -161,9 +161,9 @@ impl PostProcessor for RobertaProcessing {
             let pair_ids = [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat();
             let pair_type_ids = vec![0; encoding.get_ids().len() + 2];
             let pair_tokens = [
-                &[self.sep.0.clone()],
+                std::slice::from_ref(&self.sep.0),
                 encoding.get_tokens(),
-                &[self.sep.0.clone()],
+                std::slice::from_ref(&self.sep.0),
             ]
             .concat();
             let pair_words = [&[None], encoding.get_word_ids(), &[None]].concat();
@@ -191,9 +191,9 @@ impl PostProcessor for RobertaProcessing {
                 [&[self.sep.1], encoding.get_ids(), &[self.sep.1]].concat();
             let pair_type_ids = vec![0; encoding.get_ids().len() + 2];
             let pair_tokens = [
-                &[self.sep.0.clone()],
+                std::slice::from_ref(&self.sep.0),
                 encoding.get_tokens(),
-                &[self.sep.0.clone()],
+                std::slice::from_ref(&self.sep.0),
             ]
             .concat();
             let pair_words =
@@ -565,16 +565,16 @@ impl TemplateProcessing {

                 let encoding = Encoding::new(
                     tok.ids.clone(),
-                    std::iter::repeat(*type_id).take(len).collect(),
+                    std::iter::repeat_n(*type_id, len).collect(),
                     tok.tokens.clone(),
                     // words
-                    std::iter::repeat(None).take(len).collect(),
+                    std::iter::repeat_n(None, len).collect(),
                     // offsets
-                    std::iter::repeat((0, 0)).take(len).collect(),
+                    std::iter::repeat_n((0, 0), len).collect(),
                     // special_tokens_mask
-                    std::iter::repeat(1).take(len).collect(),
+                    std::iter::repeat_n(1, len).collect(),
                     // attention_mask
-                    std::iter::repeat(1).take(len).collect(),
+                    std::iter::repeat_n(1, len).collect(),
                     // overflowing
                     vec![],
                     // sequence_range
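
Note: `std::iter::repeat_n(value, n)` (stabilized in Rust 1.82) encodes the count in the iterator itself, replacing the `repeat(value).take(n)` chain. A tiny equivalence check:

    fn main() {
        let len = 4;
        let before: Vec<u32> = std::iter::repeat(1).take(len).collect();
        let after: Vec<u32> = std::iter::repeat_n(1, len).collect();
        assert_eq!(before, after);
    }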
@@ -668,7 +668,7 @@ mod tests {
         // Also adds tokens already covered by the model
         let added_token = AddedToken::from("test", false);
         assert_eq!(
-            vocab.add_tokens(&[added_token.clone()], &model, normalizer),
+            vocab.add_tokens(std::slice::from_ref(&added_token), &model, normalizer),
             1
         );
         assert_eq!(vocab.len(), 3);
@@ -139,7 +139,7 @@ impl Encoding {
         for seq_id in 0..self.n_sequences() {
             let range = self.sequence_range(seq_id);
             let seq_len = range.len();
-            sequences.splice(range, std::iter::repeat(Some(seq_id)).take(seq_len));
+            sequences.splice(range, std::iter::repeat_n(Some(seq_id), seq_len));
         }
         sequences
     }
@@ -328,9 +328,7 @@ impl NormalizedString {
             },
         };
         trace!(
-            "===== transform_range call with {:?} (initial_offset: {}) =====",
-            n_range,
-            initial_offset
+            "===== transform_range call with {n_range:?} (initial_offset: {initial_offset}) ====="
         );

         // Retrieve the original characters that are being replaced. This let us
@@ -386,9 +384,7 @@ impl NormalizedString {
             let replaced_char_size_change = c.len_utf8() as isize - replaced_char_size as isize;
             if let Some(ref replaced_char) = replaced_char {
                 trace!(
-                    "Replacing char {:?} - with a change in size: {}",
-                    replaced_char,
-                    replaced_char_size_change
+                    "Replacing char {replaced_char:?} - with a change in size: {replaced_char_size_change}"
                 );
             }

@@ -401,12 +397,12 @@ impl NormalizedString {
             } else {
                 0
             };
-            trace!("Total bytes to remove: {}", total_bytes_to_remove);
+            trace!("Total bytes to remove: {total_bytes_to_remove}");

             // Keep track of the changes for next offsets
             offset += replaced_char_size as isize;
             offset += total_bytes_to_remove as isize;
-            trace!("New offset: {}", offset);
+            trace!("New offset: {offset}");

             trace!("New normalized alignment: {}x {:?}", c.len_utf8(), align);
             alignments.extend((0..c.len_utf8()).map(|_| align));
@@ -159,9 +159,7 @@ where
             if rid != token.id {
                 warn!(
                     "Warning: Token '{}' was expected to have ID '{}' but was given ID '{}'",
-                    token.token.content,
-                    token.id,
-                    rid.to_string()
+                    token.token.content, token.id, rid
                 );
             }
         }
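
Note: calling `to_string()` on an argument inside a formatting macro is redundant, since `Display` is invoked either way (clippy's `to_string_in_format_args`). For example:

    fn main() {
        let rid: u32 = 7;
        // Before: println!("given ID '{}'", rid.to_string());
        println!("given ID '{rid}'");
    }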
@@ -36,14 +36,13 @@ pub fn from_pretrained<S: AsRef<str>>(
     let valid_chars_stringified = valid_chars
         .iter()
         .fold(vec![], |mut buf, x| {
-            buf.push(format!("'{}'", x));
+            buf.push(format!("'{x}'"));
             buf
         })
         .join(", "); // "'/', '-', '_', '.'"
     if !valid {
         return Err(format!(
-            "Model \"{}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}",
-            identifier
+            "Model \"{identifier}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}"
         )
         .into());
     }
@@ -53,8 +52,7 @@ pub fn from_pretrained<S: AsRef<str>>(
     let valid_revision = revision.chars().all(is_valid_char);
     if !valid_revision {
         return Err(format!(
-            "Revision \"{}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}",
-            revision
+            "Revision \"{revision}\" contains invalid characters, expected only alphanumeric or {valid_chars_stringified}"
         )
         .into());
     }
@@ -2,18 +2,15 @@ pub(crate) mod cache;
 #[cfg(feature = "http")]
 pub(crate) mod from_pretrained;

-#[cfg(feature = "fancy-regex")]
+#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
 mod fancy;
-#[cfg(feature = "fancy-regex")]
+#[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
 pub use fancy::SysRegex;
 #[cfg(feature = "onig")]
 mod onig;
 #[cfg(feature = "onig")]
 pub use crate::utils::onig::SysRegex;

-#[cfg(all(feature = "onig", feature = "fancy-regex"))]
-compile_error!("Features `onig` and `fancy-regex` are mutually exclusive");
-
 #[cfg(not(any(feature = "onig", feature = "fancy-regex")))]
 compile_error!("One of the `onig`, or `fancy-regex` features must be enabled");
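
Note: the removed `compile_error!` made `onig` and `fancy-regex` mutually exclusive; the new cfg gates instead let `onig` take precedence when both features are enabled, which appears to be the "fix onig issue" from the commit message. A crate-agnostic sketch of the pattern (module bodies are placeholders):

    #[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
    mod fancy {
        pub struct SysRegex;
    }
    #[cfg(feature = "onig")]
    mod onig {
        pub struct SysRegex;
    }

    // Exactly one `SysRegex` is exported: onig wins if both features
    // are on; fancy-regex is used only when it is enabled alone.
    #[cfg(all(feature = "fancy-regex", not(feature = "onig")))]
    pub use fancy::SysRegex;
    #[cfg(feature = "onig")]
    pub use onig::SysRegex;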