Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
Fixing clippy warnings on 1.71. (#1296)
* Fixing clippy warnings on 1.71.
* Fix.
* Fmt.
* Python clippy.
* Should really set my env back again.
* Fix.
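Most of the hunks below replace `Whitespace::default()` (and similar `::default()` calls on other field-less structs) with a direct struct literal such as `Whitespace {}`. That is what clippy on Rust 1.71 asks for, presumably via the `default_constructed_unit_structs` lint added around that release. A minimal sketch of the pattern, using a made-up `Marker` struct rather than anything from this repository:

// Sketch only: `Marker` is a hypothetical stand-in for field-less types such as
// `Whitespace`; it is not part of the tokenizers crate.
#[derive(Default, Debug, PartialEq)]
struct Marker;

fn main() {
    let a = Marker::default(); // the form clippy 1.71 warns about for unit structs
    let b = Marker {};         // the struct-literal form the diff switches to
    assert_eq!(a, b);
}

The remaining hunks drop `mut` from `let` bindings and closure patterns that are never reassigned; both kinds of change are illustrated further down.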
@@ -155,8 +155,7 @@ fn byte_level_alphabet(mut cx: FunctionContext) -> JsResult<JsValue> {
 fn whitespace(mut cx: FunctionContext) -> JsResult<JsPreTokenizer> {
     let mut pretok = JsPreTokenizer::new::<_, JsPreTokenizer, _>(&mut cx, vec![])?;
     let guard = cx.lock();
-    pretok.borrow_mut(&guard).pretok =
-        Some(tk::pre_tokenizers::whitespace::Whitespace::default().into());
+    pretok.borrow_mut(&guard).pretok = Some(tk::pre_tokenizers::whitespace::Whitespace {}.into());
     Ok(pretok)
 }

@@ -303,10 +302,10 @@ mod test {

     #[test]
     fn serialize() {
-        let js_wrapped: JsPreTokenizerWrapper = Whitespace::default().into();
+        let js_wrapped: JsPreTokenizerWrapper = Whitespace {}.into();
         let js_ser = serde_json::to_string(&js_wrapped).unwrap();

-        let rs_wrapped = PreTokenizerWrapper::Whitespace(Whitespace::default());
+        let rs_wrapped = PreTokenizerWrapper::Whitespace(Whitespace {});
         let rs_ser = serde_json::to_string(&rs_wrapped).unwrap();
         assert_eq!(js_ser, rs_ser);

@@ -320,11 +319,11 @@ mod test {
         }

         let js_seq: JsPreTokenizerWrapper =
-            Sequence::new(vec![WhitespaceSplit.into(), Whitespace::default().into()]).into();
+            Sequence::new(vec![WhitespaceSplit.into(), Whitespace {}.into()]).into();
         let js_wrapper_ser = serde_json::to_string(&js_seq).unwrap();
         let rs_wrapped = PreTokenizerWrapper::Sequence(Sequence::new(vec![
             WhitespaceSplit.into(),
-            Whitespace::default().into(),
+            Whitespace {}.into(),
         ]));
         let rs_ser = serde_json::to_string(&rs_wrapped).unwrap();
         assert_eq!(js_wrapper_ser, rs_ser);
@@ -335,7 +334,7 @@ mod test {
         let js_ser = serde_json::to_string(&js_seq).unwrap();
         assert_eq!(js_wrapper_ser, js_ser);

-        let rs_seq = Sequence::new(vec![WhitespaceSplit.into(), Whitespace::default().into()]);
+        let rs_seq = Sequence::new(vec![WhitespaceSplit.into(), Whitespace {}.into()]);
         let rs_ser = serde_json::to_string(&rs_seq).unwrap();
         assert_eq!(js_wrapper_ser, rs_ser);
     }

@@ -301,7 +301,7 @@ pub struct PyWhitespace {}
 impl PyWhitespace {
     #[new]
     fn new() -> (Self, PyPreTokenizer) {
-        (PyWhitespace {}, Whitespace::default().into())
+        (PyWhitespace {}, Whitespace {}.into())
     }
 }

@@ -722,7 +722,7 @@ mod test {
     #[test]
     fn get_subtype() {
         Python::with_gil(|py| {
-            let py_norm = PyPreTokenizer::new(Whitespace::default().into());
+            let py_norm = PyPreTokenizer::new(Whitespace {}.into());
             let py_wsp = py_norm.get_as_subtype(py).unwrap();
             assert_eq!("Whitespace", py_wsp.as_ref(py).get_type().name().unwrap());
         })
@@ -730,9 +730,9 @@ mod test {

     #[test]
     fn serialize() {
-        let py_wrapped: PyPreTokenizerWrapper = Whitespace::default().into();
+        let py_wrapped: PyPreTokenizerWrapper = Whitespace {}.into();
         let py_ser = serde_json::to_string(&py_wrapped).unwrap();
-        let rs_wrapped = PreTokenizerWrapper::Whitespace(Whitespace::default());
+        let rs_wrapped = PreTokenizerWrapper::Whitespace(Whitespace {});
         let rs_ser = serde_json::to_string(&rs_wrapped).unwrap();
         assert_eq!(py_ser, rs_ser);
         let py_pretok: PyPreTokenizer = serde_json::from_str(&rs_ser).unwrap();
@@ -745,10 +745,10 @@ mod test {
         }

         let py_seq: PyPreTokenizerWrapper =
-            Sequence::new(vec![Whitespace::default().into(), WhitespaceSplit.into()]).into();
+            Sequence::new(vec![Whitespace {}.into(), WhitespaceSplit.into()]).into();
         let py_wrapper_ser = serde_json::to_string(&py_seq).unwrap();
         let rs_wrapped = PreTokenizerWrapper::Sequence(Sequence::new(vec![
-            Whitespace::default().into(),
+            Whitespace {}.into(),
             WhitespaceSplit.into(),
         ]));
         let rs_ser = serde_json::to_string(&rs_wrapped).unwrap();
@@ -759,7 +759,7 @@ mod test {
         assert_eq!(py_wrapper_ser, py_ser);

         let obj = Python::with_gil(|py| {
-            let py_wsp = PyPreTokenizer::new(Whitespace::default().into());
+            let py_wsp = PyPreTokenizer::new(Whitespace {}.into());
             let obj: PyObject = Py::new(py, py_wsp).unwrap().into_py(py);
             obj
         });

@@ -81,7 +81,7 @@ fn bench_train(c: &mut Criterion) {
         DecoderWrapper,
     >;
     let mut tokenizer = Tok::new(WordPiece::default());
-    tokenizer.with_pre_tokenizer(Whitespace::default());
+    tokenizer.with_pre_tokenizer(Whitespace {});
     c.bench_function("WordPiece Train vocabulary (small)", |b| {
         b.iter_custom(|iters| {
             iter_bench_train(
@@ -94,7 +94,7 @@ fn bench_train(c: &mut Criterion) {
     });

     let mut tokenizer = Tok::new(WordPiece::default());
-    tokenizer.with_pre_tokenizer(Whitespace::default());
+    tokenizer.with_pre_tokenizer(Whitespace {});
     c.bench_function("WordPiece Train vocabulary (big)", |b| {
         b.iter_custom(|iters| {
             iter_bench_train(

@@ -74,7 +74,7 @@ fn bench_train(c: &mut Criterion) {
         .build()
         .into();
     let mut tokenizer = Tokenizer::new(BPE::default()).into_inner();
-    tokenizer.with_pre_tokenizer(Whitespace::default());
+    tokenizer.with_pre_tokenizer(Whitespace {});
     c.bench_function("BPE Train vocabulary (small)", |b| {
         b.iter_custom(|iters| {
             iter_bench_train(
@@ -87,7 +87,7 @@ fn bench_train(c: &mut Criterion) {
     });

     let mut tokenizer = Tokenizer::new(BPE::default()).into_inner();
-    tokenizer.with_pre_tokenizer(Whitespace::default());
+    tokenizer.with_pre_tokenizer(Whitespace {});
     c.bench_function("BPE Train vocabulary (big)", |b| {
         b.iter_custom(|iters| {
             iter_bench_train(

@@ -271,7 +271,7 @@ impl Unigram {
             {
                 let key_pos = starts_at + tok_bytes.len();
                 let token: String = String::from_utf8(tok_bytes).unwrap();
-                let mut target_node = &mut best_path_ends_at[key_pos];
+                let target_node = &mut best_path_ends_at[key_pos];
                 let length = key_pos - starts_at;
                 let id = self.token_to_ids.get(&token).unwrap();
                 let score = self.vocab.get(*id as usize).unwrap().1;
@@ -288,7 +288,7 @@ impl Unigram {
                 }
             }
             if !has_single_node {
-                let mut target_node = &mut best_path_ends_at[starts_at + mblen];
+                let target_node = &mut best_path_ends_at[starts_at + mblen];
                 let candidate_best_path_score = unk_score + best_path_score_till_here;
                 if target_node.starts_at.is_none()
                     || candidate_best_path_score > target_node.best_path_score

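The two Unigram hunks above drop `mut` from bindings that only hold a mutable reference: mutation happens through the `&mut` reference, so the binding itself never needs to be mutable and the extra `mut` is flagged as unused. A small illustrative sketch (the `Node` type and its field below are made up, not the crate's actual lattice types):

// Illustrative only: `Node` and `score` stand in for the Unigram lattice node.
#[derive(Clone)]
struct Node {
    score: f64,
}

fn main() {
    let mut best_path_ends_at = vec![Node { score: 0.0 }; 4];

    // `target_node` itself is never reassigned, so `let mut` is unnecessary;
    // writing through the `&mut Node` reference still works fine.
    let target_node = &mut best_path_ends_at[2];
    target_node.score = 1.5;

    assert_eq!(best_path_ends_at[2].score, 1.5);
}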
@@ -197,7 +197,7 @@ impl PostProcessor for ByteLevel {
 }

 pub fn process_offsets(encoding: &mut Encoding, add_prefix_space: bool) {
-    encoding.process_tokens_with_offsets_mut(|(i, (token, mut offsets))| {
+    encoding.process_tokens_with_offsets_mut(|(i, (token, offsets))| {
         let mut leading_spaces = token
             .chars()
             .take_while(|c| *c == BYTES_CHAR[&b' '] || c.is_whitespace())

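The `process_offsets` hunk above is the same cleanup applied inside a closure pattern: the closure only reads and writes through the reference it is handed, so binding it as `mut offsets` is unnecessary. A rough sketch of the idea, using a plain `Vec` instead of the crate's `process_tokens_with_offsets_mut` helper:

fn main() {
    let mut offsets: Vec<(usize, usize)> = vec![(0, 5), (6, 11)];

    // The closure receives `&mut (usize, usize)`; marking the binding itself as
    // `mut` (i.e. `|mut off|`) would be flagged, because `off` is never reassigned.
    offsets.iter_mut().for_each(|off| {
        off.0 += 1;
        off.1 += 1;
    });

    assert_eq!(offsets, vec![(1, 6), (7, 12)]);
}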
@@ -84,7 +84,7 @@ mod tests {

     #[test]
     fn basic() {
-        let pretok = UnicodeScripts::default();
+        let pretok = UnicodeScripts {};
         let mut pretokenized = PreTokenizedString::from("どこで生れ。Yes");
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(
@@ -107,7 +107,7 @@ mod tests {

     #[test]
     fn spaces_are_included_in_every_script() {
-        let pretok = UnicodeScripts::default();
+        let pretok = UnicodeScripts {};
         let mut pretokenized = PreTokenizedString::from("Apples are りんご 林檎");
         pretok.pre_tokenize(&mut pretokenized).unwrap();
         assert_eq!(

@@ -64,7 +64,7 @@ mod tests {
             ),
             ("\n", vec![]),
         ];
-        let pretok = Whitespace::default();
+        let pretok = Whitespace {};
         for (s, res) in tests {
             let mut pretokenized = PreTokenizedString::from(s);
             pretok.pre_tokenize(&mut pretokenized).unwrap();

@@ -93,7 +93,7 @@ fn quicktour_slow_train() -> tokenizers::Result<()> {
     // START quicktour_init_pretok
     use tokenizers::pre_tokenizers::whitespace::Whitespace;

-    tokenizer.with_pre_tokenizer(Whitespace::default());
+    tokenizer.with_pre_tokenizer(Whitespace {});
     // END quicktour_init_pretok

     // START quicktour_train
@@ -267,7 +267,7 @@ fn pipeline() -> tokenizers::Result<()> {
     use tokenizers::pre_tokenizers::whitespace::Whitespace;
     use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer};

-    let pre_tokenizer = Whitespace::default();
+    let pre_tokenizer = Whitespace {};
     let mut pre_tokenized = PreTokenizedString::from("Hello! How are you? I'm fine, thank you.");

     pre_tokenizer.pre_tokenize(&mut pre_tokenized)?;
@@ -304,7 +304,7 @@ fn pipeline() -> tokenizers::Result<()> {
     // START pipeline_combine_pre_tokenizer
     use tokenizers::pre_tokenizers::{digits::Digits, sequence::Sequence};

-    let pre_tokenizer = Sequence::new(vec![Whitespace::default().into(), Digits::new(true).into()]);
+    let pre_tokenizer = Sequence::new(vec![Whitespace {}.into(), Digits::new(true).into()]);
     let mut pre_tokenized = PreTokenizedString::from("Call 911!");

     pre_tokenizer.pre_tokenize(&mut pre_tokenized)?;
@@ -384,7 +384,7 @@ fn train_pipeline_bert() -> tokenizers::Result<()> {
     // START bert_setup_pre_tokenizer
     use tokenizers::pre_tokenizers::whitespace::Whitespace;

-    bert_tokenizer.with_pre_tokenizer(Whitespace::default());
+    bert_tokenizer.with_pre_tokenizer(Whitespace {});
     // END bert_setup_pre_tokenizer
     // START bert_setup_processor
     use tokenizers::processors::template::TemplateProcessing;

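For the documentation tests above, the new construction is used exactly like the old one. A short standalone sketch built from the same tokenizers APIs that already appear in these hunks (the input string is arbitrary, chosen for this sketch):

use tokenizers::pre_tokenizers::whitespace::Whitespace;
use tokenizers::{OffsetReferential, OffsetType, PreTokenizedString, PreTokenizer};

fn main() -> tokenizers::Result<()> {
    // Same construction as after the change: a plain struct literal.
    let pre_tokenizer = Whitespace {};

    let mut pre_tokenized = PreTokenizedString::from("Hello world!");
    pre_tokenizer.pre_tokenize(&mut pre_tokenized)?;

    // Print each split with its byte offsets in the original string.
    for (text, offsets, _) in pre_tokenized.get_splits(OffsetReferential::Original, OffsetType::Byte) {
        println!("{:?} at {:?}", text, offsets);
    }
    Ok(())
}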
@@ -138,7 +138,7 @@ fn pretoks() {
     let ser_wrapped = serde_json::to_string(&ch_wrapped).unwrap();
     assert_eq!(ser_wrapped, ch_ser);

-    let wsp = Whitespace::default();
+    let wsp = Whitespace {};
     let wsp_ser = serde_json::to_string(&wsp).unwrap();
     assert_eq!(wsp_ser, r#"{"type":"Whitespace"}"#);
     serde_json::from_str::<Whitespace>(&wsp_ser).unwrap();

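As the `pretoks` test above checks, the struct-literal form serializes exactly like the old `::default()` call sites did, since only the `type` tag is recorded. A minimal check along the same lines (assumes `serde_json` is available, as in the test; the expected JSON string is taken from the hunk):

use tokenizers::pre_tokenizers::whitespace::Whitespace;

fn main() {
    // The literal serializes to the same tagged-but-empty JSON object that the
    // pre-change `Whitespace::default()` call sites produced.
    let wsp = Whitespace {};
    let wsp_ser = serde_json::to_string(&wsp).unwrap();
    assert_eq!(wsp_ser, r#"{"type":"Whitespace"}"#);

    // Round-trip back from JSON to confirm deserialization is unaffected.
    let _roundtrip: Whitespace = serde_json::from_str(&wsp_ser).unwrap();
}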
@@ -45,7 +45,7 @@ fn bpe_continuing_subword_prefix_error() {
                 .build()
                 .unwrap(),
         )
-        .with_pre_tokenizer(Some(PreTokenizerWrapper::Whitespace(Whitespace::default())))
+        .with_pre_tokenizer(Some(PreTokenizerWrapper::Whitespace(Whitespace {})))
         .build()
         .unwrap();
     let mut trainer = tokenizer.get_model().get_trainer();