diff --git a/bindings/node/native/src/decoders.rs b/bindings/node/native/src/decoders.rs
index 53d36a5b..bc65c37b 100644
--- a/bindings/node/native/src/decoders.rs
+++ b/bindings/node/native/src/decoders.rs
@@ -5,7 +5,7 @@ use neon::prelude::*;
 
 /// Decoder
 pub struct Decoder {
-    pub decoder: Container<dyn tk::tokenizer::Decoder + Sync>,
+    pub decoder: Container<dyn tk::tokenizer::Decoder>,
 }
 
 declare_types! {
diff --git a/bindings/node/native/src/models.rs b/bindings/node/native/src/models.rs
index d03bab26..0800b4e6 100644
--- a/bindings/node/native/src/models.rs
+++ b/bindings/node/native/src/models.rs
@@ -7,7 +7,7 @@ use std::path::Path;
 
 /// Model
 pub struct Model {
-    pub model: Container<dyn tk::tokenizer::Model + Sync>,
+    pub model: Container<dyn tk::tokenizer::Model>,
 }
 
 declare_types! {
diff --git a/bindings/node/native/src/normalizers.rs b/bindings/node/native/src/normalizers.rs
index db3f4dbd..55f4afaf 100644
--- a/bindings/node/native/src/normalizers.rs
+++ b/bindings/node/native/src/normalizers.rs
@@ -5,7 +5,7 @@ use neon::prelude::*;
 
 /// Normalizer
 pub struct Normalizer {
-    pub normalizer: Container<dyn tk::tokenizer::Normalizer + Sync>,
+    pub normalizer: Container<dyn tk::tokenizer::Normalizer>,
 }
 
 declare_types! {
diff --git a/bindings/node/native/src/pre_tokenizers.rs b/bindings/node/native/src/pre_tokenizers.rs
index 54c2e52a..6a8ba252 100644
--- a/bindings/node/native/src/pre_tokenizers.rs
+++ b/bindings/node/native/src/pre_tokenizers.rs
@@ -5,7 +5,7 @@ use neon::prelude::*;
 
 /// PreTokenizers
 pub struct PreTokenizer {
-    pub pretok: Container<dyn tk::tokenizer::PreTokenizer + Sync>,
+    pub pretok: Container<dyn tk::tokenizer::PreTokenizer>,
 }
 
 declare_types! {
diff --git a/bindings/node/native/src/processors.rs b/bindings/node/native/src/processors.rs
index 07db9ff8..7056102b 100644
--- a/bindings/node/native/src/processors.rs
+++ b/bindings/node/native/src/processors.rs
@@ -5,7 +5,7 @@ use neon::prelude::*;
 
 /// Processor
 pub struct Processor {
-    pub processor: Container<dyn tk::tokenizer::PostProcessor + Sync>,
+    pub processor: Container<dyn tk::tokenizer::PostProcessor>,
 }
 
 declare_types! {
diff --git a/bindings/python/src/decoders.rs b/bindings/python/src/decoders.rs
index 215aaff4..82208926 100644
--- a/bindings/python/src/decoders.rs
+++ b/bindings/python/src/decoders.rs
@@ -9,7 +9,7 @@ use tk::tokenizer::Result;
 
 #[pyclass(dict)]
 pub struct Decoder {
-    pub decoder: Container<dyn tk::tokenizer::Decoder + Sync>,
+    pub decoder: Container<dyn tk::tokenizer::Decoder>,
 }
 
 #[pymethods]
 impl Decoder {
diff --git a/bindings/python/src/models.rs b/bindings/python/src/models.rs
index 53c1b5a0..7beb8b59 100644
--- a/bindings/python/src/models.rs
+++ b/bindings/python/src/models.rs
@@ -78,7 +78,7 @@ impl<'source> FromPyObject<'source> for EncodeInput {
 /// This class cannot be constructed directly. Please use one of the concrete models.
 #[pyclass]
 pub struct Model {
-    pub model: Container<dyn tk::tokenizer::Model + Sync>,
+    pub model: Container<dyn tk::tokenizer::Model>,
 }
 
 #[pymethods]
diff --git a/bindings/python/src/normalizers.rs b/bindings/python/src/normalizers.rs
index fe3c6e6c..723bb407 100644
--- a/bindings/python/src/normalizers.rs
+++ b/bindings/python/src/normalizers.rs
@@ -7,7 +7,7 @@ use pyo3::types::*;
 
 #[pyclass(dict)]
 pub struct Normalizer {
-    pub normalizer: Container<dyn tk::tokenizer::Normalizer + Sync>,
+    pub normalizer: Container<dyn tk::tokenizer::Normalizer>,
 }
 
 #[pyclass(extends=Normalizer)]
diff --git a/bindings/python/src/pre_tokenizers.rs b/bindings/python/src/pre_tokenizers.rs
index af48b0b1..14250065 100644
--- a/bindings/python/src/pre_tokenizers.rs
+++ b/bindings/python/src/pre_tokenizers.rs
@@ -9,7 +9,7 @@ use tk::tokenizer::{Offsets, Result};
 
 #[pyclass(dict)]
 pub struct PreTokenizer {
-    pub pretok: Container<dyn tk::tokenizer::PreTokenizer + Sync>,
+    pub pretok: Container<dyn tk::tokenizer::PreTokenizer>,
 }
 
 #[pymethods]
 impl PreTokenizer {
diff --git a/bindings/python/src/processors.rs b/bindings/python/src/processors.rs
index 4ef1ab78..0d533f15 100644
--- a/bindings/python/src/processors.rs
+++ b/bindings/python/src/processors.rs
@@ -6,7 +6,7 @@ use pyo3::types::*;
 
 #[pyclass(dict)]
 pub struct PostProcessor {
-    pub processor: Container<dyn tk::tokenizer::PostProcessor + Sync>,
+    pub processor: Container<dyn tk::tokenizer::PostProcessor>,
 }
 
 #[pymethods]
diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs
index 4b942a0e..9e014c31 100644
--- a/tokenizers/src/models/bpe/trainer.rs
+++ b/tokenizers/src/models/bpe/trainer.rs
@@ -601,7 +601,7 @@ impl Trainer for BpeTrainer {
     fn train(
         &self,
         word_counts: HashMap<String, u32>,
-    ) -> Result<(Box<dyn Model + Sync>, Vec<AddedToken>)> {
+    ) -> Result<(Box<dyn Model>, Vec<AddedToken>)> {
         let (bpe, tokens) = self.train(word_counts)?;
         Ok((Box::new(bpe), tokens))
     }
diff --git a/tokenizers/src/models/wordpiece/trainer.rs b/tokenizers/src/models/wordpiece/trainer.rs
index 7363ac8e..a22b2e6d 100644
--- a/tokenizers/src/models/wordpiece/trainer.rs
+++ b/tokenizers/src/models/wordpiece/trainer.rs
@@ -99,7 +99,7 @@ impl Trainer for WordPieceTrainer {
     fn train(
         &self,
         word_counts: HashMap<String, u32>,
-    ) -> Result<(Box<dyn Model + Sync>, Vec<AddedToken>)> {
+    ) -> Result<(Box<dyn Model>, Vec<AddedToken>)> {
         let (wp, tokens) = self.train(word_counts)?;
         Ok((Box::new(wp), tokens))
     }
diff --git a/tokenizers/src/normalizers/utils.rs b/tokenizers/src/normalizers/utils.rs
index b2f16f94..85e5c83c 100644
--- a/tokenizers/src/normalizers/utils.rs
+++ b/tokenizers/src/normalizers/utils.rs
@@ -3,11 +3,11 @@ use crate::tokenizer::{NormalizedString, Normalizer, Result};
 /// Allows concatenating multiple other Normalizer as a Sequence.
 /// All the normalizers run in sequence in the given order against the same NormalizedString.
 pub struct Sequence {
-    normalizers: Vec<Box<dyn Normalizer + Sync>>,
+    normalizers: Vec<Box<dyn Normalizer>>,
 }
 
 impl Sequence {
-    pub fn new(normalizers: Vec<Box<dyn Normalizer + Sync>>) -> Self {
+    pub fn new(normalizers: Vec<Box<dyn Normalizer>>) -> Self {
         Self { normalizers }
     }
 }
diff --git a/tokenizers/src/tokenizer/mod.rs b/tokenizers/src/tokenizer/mod.rs
index d3a6140f..69758984 100644
--- a/tokenizers/src/tokenizer/mod.rs
+++ b/tokenizers/src/tokenizer/mod.rs
@@ -31,7 +31,7 @@ pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
 
 /// Takes care of pre-processing strings.
-pub trait Normalizer {
+pub trait Normalizer: Send + Sync {
     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
 }
 
@@ -40,12 +40,12 @@ pub trait Normalizer {
 /// `NormalizedString`. In some occasions, the `PreTokenizer` might need to modify the given
 /// `NormalizedString` to ensure we can entirely keep track of the offsets and the mapping with
 /// the original string.
-pub trait PreTokenizer {
+pub trait PreTokenizer: Send + Sync {
     fn pre_tokenize(&self, normalized: &mut NormalizedString) -> Result<Vec<(String, Offsets)>>;
 }
 
 /// Represents a model used during Tokenization (like BPE or Word or Unigram).
-pub trait Model {
+pub trait Model: Send + Sync {
     fn tokenize(&self, tokens: Vec<(String, Offsets)>) -> Result<Vec<Token>>;
     fn token_to_id(&self, token: &str) -> Option<u32>;
     fn id_to_token(&self, id: u32) -> Option<String>;
@@ -56,7 +56,7 @@ pub trait Model {
 
 /// A `PostProcessor` has the responsibility to post process an encoded output of the `Tokenizer`.
 /// It adds any special tokens that a language model would require.
-pub trait PostProcessor {
+pub trait PostProcessor: Send + Sync {
     /// Returns the number of tokens that will be added during the processing step
     fn added_tokens(&self, is_pair: bool) -> usize;
     /// Process both encodings and returns a new merged one
@@ -84,7 +84,7 @@ impl dyn PostProcessor {
 }
 
 /// A `Decoder` has the responsibility to merge the given `Vec<String>` in a `String`.
-pub trait Decoder {
+pub trait Decoder: Send + Sync {
     fn decode(&self, tokens: Vec<String>) -> Result<String>;
 }
 
@@ -95,10 +95,7 @@ pub trait Trainer: Sync {
     fn should_show_progress(&self) -> bool;
     /// The actual training method. This will return a new trained Model as well as a list
    /// of `special_tokens` to be added directly to the tokenizer along with the model.
-    fn train(
-        &self,
-        words: HashMap<String, u32>,
-    ) -> Result<(Box<dyn Model + Sync>, Vec<AddedToken>)>;
+    fn train(&self, words: HashMap<String, u32>) -> Result<(Box<dyn Model>, Vec<AddedToken>)>;
     /// Process a bunch of token, counting them as relevant.
     fn process_tokens(&self, words: &mut HashMap<String, u32>, tokens: Vec<String>);
 }
@@ -226,11 +223,11 @@ impl std::cmp::Eq for AddedToken {}
 /// A `Tokenizer` is capable of encoding/decoding any text.
 pub struct Tokenizer {
     // Tokenizer parts
-    normalizer: Option<Box<dyn Normalizer + Sync>>,
-    pre_tokenizer: Option<Box<dyn PreTokenizer + Sync>>,
-    model: Box<dyn Model + Sync>,
-    post_processor: Option<Box<dyn PostProcessor + Sync>>,
-    decoder: Option<Box<dyn Decoder + Sync>>,
+    normalizer: Option<Box<dyn Normalizer>>,
+    pre_tokenizer: Option<Box<dyn PreTokenizer>>,
+    model: Box<dyn Model>,
+    post_processor: Option<Box<dyn PostProcessor>>,
+    decoder: Option<Box<dyn Decoder>>,
 
     // Added Vocabulary capabilities
     /// Contains the mapping from String to ID as the user intended it. This map
@@ -256,7 +253,7 @@ pub struct Tokenizer {
 
 impl Tokenizer {
     /// Instanciate a new Tokenizer, with the given Model
-    pub fn new(model: Box<dyn Model + Sync>) -> Self {
+    pub fn new(model: Box<dyn Model>) -> Self {
         Tokenizer {
             normalizer: None,
             pre_tokenizer: None,
@@ -277,62 +274,62 @@ impl Tokenizer {
     }
 
     /// Set the normalizer
-    pub fn with_normalizer(&mut self, normalizer: Box<dyn Normalizer + Sync>) -> &Self {
+    pub fn with_normalizer(&mut self, normalizer: Box<dyn Normalizer>) -> &Self {
         self.normalizer = Some(normalizer);
         self
     }
 
     /// Get the normalizer
     #[allow(clippy::borrowed_box)]
-    pub fn get_normalizer(&self) -> Option<&Box<dyn Normalizer + Sync>> {
+    pub fn get_normalizer(&self) -> Option<&Box<dyn Normalizer>> {
         self.normalizer.as_ref()
     }
 
     /// Set the pre tokenizer
-    pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Box<dyn PreTokenizer + Sync>) -> &Self {
+    pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Box<dyn PreTokenizer>) -> &Self {
         self.pre_tokenizer = Some(pre_tokenizer);
         self
     }
 
     /// Get the pre tokenizer
     #[allow(clippy::borrowed_box)]
-    pub fn get_pre_tokenizer(&self) -> Option<&Box<dyn PreTokenizer + Sync>> {
+    pub fn get_pre_tokenizer(&self) -> Option<&Box<dyn PreTokenizer>> {
         self.pre_tokenizer.as_ref()
     }
 
     /// Set the post processor
-    pub fn with_post_processor(&mut self, post_processor: Box<dyn PostProcessor + Sync>) -> &Self {
+    pub fn with_post_processor(&mut self, post_processor: Box<dyn PostProcessor>) -> &Self {
         self.post_processor = Some(post_processor);
         self
     }
 
     /// Get the post processor
     #[allow(clippy::borrowed_box)]
-    pub fn get_post_processor(&self) -> Option<&Box<dyn PostProcessor + Sync>> {
+    pub fn get_post_processor(&self) -> Option<&Box<dyn PostProcessor>> {
         self.post_processor.as_ref()
     }
 
     /// Set the decoder
-    pub fn with_decoder(&mut self, decoder: Box<dyn Decoder + Sync>) -> &Self {
+    pub fn with_decoder(&mut self, decoder: Box<dyn Decoder>) -> &Self {
         self.decoder = Some(decoder);
         self
     }
 
     /// Get the decoder
     #[allow(clippy::borrowed_box)]
-    pub fn get_decoder(&self) -> Option<&Box<dyn Decoder + Sync>> {
+    pub fn get_decoder(&self) -> Option<&Box<dyn Decoder>> {
         self.decoder.as_ref()
     }
 
     /// Set the model
-    pub fn with_model(&mut self, model: Box<dyn Model + Sync>) -> &Self {
+    pub fn with_model(&mut self, model: Box<dyn Model>) -> &Self {
         self.model = model;
         self
     }
 
     /// Get the model
     #[allow(clippy::borrowed_box)]
-    pub fn get_model(&self) -> &Box<dyn Model + Sync> {
+    pub fn get_model(&self) -> &Box<dyn Model> {
         &self.model
     }
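
What the patch buys, as a minimal self-contained sketch: once `Send + Sync` are supertraits (as done above for `Normalizer`, `PreTokenizer`, `Model`, `PostProcessor`, and `Decoder`), every `Box<dyn Trait>` is thread-safe by construction, so the `+ Sync` bounds scattered across the bindings and the `Tokenizer` struct can simply be dropped. The `WhitespaceDecoder` below is a hypothetical stand-in for illustration, not a type from the tokenizers crate:

use std::sync::Arc;
use std::thread;

// Stand-in for the crate's `Decoder` trait. The `Send + Sync` supertraits are
// the change made by this patch: they make every `dyn Decoder` trait object
// thread-safe by construction.
trait Decoder: Send + Sync {
    fn decode(&self, tokens: Vec<String>) -> String;
}

// Hypothetical implementation, for the example only.
struct WhitespaceDecoder;

impl Decoder for WhitespaceDecoder {
    fn decode(&self, tokens: Vec<String>) -> String {
        tokens.join(" ")
    }
}

fn main() {
    // `Box<dyn Decoder>` is already Send + Sync thanks to the supertraits,
    // so the box can be shared across threads through an `Arc` as-is.
    let decoder: Arc<Box<dyn Decoder>> = Arc::new(Box::new(WhitespaceDecoder));

    let handles: Vec<_> = (0..4)
        .map(|i| {
            let decoder = Arc::clone(&decoder);
            thread::spawn(move || {
                decoder.decode(vec![format!("thread-{}", i), "done".to_string()])
            })
        })
        .collect();

    for handle in handles {
        println!("{}", handle.join().unwrap());
    }
}

Without the supertraits, the `Arc<Box<dyn Decoder>>` above would only compile spelled as `Arc<Box<dyn Decoder + Send + Sync>>`, which is exactly the kind of annotation this change removes from the bindings' `Container` types and from the `Tokenizer` struct fields.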