Require Send for all parts of the tokenizer (#222)

Anthony MOI
2020-04-08 13:35:06 -04:00
committed by GitHub
14 changed files with 36 additions and 39 deletions

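The change is mechanical but has a real effect: the component traits (Normalizer, PreTokenizer, Model, PostProcessor, Decoder) now declare `Send + Sync` as supertraits instead of requiring `+ Sync` to be repeated at every `Box<dyn ...>` and `Container<dyn ...>` site. Every implementor must now be thread-safe, and in exchange every boxed component, and thus the whole Tokenizer, can be sent to and shared between threads. A minimal sketch of the pattern, with toy names standing in for the real traits:

// Sketch only: `Decoder` here is a stand-in, not the crate's real trait.
trait Decoder: Send + Sync {
    fn decode(&self, tokens: Vec<String>) -> String;
}

// A toy implementor; it holds no thread-unsafe state, so it satisfies the bound.
struct WhitespaceJoin;

impl Decoder for WhitespaceJoin {
    fn decode(&self, tokens: Vec<String>) -> String {
        tokens.join(" ")
    }
}

// Compile-time check: the trait object itself is now Send + Sync,
// with no `+ Sync` needed at the use site.
fn assert_send_sync<T: Send + Sync + ?Sized>() {}

fn main() {
    assert_send_sync::<dyn Decoder>();

    // A boxed component can be moved to another thread.
    let decoder: Box<dyn Decoder> = Box::new(WhitespaceJoin);
    let handle = std::thread::spawn(move || decoder.decode(vec!["hello".into(), "world".into()]));
    println!("{}", handle.join().unwrap());
}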
View File

@@ -5,7 +5,7 @@ use neon::prelude::*;

 /// Decoder
 pub struct Decoder {
-    pub decoder: Container<dyn tk::tokenizer::Decoder + Sync>,
+    pub decoder: Container<dyn tk::tokenizer::Decoder>,
 }

 declare_types! {

View File

@@ -7,7 +7,7 @@ use std::path::Path;

 /// Model
 pub struct Model {
-    pub model: Container<dyn tk::tokenizer::Model + Sync>,
+    pub model: Container<dyn tk::tokenizer::Model>,
 }

 declare_types! {

View File

@@ -5,7 +5,7 @@ use neon::prelude::*;

 /// Normalizer
 pub struct Normalizer {
-    pub normalizer: Container<dyn tk::tokenizer::Normalizer + Sync>,
+    pub normalizer: Container<dyn tk::tokenizer::Normalizer>,
 }

 declare_types! {

View File

@@ -5,7 +5,7 @@ use neon::prelude::*;

 /// PreTokenizers
 pub struct PreTokenizer {
-    pub pretok: Container<dyn tk::tokenizer::PreTokenizer + Sync>,
+    pub pretok: Container<dyn tk::tokenizer::PreTokenizer>,
 }

 declare_types! {

View File

@@ -5,7 +5,7 @@ use neon::prelude::*;

 /// Processor
 pub struct Processor {
-    pub processor: Container<dyn tk::tokenizer::PostProcessor + Sync>,
+    pub processor: Container<dyn tk::tokenizer::PostProcessor>,
 }

 declare_types! {

View File

@@ -9,7 +9,7 @@ use tk::tokenizer::Result;
 #[pyclass(dict)]
 pub struct Decoder {
-    pub decoder: Container<dyn tk::tokenizer::Decoder + Sync>,
+    pub decoder: Container<dyn tk::tokenizer::Decoder>,
 }

 #[pymethods]
 impl Decoder {

View File

@@ -78,7 +78,7 @@ impl<'source> FromPyObject<'source> for EncodeInput {
 /// This class cannot be constructed directly. Please use one of the concrete models.
 #[pyclass]
 pub struct Model {
-    pub model: Container<dyn tk::tokenizer::Model + Sync>,
+    pub model: Container<dyn tk::tokenizer::Model>,
 }

 #[pymethods]

View File

@@ -7,7 +7,7 @@ use pyo3::types::*;

 #[pyclass(dict)]
 pub struct Normalizer {
-    pub normalizer: Container<dyn tk::tokenizer::Normalizer + Sync>,
+    pub normalizer: Container<dyn tk::tokenizer::Normalizer>,
 }

 #[pyclass(extends=Normalizer)]

View File

@@ -9,7 +9,7 @@ use tk::tokenizer::{Offsets, Result};
 #[pyclass(dict)]
 pub struct PreTokenizer {
-    pub pretok: Container<dyn tk::tokenizer::PreTokenizer + Sync>,
+    pub pretok: Container<dyn tk::tokenizer::PreTokenizer>,
 }

 #[pymethods]
 impl PreTokenizer {

View File

@@ -6,7 +6,7 @@ use pyo3::types::*;

 #[pyclass(dict)]
 pub struct PostProcessor {
-    pub processor: Container<dyn tk::tokenizer::PostProcessor + Sync>,
+    pub processor: Container<dyn tk::tokenizer::PostProcessor>,
 }

 #[pymethods]

View File

@@ -601,7 +601,7 @@ impl Trainer for BpeTrainer {
     fn train(
         &self,
         word_counts: HashMap<String, u32>,
-    ) -> Result<(Box<dyn Model + Sync>, Vec<AddedToken>)> {
+    ) -> Result<(Box<dyn Model>, Vec<AddedToken>)> {
         let (bpe, tokens) = self.train(word_counts)?;
         Ok((Box::new(bpe), tokens))
     }

View File

@@ -99,7 +99,7 @@ impl Trainer for WordPieceTrainer {
     fn train(
         &self,
         word_counts: HashMap<String, u32>,
-    ) -> Result<(Box<dyn Model + Sync>, Vec<AddedToken>)> {
+    ) -> Result<(Box<dyn Model>, Vec<AddedToken>)> {
         let (wp, tokens) = self.train(word_counts)?;
         Ok((Box::new(wp), tokens))
     }

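Both trainers pick up the same signature change: with `Send + Sync` implied by the `Model` supertrait, the returned box needs no extra bound. A hypothetical trainer illustrating the shape of the new signature (all names below are stand-ins, not the crate's types):

use std::collections::HashMap;

// Stand-ins for the crate's types, just to show the signature shape.
type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
struct AddedToken(String);

trait Model: Send + Sync {}

struct WordLevel; // toy model produced by "training"
impl Model for WordLevel {}

trait Trainer: Sync {
    // No `+ Sync` on the returned box: the supertrait already guarantees it.
    fn train(&self, words: HashMap<String, u32>) -> Result<(Box<dyn Model>, Vec<AddedToken>)>;
}

struct WordLevelTrainer;

impl Trainer for WordLevelTrainer {
    fn train(&self, _words: HashMap<String, u32>) -> Result<(Box<dyn Model>, Vec<AddedToken>)> {
        Ok((Box::new(WordLevel), vec![AddedToken("[UNK]".into())]))
    }
}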
View File

@@ -3,11 +3,11 @@ use crate::tokenizer::{NormalizedString, Normalizer, Result};
 /// Allows concatenating multiple other Normalizer as a Sequence.
 /// All the normalizers run in sequence in the given order against the same NormalizedString.
 pub struct Sequence {
-    normalizers: Vec<Box<dyn Normalizer + Sync>>,
+    normalizers: Vec<Box<dyn Normalizer>>,
 }

 impl Sequence {
-    pub fn new(normalizers: Vec<Box<dyn Normalizer + Sync>>) -> Self {
+    pub fn new(normalizers: Vec<Box<dyn Normalizer>>) -> Self {
         Self { normalizers }
     }
 }

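Since `Normalizer` now carries the `Send + Sync` bound itself, a `Sequence` of boxed normalizers is automatically thread-safe too. A self-contained sketch of that property (the trait and types below are simplified stand-ins, not the crate's code):

trait Normalizer: Send + Sync {
    fn normalize(&self, s: &mut String);
}

struct Lowercase;
impl Normalizer for Lowercase {
    fn normalize(&self, s: &mut String) {
        *s = s.to_lowercase();
    }
}

struct Trim;
impl Normalizer for Trim {
    fn normalize(&self, s: &mut String) {
        *s = s.trim().to_string();
    }
}

// Runs each inner normalizer in order against the same string.
struct Sequence {
    normalizers: Vec<Box<dyn Normalizer>>,
}

impl Normalizer for Sequence {
    fn normalize(&self, s: &mut String) {
        for n in &self.normalizers {
            n.normalize(s);
        }
    }
}

fn main() {
    let seq = Sequence {
        normalizers: vec![Box::new(Lowercase), Box::new(Trim)],
    };
    let mut text = String::from("  Hello World  ");
    // `seq` is built only from `Send + Sync` parts, so it is itself `Send + Sync`
    // and can be used from another thread.
    std::thread::scope(|scope| {
        scope.spawn(|| seq.normalize(&mut text));
    });
    println!("{text:?}");
}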
View File

@@ -31,7 +31,7 @@ pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
 pub type Offsets = (usize, usize);

 /// Takes care of pre-processing strings.
-pub trait Normalizer {
+pub trait Normalizer: Send + Sync {
     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
 }
@@ -40,12 +40,12 @@ pub trait Normalizer {
 /// `NormalizedString`. In some occasions, the `PreTokenizer` might need to modify the given
 /// `NormalizedString` to ensure we can entirely keep track of the offsets and the mapping with
 /// the original string.
-pub trait PreTokenizer {
+pub trait PreTokenizer: Send + Sync {
     fn pre_tokenize(&self, normalized: &mut NormalizedString) -> Result<Vec<(String, Offsets)>>;
 }

 /// Represents a model used during Tokenization (like BPE or Word or Unigram).
-pub trait Model {
+pub trait Model: Send + Sync {
     fn tokenize(&self, tokens: Vec<(String, Offsets)>) -> Result<Vec<Token>>;
     fn token_to_id(&self, token: &str) -> Option<u32>;
     fn id_to_token(&self, id: u32) -> Option<String>;
@@ -56,7 +56,7 @@ pub trait Model {

 /// A `PostProcessor` has the responsibility to post process an encoded output of the `Tokenizer`.
 /// It adds any special tokens that a language model would require.
-pub trait PostProcessor {
+pub trait PostProcessor: Send + Sync {
     /// Returns the number of tokens that will be added during the processing step
     fn added_tokens(&self, is_pair: bool) -> usize;
     /// Process both encodings and returns a new merged one
@@ -84,7 +84,7 @@ impl dyn PostProcessor {
 }

 /// A `Decoder` has the responsibility to merge the given `Vec<String>` in a `String`.
-pub trait Decoder {
+pub trait Decoder: Send + Sync {
     fn decode(&self, tokens: Vec<String>) -> Result<String>;
 }
@@ -95,10 +95,7 @@ pub trait Trainer: Sync {
     fn should_show_progress(&self) -> bool;
     /// The actual training method. This will return a new trained Model as well as a list
     /// of `special_tokens` to be added directly to the tokenizer along with the model.
-    fn train(
-        &self,
-        words: HashMap<String, u32>,
-    ) -> Result<(Box<dyn Model + Sync>, Vec<AddedToken>)>;
+    fn train(&self, words: HashMap<String, u32>) -> Result<(Box<dyn Model>, Vec<AddedToken>)>;
     /// Process a bunch of token, counting them as relevant.
     fn process_tokens(&self, words: &mut HashMap<String, u32>, tokens: Vec<String>);
 }
@@ -226,11 +223,11 @@ impl std::cmp::Eq for AddedToken {}
 /// A `Tokenizer` is capable of encoding/decoding any text.
 pub struct Tokenizer {
     // Tokenizer parts
-    normalizer: Option<Box<dyn Normalizer + Sync>>,
-    pre_tokenizer: Option<Box<dyn PreTokenizer + Sync>>,
-    model: Box<dyn Model + Sync>,
-    post_processor: Option<Box<dyn PostProcessor + Sync>>,
-    decoder: Option<Box<dyn Decoder + Sync>>,
+    normalizer: Option<Box<dyn Normalizer>>,
+    pre_tokenizer: Option<Box<dyn PreTokenizer>>,
+    model: Box<dyn Model>,
+    post_processor: Option<Box<dyn PostProcessor>>,
+    decoder: Option<Box<dyn Decoder>>,

     // Added Vocabulary capabilities
     /// Contains the mapping from String to ID as the user intended it. This map
@@ -256,7 +253,7 @@ pub struct Tokenizer {

 impl Tokenizer {
     /// Instanciate a new Tokenizer, with the given Model
-    pub fn new(model: Box<dyn Model + Sync>) -> Self {
+    pub fn new(model: Box<dyn Model>) -> Self {
         Tokenizer {
             normalizer: None,
             pre_tokenizer: None,
@@ -277,62 +274,62 @@ impl Tokenizer {
     }

     /// Set the normalizer
-    pub fn with_normalizer(&mut self, normalizer: Box<dyn Normalizer + Sync>) -> &Self {
+    pub fn with_normalizer(&mut self, normalizer: Box<dyn Normalizer>) -> &Self {
         self.normalizer = Some(normalizer);
         self
     }

     /// Get the normalizer
     #[allow(clippy::borrowed_box)]
-    pub fn get_normalizer(&self) -> Option<&Box<dyn Normalizer + Sync>> {
+    pub fn get_normalizer(&self) -> Option<&Box<dyn Normalizer>> {
         self.normalizer.as_ref()
     }

     /// Set the pre tokenizer
-    pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Box<dyn PreTokenizer + Sync>) -> &Self {
+    pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Box<dyn PreTokenizer>) -> &Self {
         self.pre_tokenizer = Some(pre_tokenizer);
         self
     }

     /// Get the pre tokenizer
     #[allow(clippy::borrowed_box)]
-    pub fn get_pre_tokenizer(&self) -> Option<&Box<dyn PreTokenizer + Sync>> {
+    pub fn get_pre_tokenizer(&self) -> Option<&Box<dyn PreTokenizer>> {
         self.pre_tokenizer.as_ref()
     }

     /// Set the post processor
-    pub fn with_post_processor(&mut self, post_processor: Box<dyn PostProcessor + Sync>) -> &Self {
+    pub fn with_post_processor(&mut self, post_processor: Box<dyn PostProcessor>) -> &Self {
         self.post_processor = Some(post_processor);
         self
     }

     /// Get the post processor
     #[allow(clippy::borrowed_box)]
-    pub fn get_post_processor(&self) -> Option<&Box<dyn PostProcessor + Sync>> {
+    pub fn get_post_processor(&self) -> Option<&Box<dyn PostProcessor>> {
         self.post_processor.as_ref()
     }

     /// Set the decoder
-    pub fn with_decoder(&mut self, decoder: Box<dyn Decoder + Sync>) -> &Self {
+    pub fn with_decoder(&mut self, decoder: Box<dyn Decoder>) -> &Self {
         self.decoder = Some(decoder);
         self
     }

     /// Get the decoder
     #[allow(clippy::borrowed_box)]
-    pub fn get_decoder(&self) -> Option<&Box<dyn Decoder + Sync>> {
+    pub fn get_decoder(&self) -> Option<&Box<dyn Decoder>> {
         self.decoder.as_ref()
     }

     /// Set the model
-    pub fn with_model(&mut self, model: Box<dyn Model + Sync>) -> &Self {
+    pub fn with_model(&mut self, model: Box<dyn Model>) -> &Self {
         self.model = model;
         self
     }

     /// Get the model
     #[allow(clippy::borrowed_box)]
-    pub fn get_model(&self) -> &Box<dyn Model + Sync> {
+    pub fn get_model(&self) -> &Box<dyn Model> {
         &self.model
     }