mirror of https://github.com/mii443/tokenizers.git (synced 2025-09-02 15:29:21 +00:00)
Require Send for all parts of the tokenizer (#222)
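
The change below swaps the per-use-site `+ Sync` bounds for `Send + Sync` supertraits on the core traits (`Normalizer`, `PreTokenizer`, `Model`, `PostProcessor`, `Decoder`): every trait-object type such as `Box<dyn Model + Sync>` becomes a plain `Box<dyn Model>`, and thread-safety is enforced once, at the trait definition. A minimal sketch of the pattern (simplified signatures, not the crate's full API):

// Before: `pub trait Decoder { ... }` forced `Box<dyn Decoder + Sync>` at
// every use site, and nothing guaranteed `Send`.
// After: the bounds are supertraits, so every implementor must be
// thread-safe and `Box<dyn Decoder>` is `Send + Sync` on its own.
trait Decoder: Send + Sync {
    fn decode(&self, tokens: Vec<String>) -> String;
}

struct Whitespace;

impl Decoder for Whitespace {
    fn decode(&self, tokens: Vec<String>) -> String {
        tokens.join(" ")
    }
}

// A function that moves the trait object to another thread now compiles
// without spelling out any extra bounds at the call site.
fn spawn_decode(
    decoder: Box<dyn Decoder>,
    tokens: Vec<String>,
) -> std::thread::JoinHandle<String> {
    std::thread::spawn(move || decoder.decode(tokens))
}

fn main() {
    let handle = spawn_decode(Box::new(Whitespace), vec!["Hello".into(), "world".into()]);
    println!("{}", handle.join().unwrap()); // "Hello world"
}

Requiring the bounds on the trait pushes the obligation onto implementors, which is why the Node and Python bindings below can drop the repeated annotations.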
@@ -5,7 +5,7 @@ use neon::prelude::*;
 /// Decoder
 pub struct Decoder {
-    pub decoder: Container<dyn tk::tokenizer::Decoder + Sync>,
+    pub decoder: Container<dyn tk::tokenizer::Decoder>,
 }

 declare_types! {
@@ -7,7 +7,7 @@ use std::path::Path;
 /// Model
 pub struct Model {
-    pub model: Container<dyn tk::tokenizer::Model + Sync>,
+    pub model: Container<dyn tk::tokenizer::Model>,
 }

 declare_types! {
@@ -5,7 +5,7 @@ use neon::prelude::*;
 /// Normalizer
 pub struct Normalizer {
-    pub normalizer: Container<dyn tk::tokenizer::Normalizer + Sync>,
+    pub normalizer: Container<dyn tk::tokenizer::Normalizer>,
 }

 declare_types! {
@@ -5,7 +5,7 @@ use neon::prelude::*;
 /// PreTokenizers
 pub struct PreTokenizer {
-    pub pretok: Container<dyn tk::tokenizer::PreTokenizer + Sync>,
+    pub pretok: Container<dyn tk::tokenizer::PreTokenizer>,
 }

 declare_types! {
@@ -5,7 +5,7 @@ use neon::prelude::*;
 /// Processor
 pub struct Processor {
-    pub processor: Container<dyn tk::tokenizer::PostProcessor + Sync>,
+    pub processor: Container<dyn tk::tokenizer::PostProcessor>,
 }

 declare_types! {
@@ -9,7 +9,7 @@ use tk::tokenizer::Result;
 #[pyclass(dict)]
 pub struct Decoder {
-    pub decoder: Container<dyn tk::tokenizer::Decoder + Sync>,
+    pub decoder: Container<dyn tk::tokenizer::Decoder>,
 }
 #[pymethods]
 impl Decoder {
@@ -78,7 +78,7 @@ impl<'source> FromPyObject<'source> for EncodeInput {
 /// This class cannot be constructed directly. Please use one of the concrete models.
 #[pyclass]
 pub struct Model {
-    pub model: Container<dyn tk::tokenizer::Model + Sync>,
+    pub model: Container<dyn tk::tokenizer::Model>,
 }

 #[pymethods]
@@ -7,7 +7,7 @@ use pyo3::types::*;
 #[pyclass(dict)]
 pub struct Normalizer {
-    pub normalizer: Container<dyn tk::tokenizer::Normalizer + Sync>,
+    pub normalizer: Container<dyn tk::tokenizer::Normalizer>,
 }

 #[pyclass(extends=Normalizer)]
@@ -9,7 +9,7 @@ use tk::tokenizer::{Offsets, Result};
 #[pyclass(dict)]
 pub struct PreTokenizer {
-    pub pretok: Container<dyn tk::tokenizer::PreTokenizer + Sync>,
+    pub pretok: Container<dyn tk::tokenizer::PreTokenizer>,
 }
 #[pymethods]
 impl PreTokenizer {
@@ -6,7 +6,7 @@ use pyo3::types::*;
 #[pyclass(dict)]
 pub struct PostProcessor {
-    pub processor: Container<dyn tk::tokenizer::PostProcessor + Sync>,
+    pub processor: Container<dyn tk::tokenizer::PostProcessor>,
 }

 #[pymethods]
@@ -601,7 +601,7 @@ impl Trainer for BpeTrainer {
     fn train(
         &self,
         word_counts: HashMap<String, u32>,
-    ) -> Result<(Box<dyn Model + Sync>, Vec<AddedToken>)> {
+    ) -> Result<(Box<dyn Model>, Vec<AddedToken>)> {
         let (bpe, tokens) = self.train(word_counts)?;
         Ok((Box::new(bpe), tokens))
     }
@@ -99,7 +99,7 @@ impl Trainer for WordPieceTrainer {
     fn train(
         &self,
         word_counts: HashMap<String, u32>,
-    ) -> Result<(Box<dyn Model + Sync>, Vec<AddedToken>)> {
+    ) -> Result<(Box<dyn Model>, Vec<AddedToken>)> {
         let (wp, tokens) = self.train(word_counts)?;
         Ok((Box::new(wp), tokens))
     }
@@ -3,11 +3,11 @@ use crate::tokenizer::{NormalizedString, Normalizer, Result};
 /// Allows concatenating multiple other Normalizers as a Sequence.
 /// All the normalizers run in sequence in the given order against the same NormalizedString.
 pub struct Sequence {
-    normalizers: Vec<Box<dyn Normalizer + Sync>>,
+    normalizers: Vec<Box<dyn Normalizer>>,
 }

 impl Sequence {
-    pub fn new(normalizers: Vec<Box<dyn Normalizer + Sync>>) -> Self {
+    pub fn new(normalizers: Vec<Box<dyn Normalizer>>) -> Self {
         Self { normalizers }
     }
 }
@@ -31,7 +31,7 @@ pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
 pub type Offsets = (usize, usize);

 /// Takes care of pre-processing strings.
-pub trait Normalizer {
+pub trait Normalizer: Send + Sync {
     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
 }

@@ -40,12 +40,12 @@ pub trait Normalizer {
 /// `NormalizedString`. In some occasions, the `PreTokenizer` might need to modify the given
 /// `NormalizedString` to ensure we can entirely keep track of the offsets and the mapping with
 /// the original string.
-pub trait PreTokenizer {
+pub trait PreTokenizer: Send + Sync {
     fn pre_tokenize(&self, normalized: &mut NormalizedString) -> Result<Vec<(String, Offsets)>>;
 }

 /// Represents a model used during Tokenization (like BPE or Word or Unigram).
-pub trait Model {
+pub trait Model: Send + Sync {
     fn tokenize(&self, tokens: Vec<(String, Offsets)>) -> Result<Vec<Token>>;
     fn token_to_id(&self, token: &str) -> Option<u32>;
     fn id_to_token(&self, id: u32) -> Option<String>;
@@ -56,7 +56,7 @@ pub trait Model {

 /// A `PostProcessor` has the responsibility to post process an encoded output of the `Tokenizer`.
 /// It adds any special tokens that a language model would require.
-pub trait PostProcessor {
+pub trait PostProcessor: Send + Sync {
     /// Returns the number of tokens that will be added during the processing step
     fn added_tokens(&self, is_pair: bool) -> usize;
     /// Process both encodings and returns a new merged one
@@ -84,7 +84,7 @@ impl dyn PostProcessor {
 }

 /// A `Decoder` has the responsibility to merge the given `Vec<String>` in a `String`.
-pub trait Decoder {
+pub trait Decoder: Send + Sync {
     fn decode(&self, tokens: Vec<String>) -> Result<String>;
 }

@@ -95,10 +95,7 @@ pub trait Trainer: Sync {
     fn should_show_progress(&self) -> bool;
     /// The actual training method. This will return a new trained Model as well as a list
     /// of `special_tokens` to be added directly to the tokenizer along with the model.
-    fn train(
-        &self,
-        words: HashMap<String, u32>,
-    ) -> Result<(Box<dyn Model + Sync>, Vec<AddedToken>)>;
+    fn train(&self, words: HashMap<String, u32>) -> Result<(Box<dyn Model>, Vec<AddedToken>)>;
     /// Process a bunch of tokens, counting them as relevant.
     fn process_tokens(&self, words: &mut HashMap<String, u32>, tokens: Vec<String>);
 }
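
The flattened `train` signature now returns a plain `Box<dyn Model>` with no extra bound. A toy trainer against simplified versions of these signatures (hypothetical `VocabModel` and `ToyTrainer`; `AddedToken` reduced to a `String`):

use std::collections::HashMap;

type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;

trait Model: Send + Sync {
    fn token_to_id(&self, token: &str) -> Option<u32>;
}

trait Trainer: Sync {
    // Same shape as the flattened signature in the hunk above.
    fn train(&self, words: HashMap<String, u32>) -> Result<(Box<dyn Model>, Vec<String>)>;
}

struct VocabModel {
    vocab: HashMap<String, u32>,
}

impl Model for VocabModel {
    fn token_to_id(&self, token: &str) -> Option<u32> {
        self.vocab.get(token).copied()
    }
}

struct ToyTrainer;

impl Trainer for ToyTrainer {
    fn train(&self, words: HashMap<String, u32>) -> Result<(Box<dyn Model>, Vec<String>)> {
        // Assign ids in frequency order; no `+ Sync` needed on the boxed return
        // because `Model` itself carries the bounds.
        let mut by_freq: Vec<_> = words.into_iter().collect();
        by_freq.sort_by(|a, b| b.1.cmp(&a.1).then(a.0.cmp(&b.0)));
        let vocab = by_freq
            .into_iter()
            .enumerate()
            .map(|(id, (word, _))| (word, id as u32))
            .collect();
        Ok((Box::new(VocabModel { vocab }), vec!["[UNK]".to_string()]))
    }
}

fn main() -> Result<()> {
    let mut counts = HashMap::new();
    counts.insert("hello".to_string(), 3);
    counts.insert("world".to_string(), 1);
    let (model, special) = ToyTrainer.train(counts)?;
    assert_eq!(model.token_to_id("hello"), Some(0));
    assert_eq!(special, vec!["[UNK]".to_string()]);
    Ok(())
}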
@@ -226,11 +223,11 @@ impl std::cmp::Eq for AddedToken {}
 /// A `Tokenizer` is capable of encoding/decoding any text.
 pub struct Tokenizer {
     // Tokenizer parts
-    normalizer: Option<Box<dyn Normalizer + Sync>>,
-    pre_tokenizer: Option<Box<dyn PreTokenizer + Sync>>,
-    model: Box<dyn Model + Sync>,
-    post_processor: Option<Box<dyn PostProcessor + Sync>>,
-    decoder: Option<Box<dyn Decoder + Sync>>,
+    normalizer: Option<Box<dyn Normalizer>>,
+    pre_tokenizer: Option<Box<dyn PreTokenizer>>,
+    model: Box<dyn Model>,
+    post_processor: Option<Box<dyn PostProcessor>>,
+    decoder: Option<Box<dyn Decoder>>,

     // Added Vocabulary capabilities
     /// Contains the mapping from String to ID as the user intended it. This map
@@ -256,7 +253,7 @@ pub struct Tokenizer {

 impl Tokenizer {
     /// Instantiate a new Tokenizer, with the given Model
-    pub fn new(model: Box<dyn Model + Sync>) -> Self {
+    pub fn new(model: Box<dyn Model>) -> Self {
         Tokenizer {
             normalizer: None,
             pre_tokenizer: None,
@@ -277,62 +274,62 @@ impl Tokenizer {
     }

     /// Set the normalizer
-    pub fn with_normalizer(&mut self, normalizer: Box<dyn Normalizer + Sync>) -> &Self {
+    pub fn with_normalizer(&mut self, normalizer: Box<dyn Normalizer>) -> &Self {
         self.normalizer = Some(normalizer);
         self
     }

     /// Get the normalizer
     #[allow(clippy::borrowed_box)]
-    pub fn get_normalizer(&self) -> Option<&Box<dyn Normalizer + Sync>> {
+    pub fn get_normalizer(&self) -> Option<&Box<dyn Normalizer>> {
         self.normalizer.as_ref()
     }

     /// Set the pre tokenizer
-    pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Box<dyn PreTokenizer + Sync>) -> &Self {
+    pub fn with_pre_tokenizer(&mut self, pre_tokenizer: Box<dyn PreTokenizer>) -> &Self {
         self.pre_tokenizer = Some(pre_tokenizer);
         self
     }

     /// Get the pre tokenizer
     #[allow(clippy::borrowed_box)]
-    pub fn get_pre_tokenizer(&self) -> Option<&Box<dyn PreTokenizer + Sync>> {
+    pub fn get_pre_tokenizer(&self) -> Option<&Box<dyn PreTokenizer>> {
         self.pre_tokenizer.as_ref()
     }

     /// Set the post processor
-    pub fn with_post_processor(&mut self, post_processor: Box<dyn PostProcessor + Sync>) -> &Self {
+    pub fn with_post_processor(&mut self, post_processor: Box<dyn PostProcessor>) -> &Self {
         self.post_processor = Some(post_processor);
         self
     }

     /// Get the post processor
     #[allow(clippy::borrowed_box)]
-    pub fn get_post_processor(&self) -> Option<&Box<dyn PostProcessor + Sync>> {
+    pub fn get_post_processor(&self) -> Option<&Box<dyn PostProcessor>> {
         self.post_processor.as_ref()
     }

     /// Set the decoder
-    pub fn with_decoder(&mut self, decoder: Box<dyn Decoder + Sync>) -> &Self {
+    pub fn with_decoder(&mut self, decoder: Box<dyn Decoder>) -> &Self {
         self.decoder = Some(decoder);
         self
     }

     /// Get the decoder
     #[allow(clippy::borrowed_box)]
-    pub fn get_decoder(&self) -> Option<&Box<dyn Decoder + Sync>> {
+    pub fn get_decoder(&self) -> Option<&Box<dyn Decoder>> {
         self.decoder.as_ref()
     }

     /// Set the model
-    pub fn with_model(&mut self, model: Box<dyn Model + Sync>) -> &Self {
+    pub fn with_model(&mut self, model: Box<dyn Model>) -> &Self {
         self.model = model;
         self
     }

     /// Get the model
     #[allow(clippy::borrowed_box)]
-    pub fn get_model(&self) -> &Box<dyn Model + Sync> {
+    pub fn get_model(&self) -> &Box<dyn Model> {
         &self.model
     }
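
The payoff of the whole change: with `Send + Sync` required everywhere, a fully assembled `Tokenizer` is itself `Send + Sync`, so it can be shared across threads behind an `Arc` for parallel encoding. A minimal sketch with a hypothetical whitespace model, mirroring the `new`/field shapes above in simplified form:

use std::sync::Arc;
use std::thread;

trait Model: Send + Sync {
    fn tokenize(&self, text: &str) -> Vec<String>;
}

struct WhitespaceModel;
impl Model for WhitespaceModel {
    fn tokenize(&self, text: &str) -> Vec<String> {
        text.split_whitespace().map(str::to_string).collect()
    }
}

// Simplified stand-in for the struct above: with the supertrait bounds in
// place, `Tokenizer` is automatically `Send + Sync` even though it holds
// plain boxed trait objects.
struct Tokenizer {
    model: Box<dyn Model>,
}

impl Tokenizer {
    fn new(model: Box<dyn Model>) -> Self {
        Tokenizer { model }
    }
    fn encode(&self, text: &str) -> Vec<String> {
        self.model.tokenize(text)
    }
}

fn main() {
    let tokenizer = Arc::new(Tokenizer::new(Box::new(WhitespaceModel)));
    // Each thread gets its own Arc handle; no locking is needed because
    // encoding only takes `&self`.
    let handles: Vec<_> = ["hello world", "send and sync"]
        .into_iter()
        .map(|text| {
            let tok = Arc::clone(&tokenizer);
            thread::spawn(move || tok.encode(text))
        })
        .collect();
    for h in handles {
        println!("{:?}", h.join().unwrap());
    }
}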