Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00.
@@ -2,14 +2,10 @@ use std::collections::HashMap;
 use std::hash::Hash;
 use std::sync::Mutex;
 
-///
-/// # Cache
-///
 /// Provides a simple multithread cache that will try to retrieve values
 /// but won't block if someone else is already using it.
 /// The goal is clearly not the accuracy of the content, both get and set
 /// are not guaranteed to actually get or set.
-///
 #[derive(Default)]
 pub struct Cache<K, V>
 where
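
Aside: the behaviour those doc lines describe (both get and set may silently do nothing under contention) maps naturally onto `Mutex::try_lock`. A minimal sketch, not the crate's actual implementation; the type name `TryCache` is invented here:

use std::collections::HashMap;
use std::hash::Hash;
use std::sync::Mutex;

// Wrap the map in a Mutex and use `try_lock` so callers never wait.
// A miss (or a lost write) under contention is acceptable by design.
struct TryCache<K, V> {
    map: Mutex<HashMap<K, V>>,
}

impl<K: Eq + Hash, V: Clone> TryCache<K, V> {
    fn get(&self, key: &K) -> Option<V> {
        // If someone else holds the lock, give up instead of blocking.
        self.map.try_lock().ok().and_then(|m| m.get(key).cloned())
    }

    fn set(&self, key: K, value: V) {
        // Same policy for writes: the insert is silently dropped on contention.
        if let Ok(mut m) = self.map.try_lock() {
            m.insert(key, value);
        }
    }
}

Trading cache accuracy for the guarantee that the tokenization hot path never blocks is exactly the design choice the doc comment is pointing at.
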
@@ -7,8 +7,7 @@ mod word;
 
 pub type Pair = (u32, u32);
 
-/// ## Error
-/// Errors that can be encountered while using BPE
+/// Errors that can be encountered while using BPE.
 #[derive(Debug)]
 pub enum Error {
     /// An error encountered while reading files mainly.
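
For context, an error enum like this one typically wraps the underlying `std::io::Error` (raised while reading vocab/merges files) and implements `Display` and `Error` so it can flow through the crate-wide `Result`. A hedged sketch; the variant name `Io` is assumed, not taken from this diff:

use std::fmt;

#[derive(Debug)]
enum BpeError {
    // Hypothetical variant wrapping the io::Error raised while reading files.
    Io(std::io::Error),
}

impl fmt::Display for BpeError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            BpeError::Io(e) => write!(f, "IO error: {}", e),
        }
    }
}

impl std::error::Error for BpeError {}

impl From<std::io::Error> for BpeError {
    fn from(e: std::io::Error) -> Self {
        BpeError::Io(e)
    }
}
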
@@ -1,13 +1,13 @@
 use crate::tokenizer::NormalizedString;
 
-/// The various possible padding directions
+/// The various possible padding directions.
 #[derive(Debug, Clone)]
 pub enum PaddingDirection {
     Left,
     Right,
 }
 
-/// The Encoding struct represents the output of the Tokenizer
+/// Represents the output of a `Tokenizer`.
 #[derive(Default, PartialEq, Debug, Clone)]
 pub struct Encoding {
     normalized: NormalizedString,
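
What `Left` vs `Right` means in practice: pad ids are inserted before or after the sequence until it reaches the target length. Illustration only; this helper is not part of the diff:

// Hypothetical helper showing the effect of each direction on a sequence of ids.
fn pad_ids(ids: &mut Vec<u32>, target_len: usize, pad_id: u32, direction: &PaddingDirection) {
    while ids.len() < target_len {
        match direction {
            PaddingDirection::Left => ids.insert(0, pad_id), // grow at the front
            PaddingDirection::Right => ids.push(pad_id),     // grow at the back
        }
    }
}
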
@@ -180,7 +180,7 @@ impl Encoding {
     }
 }
 
-/// Prepend the `stride` last elements of the `previous` Vec to the current Vec
+/// Prepend the `stride` last elements of the `previous` `Vec` to the current `Vec`.
 // A new Vec is instantiated though.
 fn prepend_stride<T: Clone>(previous: &[T], current: Vec<T>, stride: usize) -> Vec<T> {
     let prev = previous
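
The doc comment describes the whole algorithm: copy the last `stride` elements of the previous chunk in front of the current one, producing the overlap between windows when an over-long input is split. One way to write a body consistent with the signature shown (the hunk truncates the real one, so this is a sketch, not necessarily the crate's exact code):

fn prepend_stride<T: Clone>(previous: &[T], current: Vec<T>, stride: usize) -> Vec<T> {
    // Take the last `stride` elements (or all of them if there are fewer),
    // clone them, and chain the current elements behind them into a new Vec.
    let keep_from = previous.len().saturating_sub(stride);
    previous[keep_from..]
        .iter()
        .cloned()
        .chain(current.into_iter())
        .collect()
}

// With stride 2: prepend_stride(&[1, 2, 3], vec![4, 5], 2) == vec![2, 3, 4, 5]
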
@@ -1,17 +1,14 @@
-//!
-//! # Tokenizer module
-//!
 //! Represents a tokenization pipeline.
 //!
-//! A Tokenizer is composed of some of the following parts.
-//!   - Normalizer: Takes care of the text normalization (like unicode normalization).
-//!   - PreTokenizer: Takes care of the pre tokenization (ie. How to split tokens and pre-process
+//! A [`Tokenizer`](struct.Tokenizer.html) is composed of some of the following parts.
+//!   - [`Normalizer`](trait.Normalizer.html): Takes care of the text normalization (like unicode normalization).
+//!   - [`PreTokenizer`](trait.PreTokenizer.html): Takes care of the pre tokenization (ie. How to split tokens and pre-process
 //!     them.
-//!   - Model: A model encapsulates the tokenization algorithm. (Like BPE, Word base, character
-//!     based, ...)
-//!   - PostProcessor: Takes care of the processing after tokenization. (Like truncating, padding,
-//!     ...)
-//!
+//!   - [`Model`](trait.Model.html): A model encapsulates the tokenization algorithm (like BPE, Word base, character
+//!     based, ...).
+//!   - [`PostProcessor`](trait.PostProcessor.html): Takes care of the processing after tokenization (like truncating, padding,
+//!     ...).
 
 pub use crate::utils::{
     pad_encodings, truncate_encodings, PaddingParams, PaddingStrategy, TruncationParams,
     TruncationStrategy,
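
Taken together, the four parts run in a fixed order. A self-contained toy, using none of the crate's types, that only mirrors the normalize -> pre-tokenize -> model -> post-process flow the module doc describes:

fn main() {
    let input = "Hello BPE World";

    // 1. Normalizer: text-level cleanup (here: plain lowercasing).
    let normalized = input.to_lowercase();

    // 2. PreTokenizer: split into (substring, byte offsets) pairs.
    let mut words: Vec<(String, (usize, usize))> = Vec::new();
    let mut pos = 0;
    for w in normalized.split_whitespace() {
        let start = pos + normalized[pos..].find(w).unwrap();
        let end = start + w.len();
        words.push((w.to_string(), (start, end)));
        pos = end;
    }

    // 3. Model: map each word to an id (a real Model would run BPE, etc.;
    //    using the word length as the "id" is purely illustrative).
    let ids: Vec<u32> = words.iter().map(|(w, _)| w.len() as u32).collect();

    // 4. PostProcessor: wrap with special tokens (ids 0 and 1 are made up here).
    let mut final_ids = vec![0];
    final_ids.extend(ids);
    final_ids.push(1);

    println!("{:?} -> {:?}", words, final_ids);
}
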
@@ -32,12 +29,12 @@ pub use normalizer::*;
 pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
 pub type Offsets = (usize, usize);
 
-/// A PreTokenizer takes care of pre-tokenizing strings before this goes to the model
+/// Takes care of pre-tokenizing strings before this goes to the model.
 pub trait PreTokenizer {
     fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>>;
 }
 
-/// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram)
+/// Represents a model used during Tokenization (like BPE or Word or Unigram).
 pub trait Model {
     fn tokenize(&self, tokens: Vec<(String, Offsets)>) -> Result<Vec<Token>>;
     fn token_to_id(&self, token: &str) -> Option<u32>;
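
Since the hunk shows `PreTokenizer`'s full signature, here is a minimal conforming implementation: a whitespace splitter that reports byte offsets. A toy for illustration; the crate ships richer pre-tokenizers:

struct WhitespaceSplit;

impl PreTokenizer for WhitespaceSplit {
    fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
        let mut out = Vec::new();
        let mut pos = 0;
        for word in s.split_whitespace() {
            // Locate this word starting from where the previous one ended,
            // so repeated words each get their own offsets.
            let start = pos + s[pos..].find(word).unwrap();
            let end = start + word.len();
            out.push((word.to_string(), (start, end)));
            pos = end;
        }
        Ok(out)
    }
}
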
@@ -45,8 +42,8 @@ pub trait Model {
     fn get_vocab_size(&self) -> usize;
 }
 
-/// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
-/// It adds any special tokens that a language model would require
+/// A `PostProcessor` has the responsibility to post process an encoded output of the `Tokenizer`.
+/// It adds any special tokens that a language model would require.
 pub trait PostProcessor {
     /// Returns the number of tokens that will be added during the processing step
     fn added_tokens(&self, encoding: &Encoding, pair_encoding: &Option<Encoding>) -> Result<usize>;
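
The contract between the two methods matters: `added_tokens` reports how many tokens `process` will add, presumably so truncation can reserve room for them ahead of time. The smallest implementation honouring that contract is a pass-through; a sketch grounded only in the signatures shown above:

struct NoopPostProcessor;

impl PostProcessor for NoopPostProcessor {
    fn added_tokens(&self, _encoding: &Encoding, _pair_encoding: &Option<Encoding>) -> Result<usize> {
        // This processor never adds special tokens, for single or pair inputs.
        Ok(0)
    }

    fn process(&self, encoding: Encoding, _pair_encoding: Option<Encoding>) -> Result<Encoding> {
        // A real implementation would merge the pair and insert special tokens.
        Ok(encoding)
    }
}
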
@@ -54,19 +51,18 @@ pub trait PostProcessor {
     fn process(&self, encoding: Encoding, pair_encoding: Option<Encoding>) -> Result<Encoding>;
 }
 
-/// A Decoder has the responsibility to merge the given Vec<String> in a String
+/// A `Decoder` has the responsibility to merge the given `Vec<String>` in a `String`.
 pub trait Decoder {
     fn decode(&self, tokens: Vec<String>) -> Result<String>;
 }
 
-/// A Trainer has the responsibility to train a Model. We feed it with lines/sentences
-/// and it returns a Model when done.
+/// A `Trainer` has the responsibility to train a model. We feed it with lines/sentences
+/// and it returns a `Model` when done.
 pub trait Trainer: Sync {
     fn train(&self, words: HashMap<String, u32>) -> Result<Box<dyn Model + Sync>>;
     fn process_tokens(&self, words: &mut HashMap<String, u32>, tokens: Vec<String>);
 }
 
 /// A Token
 #[derive(Debug, PartialEq)]
 pub struct Token {
     pub id: u32,
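
`Decoder` is the simplest of these traits; a space-joining implementation against the exact signature above shows the shape. Real decoders also undo model-specific artifacts such as BPE suffixes or byte-level mappings:

struct SpaceJoin;

impl Decoder for SpaceJoin {
    fn decode(&self, tokens: Vec<String>) -> Result<String> {
        // Merge the token strings back into one String, separated by spaces.
        Ok(tokens.join(" "))
    }
}
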
@@ -121,11 +117,7 @@ impl std::cmp::PartialEq for AddedToken {
 }
 impl std::cmp::Eq for AddedToken {}
 
-///
-/// ## Tokenizer
-///
-/// A Tokenizer is capable of encoding/decoding any text
-///
+/// A `Tokenizer` is capable of encoding/decoding any text.
 pub struct Tokenizer {
     // Tokenizer parts
     normalizer: Option<Box<dyn Normalizer + Sync>>,
@@ -2,13 +2,13 @@ use super::Result;
 use std::cmp::Ordering;
 use unicode_normalization_alignments::UnicodeNormalization;
 
-/// A Normalizer takes care of pre-processing strings
+/// Takes care of pre-processing strings.
 pub trait Normalizer {
     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
 }
 
-/// A normalized string takes care of keeping both versions of a String, and
-/// provides necessary alignments to retrieve ranges of both strings
+/// A normalized string takes care of keeping both versions of a `String`, and
+/// provides necessary alignments to retrieve ranges of both strings.
 #[derive(Default, Debug, Clone)]
 pub struct NormalizedString {
     original: String,
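
"Provides necessary alignments" is the key phrase: for every piece of the normalized string, the type can recover the byte range it came from in the original. A self-contained illustration of that bookkeeping for a lowercasing pass; none of these names are the crate's:

struct Aligned {
    original: String,
    normalized: String,
    // alignments[i] = byte range in `original` that produced normalized byte i.
    alignments: Vec<(usize, usize)>,
}

fn lowercase_aligned(original: &str) -> Aligned {
    let mut normalized = String::new();
    let mut alignments = Vec::new();
    for (start, c) in original.char_indices() {
        let end = start + c.len_utf8();
        // One char may lowercase to several chars (e.g. 'İ').
        for lower in c.to_lowercase() {
            let before = normalized.len();
            normalized.push(lower);
            // Every byte this char produced maps back to the same original range.
            for _ in before..normalized.len() {
                alignments.push((start, end));
            }
        }
    }
    Aligned {
        original: original.to_string(),
        normalized,
        alignments,
    }
}

Given a byte range i..j of `normalized`, looking up `alignments[i].0..alignments[j - 1].1` recovers the matching span of `original`, which is how offsets survive normalization.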