Merge pull request #20 from huggingface/docs

Clean up Rust docs
MOI Anthony
2019-12-30 14:17:14 -05:00
committed by GitHub
5 changed files with 23 additions and 36 deletions

View File

@@ -2,14 +2,10 @@ use std::collections::HashMap;
use std::hash::Hash;
use std::sync::Mutex;
-///
-/// # Cache
-///
/// Provides a simple multithread cache that will try to retrieve values
/// but won't block if someone else is already using it.
/// The goal is clearly not the accuracy of the content, both get and set
/// are not guaranteed to actually get or set.
-///
#[derive(Default)]
pub struct Cache<K, V>
where
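
The contract this doc comment describes (lossy get/set, never block) maps directly onto `Mutex::try_lock`. A minimal sketch of that pattern, assuming nothing about the crate's real implementation beyond what the comment states:

use std::collections::HashMap;
use std::hash::Hash;
use std::sync::Mutex;

// Illustrative only: a cache whose get/set give up instead of blocking.
struct TryCache<K, V> {
    map: Mutex<HashMap<K, V>>,
}

impl<K: Hash + Eq, V: Clone> TryCache<K, V> {
    // Returns None both on a genuine miss and when the lock is busy,
    // so callers never wait on contention.
    fn get(&self, key: &K) -> Option<V> {
        self.map.try_lock().ok()?.get(key).cloned()
    }

    // Likewise, set silently drops the write if the lock is held.
    fn set(&self, key: K, value: V) {
        if let Ok(mut map) = self.map.try_lock() {
            map.insert(key, value);
        }
    }
}

Dropping reads and writes under contention is acceptable precisely because the comment disclaims accuracy: the cache only has to be a best-effort memo.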

View File

@@ -7,8 +7,7 @@ mod word;
pub type Pair = (u32, u32);
-/// ## Error
-/// Errors that can be encountered while using BPE
+/// Errors that can be encountered while using BPE.
#[derive(Debug)]
pub enum Error {
/// An error encountered while reading files mainly.

View File

@@ -1,13 +1,13 @@
use crate::tokenizer::NormalizedString;
-/// The various possible padding directions
+/// The various possible padding directions.
#[derive(Debug, Clone)]
pub enum PaddingDirection {
Left,
Right,
}
-/// The Encoding struct represents the output of the Tokenizer
+/// Represents the output of a `Tokenizer`.
#[derive(Default, PartialEq, Debug, Clone)]
pub struct Encoding {
normalized: NormalizedString,
@@ -180,7 +180,7 @@ impl Encoding {
}
}
-/// Prepend the `stride` last elements of the `previous` Vec to the current Vec
+/// Prepend the `stride` last elements of the `previous` `Vec` to the current `Vec`.
// A new Vec is instantiated though.
fn prepend_stride<T: Clone>(previous: &[T], current: Vec<T>, stride: usize) -> Vec<T> {
let prev = previous
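
The hunk cuts the body off, but the documented behaviour is fully pinned down: copy the last `stride` elements of `previous`, order preserved, in front of `current`, allocating a fresh `Vec` as the comment notes. A hypothetical reimplementation of that contract:

fn prepend_stride<T: Clone>(previous: &[T], current: Vec<T>, stride: usize) -> Vec<T> {
    // Last `stride` items of `previous`, in their original order
    // (saturating_sub handles previous.len() < stride)...
    let start = previous.len().saturating_sub(stride);
    let mut out = previous[start..].to_vec();
    // ...followed by all of `current`, in a newly allocated Vec.
    out.extend(current);
    out
}

// e.g. prepend_stride(&[1, 2, 3, 4], vec![5, 6], 2) == vec![3, 4, 5, 6]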

View File

@@ -1,17 +1,14 @@
-//!
-//! # Tokenizer module
-//!
//! Represents a tokenization pipeline.
//!
-//! A Tokenizer is composed of some of the following parts.
-//! - Normalizer: Takes care of the text normalization (like unicode normalization).
-//! - PreTokenizer: Takes care of the pre tokenization (ie. How to split tokens and pre-process
+//! A [`Tokenizer`](struct.Tokenizer.html) is composed of some of the following parts.
+//! - [`Normalizer`](trait.Normalizer.html): Takes care of the text normalization (like unicode normalization).
+//! - [`PreTokenizer`](trait.PreTokenizer.html): Takes care of the pre tokenization (ie. How to split tokens and pre-process
//! them.
-//! - Model: A model encapsulates the tokenization algorithm. (Like BPE, Word base, character
-//! based, ...)
-//! - PostProcessor: Takes care of the processing after tokenization. (Like truncating, padding,
-//! ...)
-//!
+//! - [`Model`](trait.Model.html): A model encapsulates the tokenization algorithm (like BPE, Word base, character
+//! based, ...).
+//! - [`PostProcessor`](trait.PostProcessor.html): Takes care of the processing after tokenization (like truncating, padding,
+//! ...).
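
Read as a pipeline, those four parts run in a fixed order: normalize, pre-tokenize, run the model, post-process. A hypothetical piece of glue code makes the data flow concrete, using only the trait signatures that appear further down in this diff; `NormalizedString::from`, `.get()` and `Encoding::from_tokens` are made-up helpers standing in for whatever the real `Tokenizer::encode` uses:

// Hypothetical wiring; only the trait method calls mirror real signatures.
fn encode_sketch(
    normalizer: &dyn Normalizer,
    pre_tokenizer: &dyn PreTokenizer,
    model: &dyn Model,
    post_processor: &dyn PostProcessor,
    input: &str,
) -> Result<Encoding> {
    let mut normalized = NormalizedString::from(input);         // hypothetical constructor
    normalizer.normalize(&mut normalized)?;                     // 1. normalize
    let words = pre_tokenizer.pre_tokenize(normalized.get())?;  // 2. pre-tokenize (`get` assumed)
    let tokens = model.tokenize(words)?;                        // 3. model
    let encoding = Encoding::from_tokens(tokens);               // hypothetical constructor
    post_processor.process(encoding, None)                      // 4. post-process
}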
pub use crate::utils::{
pad_encodings, truncate_encodings, PaddingParams, PaddingStrategy, TruncationParams,
TruncationStrategy,
@@ -32,12 +29,12 @@ pub use normalizer::*;
pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
pub type Offsets = (usize, usize);
-/// A PreTokenizer takes care of pre-tokenizing strings before this goes to the model
+/// Takes care of pre-tokenizing strings before this goes to the model.
pub trait PreTokenizer {
fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>>;
}
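
A whitespace splitter is about the smallest useful implementation of this trait: each chunk comes back with its byte offsets into the input. Illustrative only, not a pre-tokenizer shipped by the crate:

struct WhitespacePreTokenizer;

impl PreTokenizer for WhitespacePreTokenizer {
    fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
        Ok(s.split_whitespace()
            .map(|w| {
                // Each chunk is a subslice of `s`, so pointer arithmetic
                // recovers its byte offset within the original string.
                let start = w.as_ptr() as usize - s.as_ptr() as usize;
                (w.to_owned(), (start, start + w.len()))
            })
            .collect())
    }
}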
-/// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram)
+/// Represents a model used during Tokenization (like BPE or Word or Unigram).
pub trait Model {
fn tokenize(&self, tokens: Vec<(String, Offsets)>) -> Result<Vec<Token>>;
fn token_to_id(&self, token: &str) -> Option<u32>;
@@ -45,8 +42,8 @@ pub trait Model {
fn get_vocab_size(&self) -> usize;
}
-/// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
-/// It adds any special tokens that a language model would require
+/// A `PostProcessor` has the responsibility to post process an encoded output of the `Tokenizer`.
+/// It adds any special tokens that a language model would require.
pub trait PostProcessor {
/// Returns the number of tokens that will be added during the processing step
fn added_tokens(&self, encoding: &Encoding, pair_encoding: &Option<Encoding>) -> Result<usize>;
@@ -54,19 +51,18 @@ pub trait PostProcessor {
fn process(&self, encoding: Encoding, pair_encoding: Option<Encoding>) -> Result<Encoding>;
}
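
The smallest conforming implementation is a pass-through, which also shows how the two methods relate: `added_tokens` must predict what `process` will add (here, nothing). Illustrative, not from the crate:

struct NoopPostProcessor;

impl PostProcessor for NoopPostProcessor {
    // No special tokens will be added...
    fn added_tokens(&self, _: &Encoding, _: &Option<Encoding>) -> Result<usize> {
        Ok(0)
    }
    // ...so the encoding passes through untouched (a pair is simply ignored).
    fn process(&self, encoding: Encoding, _: Option<Encoding>) -> Result<Encoding> {
        Ok(encoding)
    }
}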
-/// A Decoder has the responsibility to merge the given Vec<String> in a String
+/// A `Decoder` has the responsibility to merge the given `Vec<String>` in a `String`.
pub trait Decoder {
fn decode(&self, tokens: Vec<String>) -> Result<String>;
}
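
For example, a decoder that merges by joining on spaces; real decoders also undo model-specific artifacts (BPE continuation markers, byte-level escapes), but the shape is the same. Illustrative:

struct SpaceJoinDecoder;

impl Decoder for SpaceJoinDecoder {
    fn decode(&self, tokens: Vec<String>) -> Result<String> {
        // Merge the token strings back into a single String.
        Ok(tokens.join(" "))
    }
}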
-/// A Trainer has the responsibility to train a Model. We feed it with lines/sentences
-/// and it returns a Model when done.
+/// A `Trainer` has the responsibility to train a model. We feed it with lines/sentences
+/// and it returns a `Model` when done.
pub trait Trainer: Sync {
fn train(&self, words: HashMap<String, u32>) -> Result<Box<dyn Model + Sync>>;
fn process_tokens(&self, words: &mut HashMap<String, u32>, tokens: Vec<String>);
}
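
`process_tokens` is the feeding half of that contract: it typically folds a batch of tokens into the running word counts that `train` later consumes. A sketch of that accumulation, assuming plain frequency counting:

use std::collections::HashMap;

fn process_tokens_sketch(words: &mut HashMap<String, u32>, tokens: Vec<String>) {
    for token in tokens {
        // Bump the running frequency of each token seen so far.
        *words.entry(token).or_insert(0) += 1;
    }
}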
-/// A Token
#[derive(Debug, PartialEq)]
pub struct Token {
pub id: u32,
@@ -121,11 +117,7 @@ impl std::cmp::PartialEq for AddedToken {
}
impl std::cmp::Eq for AddedToken {}
-///
-/// ## Tokenizer
-///
-/// A Tokenizer is capable of encoding/decoding any text
-///
+/// A `Tokenizer` is capable of encoding/decoding any text.
pub struct Tokenizer {
// Tokenizer parts
normalizer: Option<Box<dyn Normalizer + Sync>>,

View File

@@ -2,13 +2,13 @@ use super::Result;
use std::cmp::Ordering;
use unicode_normalization_alignments::UnicodeNormalization;
-/// A Normalizer takes care of pre-processing strings
+/// Takes care of pre-processing strings.
pub trait Normalizer {
fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
}
-/// A normalized string takes care of keeping both versions of a String, and
-/// provides necessary alignments to retrieve ranges of both strings
+/// A normalized string takes care of keeping both versions of a `String`, and
+/// provides necessary alignments to retrieve ranges of both strings.
#[derive(Default, Debug, Clone)]
pub struct NormalizedString {
original: String,
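
The "both versions plus alignments" idea fits in a few lines: alongside the transformed text, keep one byte range into the original per normalized character, so ranges can be translated in either direction. A toy model of the concept, independent of the crate's actual structure beyond the `original` field shown above:

// Toy illustration: lowercase a string while recording, for every char
// of the normalized output, its byte range in the original input.
fn lowercase_with_alignments(original: &str) -> (String, Vec<(usize, usize)>) {
    let mut normalized = String::new();
    let mut alignments = Vec::new();
    for (start, c) in original.char_indices() {
        for lower in c.to_lowercase() {
            normalized.push(lower);
            alignments.push((start, start + c.len_utf8()));
        }
    }
    (normalized, alignments)
}

// "Açaí" -> ("açaí", [(0, 1), (1, 3), (3, 4), (4, 6)])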