Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00.
@@ -2,14 +2,10 @@ use std::collections::HashMap;
 use std::hash::Hash;
 use std::sync::Mutex;
 
-///
-/// # Cache
-///
 /// Provides a simple multithread cache that will try to retrieve values
 /// but won't block if someone else is already using it.
 /// The goal is clearly not the accuracy of the content, both get and set
 /// are not guaranteed to actually get or set.
-///
 #[derive(Default)]
 pub struct Cache<K, V>
 where
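
Aside: the behaviour those doc lines describe (both get and set may silently do nothing under contention) maps naturally onto `Mutex::try_lock`. A minimal sketch, not the crate's actual implementation; the type name `TryCache` is invented here:

use std::collections::HashMap;
use std::hash::Hash;
use std::sync::Mutex;

// Wrap the map in a Mutex and use `try_lock` so callers never wait.
// A miss (or a lost write) under contention is acceptable by design.
struct TryCache<K, V> {
    map: Mutex<HashMap<K, V>>,
}

impl<K: Eq + Hash, V: Clone> TryCache<K, V> {
    fn get(&self, key: &K) -> Option<V> {
        // If someone else holds the lock, give up instead of blocking.
        self.map.try_lock().ok().and_then(|m| m.get(key).cloned())
    }

    fn set(&self, key: K, value: V) {
        // Same policy for writes: the insert is silently dropped on contention.
        if let Ok(mut m) = self.map.try_lock() {
            m.insert(key, value);
        }
    }
}

Trading cache accuracy for the guarantee that the tokenization hot path never blocks is exactly the design choice the doc comment is pointing at.
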
@@ -7,8 +7,7 @@ mod word;
 
 pub type Pair = (u32, u32);
 
-/// ## Error
-/// Errors that can be encountered while using BPE
+/// Errors that can be encountered while using BPE.
 #[derive(Debug)]
 pub enum Error {
     /// An error encountered while reading files mainly.
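
For context, an error enum like this one typically wraps the underlying `std::io::Error` (raised while reading vocab/merges files) and implements `Display` and `Error` so it can flow through the crate-wide `Result`. A hedged sketch; the variant name `Io` is assumed, not taken from this diff:

use std::fmt;

#[derive(Debug)]
enum BpeError {
    // Hypothetical variant wrapping the io::Error raised while reading files.
    Io(std::io::Error),
}

impl fmt::Display for BpeError {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        match self {
            BpeError::Io(e) => write!(f, "IO error: {}", e),
        }
    }
}

impl std::error::Error for BpeError {}

impl From<std::io::Error> for BpeError {
    fn from(e: std::io::Error) -> Self {
        BpeError::Io(e)
    }
}
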
@@ -1,13 +1,13 @@
 use crate::tokenizer::NormalizedString;
 
-/// The various possible padding directions
+/// The various possible padding directions.
 #[derive(Debug, Clone)]
 pub enum PaddingDirection {
     Left,
     Right,
 }
 
-/// The Encoding struct represents the output of the Tokenizer
+/// Represents the output of a `Tokenizer`.
 #[derive(Default, PartialEq, Debug, Clone)]
 pub struct Encoding {
     normalized: NormalizedString,
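
What `Left` vs `Right` means in practice: pad ids are inserted before or after the sequence until it reaches the target length. Illustration only; this helper is not part of the diff:

// Hypothetical helper showing the effect of each direction on a sequence of ids.
fn pad_ids(ids: &mut Vec<u32>, target_len: usize, pad_id: u32, direction: &PaddingDirection) {
    while ids.len() < target_len {
        match direction {
            PaddingDirection::Left => ids.insert(0, pad_id), // grow at the front
            PaddingDirection::Right => ids.push(pad_id),     // grow at the back
        }
    }
}
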
@@ -180,7 +180,7 @@ impl Encoding {
     }
 }
 
-/// Prepend the `stride` last elements of the `previous` Vec to the current Vec
+/// Prepend the `stride` last elements of the `previous` `Vec` to the current `Vec`.
 // A new Vec is instantiated though.
 fn prepend_stride<T: Clone>(previous: &[T], current: Vec<T>, stride: usize) -> Vec<T> {
     let prev = previous
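
The doc comment describes the whole algorithm: copy the last `stride` elements of the previous chunk in front of the current one, producing the overlap between windows when an over-long input is split. One way to write a body consistent with the signature shown (the hunk truncates the real one, so this is a sketch, not necessarily the crate's exact code):

fn prepend_stride<T: Clone>(previous: &[T], current: Vec<T>, stride: usize) -> Vec<T> {
    // Take the last `stride` elements (or all of them if there are fewer),
    // clone them, and chain the current elements behind them into a new Vec.
    let keep_from = previous.len().saturating_sub(stride);
    previous[keep_from..]
        .iter()
        .cloned()
        .chain(current.into_iter())
        .collect()
}

// With stride 2: prepend_stride(&[1, 2, 3], vec![4, 5], 2) == vec![2, 3, 4, 5]
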
@@ -1,17 +1,14 @@
-//!
-//! # Tokenizer module
-//!
 //! Represents a tokenization pipeline.
 //!
-//! A Tokenizer is composed of some of the following parts.
-//!   - Normalizer: Takes care of the text normalization (like unicode normalization).
-//!   - PreTokenizer: Takes care of the pre tokenization (ie. How to split tokens and pre-process
+//! A [`Tokenizer`](struct.Tokenizer.html) is composed of some of the following parts.
+//!   - [`Normalizer`](trait.Normalizer.html): Takes care of the text normalization (like unicode normalization).
+//!   - [`PreTokenizer`](trait.PreTokenizer.html): Takes care of the pre tokenization (ie. How to split tokens and pre-process
 //!     them.
-//!   - Model: A model encapsulates the tokenization algorithm. (Like BPE, Word base, character
-//!     based, ...)
-//!   - PostProcessor: Takes care of the processing after tokenization. (Like truncating, padding,
-//!     ...)
-//!
+//!   - [`Model`](trait.Model.html): A model encapsulates the tokenization algorithm (like BPE, Word base, character
+//!     based, ...).
+//!   - [`PostProcessor`](trait.PostProcessor.html): Takes care of the processing after tokenization (like truncating, padding,
+//!     ...).
 
 pub use crate::utils::{
     pad_encodings, truncate_encodings, PaddingParams, PaddingStrategy, TruncationParams,
     TruncationStrategy,
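
Taken together, the four parts run in a fixed order. A self-contained toy, using none of the crate's types, that only mirrors the normalize -> pre-tokenize -> model -> post-process flow the module doc describes:

fn main() {
    let input = "Hello BPE World";

    // 1. Normalizer: text-level cleanup (here: plain lowercasing).
    let normalized = input.to_lowercase();

    // 2. PreTokenizer: split into (substring, byte offsets) pairs.
    let mut words: Vec<(String, (usize, usize))> = Vec::new();
    let mut pos = 0;
    for w in normalized.split_whitespace() {
        let start = pos + normalized[pos..].find(w).unwrap();
        let end = start + w.len();
        words.push((w.to_string(), (start, end)));
        pos = end;
    }

    // 3. Model: map each word to an id (a real Model would run BPE, etc.;
    //    using the word length as the "id" is purely illustrative).
    let ids: Vec<u32> = words.iter().map(|(w, _)| w.len() as u32).collect();

    // 4. PostProcessor: wrap with special tokens (ids 0 and 1 are made up here).
    let mut final_ids = vec![0];
    final_ids.extend(ids);
    final_ids.push(1);

    println!("{:?} -> {:?}", words, final_ids);
}
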
@@ -32,12 +29,12 @@ pub use normalizer::*;
 pub type Result<T> = std::result::Result<T, Box<dyn std::error::Error + Send + Sync>>;
 pub type Offsets = (usize, usize);
 
-/// A PreTokenizer takes care of pre-tokenizing strings before this goes to the model
+/// Takes care of pre-tokenizing strings before this goes to the model.
 pub trait PreTokenizer {
     fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>>;
 }
 
-/// Represents a `Model` used during Tokenization (Like BPE or Word or Unigram)
+/// Represents a model used during Tokenization (like BPE or Word or Unigram).
 pub trait Model {
     fn tokenize(&self, tokens: Vec<(String, Offsets)>) -> Result<Vec<Token>>;
     fn token_to_id(&self, token: &str) -> Option<u32>;
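
Since the hunk shows `PreTokenizer`'s full signature, here is a minimal conforming implementation: a whitespace splitter that reports byte offsets. A toy for illustration; the crate ships richer pre-tokenizers:

struct WhitespaceSplit;

impl PreTokenizer for WhitespaceSplit {
    fn pre_tokenize(&self, s: &str) -> Result<Vec<(String, Offsets)>> {
        let mut out = Vec::new();
        let mut pos = 0;
        for word in s.split_whitespace() {
            // Locate this word starting from where the previous one ended,
            // so repeated words each get their own offsets.
            let start = pos + s[pos..].find(word).unwrap();
            let end = start + word.len();
            out.push((word.to_string(), (start, end)));
            pos = end;
        }
        Ok(out)
    }
}
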
@@ -45,8 +42,8 @@ pub trait Model {
     fn get_vocab_size(&self) -> usize;
 }
 
-/// A PostProcessor has the responsibility to post process an encoded output of the Tokenizer.
-/// It adds any special tokens that a language model would require
+/// A `PostProcessor` has the responsibility to post process an encoded output of the `Tokenizer`.
+/// It adds any special tokens that a language model would require.
 pub trait PostProcessor {
     /// Returns the number of tokens that will be added during the processing step
     fn added_tokens(&self, encoding: &Encoding, pair_encoding: &Option<Encoding>) -> Result<usize>;
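
The contract between the two methods matters: `added_tokens` reports how many tokens `process` will add, presumably so truncation can reserve room for them ahead of time. The smallest implementation honouring that contract is a pass-through; a sketch grounded only in the signatures shown above:

struct NoopPostProcessor;

impl PostProcessor for NoopPostProcessor {
    fn added_tokens(&self, _encoding: &Encoding, _pair_encoding: &Option<Encoding>) -> Result<usize> {
        // This processor never adds special tokens, for single or pair inputs.
        Ok(0)
    }

    fn process(&self, encoding: Encoding, _pair_encoding: Option<Encoding>) -> Result<Encoding> {
        // A real implementation would merge the pair and insert special tokens.
        Ok(encoding)
    }
}
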
@@ -54,19 +51,18 @@ pub trait PostProcessor {
     fn process(&self, encoding: Encoding, pair_encoding: Option<Encoding>) -> Result<Encoding>;
 }
 
-/// A Decoder has the responsibility to merge the given Vec<String> in a String
+/// A `Decoder` has the responsibility to merge the given `Vec<String>` in a `String`.
 pub trait Decoder {
     fn decode(&self, tokens: Vec<String>) -> Result<String>;
 }
 
-/// A Trainer has the responsibility to train a Model. We feed it with lines/sentences
-/// and it returns a Model when done.
+/// A `Trainer` has the responsibility to train a model. We feed it with lines/sentences
+/// and it returns a `Model` when done.
 pub trait Trainer: Sync {
     fn train(&self, words: HashMap<String, u32>) -> Result<Box<dyn Model + Sync>>;
     fn process_tokens(&self, words: &mut HashMap<String, u32>, tokens: Vec<String>);
 }
 
 /// A Token
 #[derive(Debug, PartialEq)]
 pub struct Token {
     pub id: u32,
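
`Decoder` is the simplest of these traits; a space-joining implementation against the exact signature above shows the shape. Real decoders also undo model-specific artifacts such as BPE suffixes or byte-level mappings:

struct SpaceJoin;

impl Decoder for SpaceJoin {
    fn decode(&self, tokens: Vec<String>) -> Result<String> {
        // Merge the token strings back into one String, separated by spaces.
        Ok(tokens.join(" "))
    }
}
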
@@ -121,11 +117,7 @@ impl std::cmp::PartialEq for AddedToken {
 }
 impl std::cmp::Eq for AddedToken {}
 
-///
-/// ## Tokenizer
-///
-/// A Tokenizer is capable of encoding/decoding any text
-///
+/// A `Tokenizer` is capable of encoding/decoding any text.
 pub struct Tokenizer {
     // Tokenizer parts
     normalizer: Option<Box<dyn Normalizer + Sync>>,
@@ -2,13 +2,13 @@ use super::Result;
 use std::cmp::Ordering;
 use unicode_normalization_alignments::UnicodeNormalization;
 
-/// A Normalizer takes care of pre-processing strings
+/// Takes care of pre-processing strings.
 pub trait Normalizer {
     fn normalize(&self, normalized: &mut NormalizedString) -> Result<()>;
 }
 
-/// A normalized string takes care of keeping both versions of a String, and
-/// provides necessary alignments to retrieve ranges of both strings
+/// A normalized string takes care of keeping both versions of a `String`, and
+/// provides necessary alignments to retrieve ranges of both strings.
 #[derive(Default, Debug, Clone)]
 pub struct NormalizedString {
     original: String,
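
"Provides necessary alignments" is the key phrase: for every piece of the normalized string, the type can recover the byte range it came from in the original. A self-contained illustration of that bookkeeping for a lowercasing pass; none of these names are the crate's:

struct Aligned {
    original: String,
    normalized: String,
    // alignments[i] = byte range in `original` that produced normalized byte i.
    alignments: Vec<(usize, usize)>,
}

fn lowercase_aligned(original: &str) -> Aligned {
    let mut normalized = String::new();
    let mut alignments = Vec::new();
    for (start, c) in original.char_indices() {
        let end = start + c.len_utf8();
        // One char may lowercase to several chars (e.g. 'İ').
        for lower in c.to_lowercase() {
            let before = normalized.len();
            normalized.push(lower);
            // Every byte this char produced maps back to the same original range.
            for _ in before..normalized.len() {
                alignments.push((start, end));
            }
        }
    }
    Aligned {
        original: original.to_string(),
        normalized,
        alignments,
    }
}

Given a byte range i..j of `normalized`, looking up `alignments[i].0..alignments[j - 1].1` recovers the matching span of `original`, which is how offsets survive normalization.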