From 9428b9a21b6f97e5de225732dc22f7e96ab5c2f6 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Sat, 4 Jan 2020 23:33:50 -0500
Subject: [PATCH] Documentation updates

---
 README.md             | 39 ++++++++++++++++-----------------------
 tokenizers/Cargo.toml | 10 ++++++++++
 tokenizers/src/lib.rs | 24 ++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index f9dfafb3..754fba7a 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,24 @@
 # Tokenizers
 
-Provides an implementation of today's most used tokenizers with a focus on performances
-and versatility. The goal is to make it as easy as possible to construct a Tokenizer, learn a
-vocabulary, and then process some text either in real time or in advance.
+Provides an implementation of today's most used tokenizers, with a focus on performance and
+versatility.
 
 ## What is a Tokenizer
 
-A Tokenizer works as a pipeline taking some raw text as input, going through multiple steps to
-finally output a list of `Token`s. The various steps of the pipeline are:
-- Some optional `Normalizer`s. An example would be a Unicode normalization step. They take
-some raw text as input, and also output raw text `String`.
-- An optional `PreTokenizer` which should take some raw text and take care of spliting
-as relevant, and pre-processing tokens if needed. Takes a raw text `String` as input, and
-outputs a `Vec`.
-- A `Model` to do the actual tokenization. An example of `Model` would be `BPE`. Takes
-a `Vec` as input, and gives a `Vec`.
-- Some optional `PostProcessor`s. These are in charge of post processing the list of `Token`s
-in any relevant way. This includes truncating, adding some padding, ...
+A Tokenizer works as a pipeline, taking some raw text as input and finally outputting an
+`Encoding`.
+The various steps of the pipeline are:
 
-## Try the shell
+1. The `Normalizer` is in charge of normalizing the text. Common examples of normalization are
+   the Unicode normalization standards, such as `NFD` or `NFKC`.
+2. The `PreTokenizer` is in charge of splitting the text into relevant pieces. The most common
+   way of splitting text is simply on whitespace, in order to work with words.
+3. The `Model` is in charge of doing the actual tokenization. An example of `Model` would be
+   `BPE` or `WordPiece`.
+4. The `PostProcessor` is in charge of post-processing the `Encoding`, to add anything relevant
+   that a language model would need, like special tokens.
 
-You can try a simple ByteLevel BPE Tokenizer by using the following command. This expects
-`vocab.json` and `merges.txt` files, trained with ByteLevel BPE.
+## Bindings
 
-```bash
-cd tokenizers
-wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-cargo run --release shell --vocab gpt2-vocab.json --merges gpt2-merges.txt
-```
+We provide bindings to the following languages (more to come!):
+ - [Python](https://github.com/huggingface/tokenizers/tree/master/bindings/python)
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 8002758f..e224c435 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -3,6 +3,16 @@ authors = ["Anthony MOI "]
 edition = "2018"
 name = "tokenizers-lib"
 version = "0.0.11"
+homepage = "https://github.com/huggingface/tokenizers"
+repository = "https://github.com/huggingface/tokenizers"
+documentation = "https://docs.rs/tokenizers/"
+license = "Apache-2.0"
+keywords = ["text", "tokenizer", "tokenization", "NLP", "huggingface", "BPE", "WordPiece"]
+readme = "README.md"
+description = """
+Provides an implementation of today's most used tokenizers,
+with a focus on performance and versatility.
+"""
 
 [lib]
 name = "tokenizers"
diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs
index 5af84a15..5cd5d7d9 100644
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -1,4 +1,28 @@
 #![warn(clippy::all)]
+#![doc(html_favicon_url = "https://huggingface.co/favicon.ico")]
+#![doc(html_logo_url = "https://huggingface.co/landing/assets/huggingface_logo.svg")]
+
+//!
+//! # Tokenizers
+//!
+//! Provides an implementation of today's most used tokenizers, with a focus on performance and
+//! versatility.
+//!
+//! ## What is a Tokenizer
+//!
+//! A Tokenizer works as a pipeline, taking some raw text as input and finally outputting an
+//! `Encoding`.
+//! The various steps of the pipeline are:
+//!
+//! 1. The `Normalizer` is in charge of normalizing the text. Common examples of normalization are
+//!    the Unicode normalization standards, such as `NFD` or `NFKC`.
+//! 2. The `PreTokenizer` is in charge of splitting the text into relevant pieces. The most common
+//!    way of splitting text is simply on whitespace, in order to work with words.
+//! 3. The `Model` is in charge of doing the actual tokenization. An example of `Model` would be
+//!    `BPE` or `WordPiece`.
+//! 4. The `PostProcessor` is in charge of post-processing the `Encoding`, to add anything relevant
+//!    that a language model would need, like special tokens.
+//!
 
 #[macro_use]
 extern crate lazy_static;
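
For readers skimming this patch, here is a minimal sketch of how the four-step pipeline described in the new README and crate docs might be wired together in Rust. It is illustrative only: the module paths and method names used here (`BPE::from_file`, `Tokenizer::new`, `with_normalizer`, `with_pre_tokenizer`, `encode`) are assumptions drawn from later versions of the crate and may not match the API at version 0.0.11.

```rust
// Illustrative sketch of the Normalizer -> PreTokenizer -> Model -> PostProcessor
// pipeline. NOTE: the names below are assumptions; check docs.rs/tokenizers for
// the exact API of the version you are using.
use tokenizers::models::bpe::BPE;
use tokenizers::normalizers::unicode::NFKC;
use tokenizers::pre_tokenizers::whitespace::Whitespace;
use tokenizers::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    // 3. The Model does the actual tokenization, here BPE built from a trained
    //    vocab.json / merges.txt pair.
    let bpe = BPE::from_file("vocab.json", "merges.txt").build()?;

    let mut tokenizer = Tokenizer::new(bpe);
    // 1. The Normalizer applies a Unicode normalization standard (NFKC here).
    tokenizer.with_normalizer(NFKC::default());
    // 2. The PreTokenizer splits the raw text, most commonly on whitespace.
    tokenizer.with_pre_tokenizer(Whitespace::default());
    // 4. A PostProcessor (e.g. one adding model-specific special tokens) would be
    //    attached here in the same way.

    // The pipeline finally outputs an Encoding (tokens, ids, offsets, ...).
    let encoding = tokenizer.encode("Hello, how are you?", false)?;
    println!("{:?}", encoding.get_tokens());
    Ok(())
}
```

The numbered comments mirror the four pipeline steps listed in the README; swapping `BPE` for `WordPiece`, or `NFKC` for `NFD`, only changes which component is plugged into each slot.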