From 9428b9a21b6f97e5de225732dc22f7e96ab5c2f6 Mon Sep 17 00:00:00 2001
From: Anthony MOI
Date: Sat, 4 Jan 2020 23:33:50 -0500
Subject: [PATCH] Documentation updates

---
 README.md             | 39 ++++++++++++++++-----------------------
 tokenizers/Cargo.toml | 10 ++++++++++
 tokenizers/src/lib.rs | 24 ++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 23 deletions(-)

diff --git a/README.md b/README.md
index f9dfafb3..754fba7a 100644
--- a/README.md
+++ b/README.md
@@ -1,31 +1,24 @@
 # Tokenizers
 
-Provides an implementation of today's most used tokenizers with a focus on performances
-and versatility. The goal is to make it as easy as possible to construct a Tokenizer, learn a
-vocabulary, and then process some text either in real time or in advance.
+Provides an implementation of today's most used tokenizers, with a focus on performance and
+versatility.
 
 ## What is a Tokenizer
 
-A Tokenizer works as a pipeline taking some raw text as input, going through multiple steps to
-finally output a list of `Token`s. The various steps of the pipeline are:
-- Some optional `Normalizer`s. An example would be a Unicode normalization step. They take
-some raw text as input, and also output raw text `String`.
-- An optional `PreTokenizer` which should take some raw text and take care of spliting
-as relevant, and pre-processing tokens if needed. Takes a raw text `String` as input, and
-outputs a `Vec`.
-- A `Model` to do the actual tokenization. An example of `Model` would be `BPE`. Takes
-a `Vec` as input, and gives a `Vec`.
-- Some optional `PostProcessor`s. These are in charge of post processing the list of `Token`s
-in any relevant way. This includes truncating, adding some padding, ...
+A Tokenizer works as a pipeline, taking some raw text as input and finally outputting an
+`Encoding`.
+The various steps of the pipeline are:
 
-## Try the shell
+1. The `Normalizer` is in charge of normalizing the text. Common examples of normalization are
+   the Unicode normalization standards, such as `NFD` or `NFKC`.
+2. The `PreTokenizer` is in charge of splitting the text into relevant pieces. The most common
+   way of splitting text is simply on whitespace, in order to work with words.
+3. The `Model` is in charge of doing the actual tokenization. An example of `Model` would be
+   `BPE` or `WordPiece`.
+4. The `PostProcessor` is in charge of post-processing the `Encoding`, to add anything relevant
+   that a language model would need, like special tokens.
 
-You can try a simple ByteLevel BPE Tokenizer by using the following command. This expects
-`vocab.json` and `merges.txt` files, trained with ByteLevel BPE.
+## Bindings
 
-```bash
-cd tokenizers
-wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
-wget https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
-cargo run --release shell --vocab gpt2-vocab.json --merges gpt2-merges.txt
-```
+We provide bindings to the following languages (more to come!):
+ - [Python](https://github.com/huggingface/tokenizers/tree/master/bindings/python)
diff --git a/tokenizers/Cargo.toml b/tokenizers/Cargo.toml
index 8002758f..e224c435 100644
--- a/tokenizers/Cargo.toml
+++ b/tokenizers/Cargo.toml
@@ -3,6 +3,16 @@ authors = ["Anthony MOI "]
 edition = "2018"
 name = "tokenizers-lib"
 version = "0.0.11"
+homepage = "https://github.com/huggingface/tokenizers"
+repository = "https://github.com/huggingface/tokenizers"
+documentation = "https://docs.rs/tokenizers/"
+license = "Apache-2.0"
+keywords = ["text", "tokenizer", "tokenization", "NLP", "huggingface", "BPE", "WordPiece"]
+readme = "README.md"
+description = """
+Provides an implementation of today's most used tokenizers,
+with a focus on performance and versatility.
+"""
 
 [lib]
 name = "tokenizers"
diff --git a/tokenizers/src/lib.rs b/tokenizers/src/lib.rs
index 5af84a15..5cd5d7d9 100644
--- a/tokenizers/src/lib.rs
+++ b/tokenizers/src/lib.rs
@@ -1,4 +1,28 @@
 #![warn(clippy::all)]
+#![doc(html_favicon_url = "https://huggingface.co/favicon.ico")]
+#![doc(html_logo_url = "https://huggingface.co/landing/assets/huggingface_logo.svg")]
+
+//!
+//! # Tokenizers
+//!
+//! Provides an implementation of today's most used tokenizers, with a focus on performance and
+//! versatility.
+//!
+//! ## What is a Tokenizer
+//!
+//! A Tokenizer works as a pipeline, taking some raw text as input and finally outputting an
+//! `Encoding`.
+//! The various steps of the pipeline are:
+//!
+//! 1. The `Normalizer` is in charge of normalizing the text. Common examples of normalization are
+//!    the Unicode normalization standards, such as `NFD` or `NFKC`.
+//! 2. The `PreTokenizer` is in charge of splitting the text into relevant pieces. The most common
+//!    way of splitting text is simply on whitespace, in order to work with words.
+//! 3. The `Model` is in charge of doing the actual tokenization. An example of `Model` would be
+//!    `BPE` or `WordPiece`.
+//! 4. The `PostProcessor` is in charge of post-processing the `Encoding`, to add anything relevant
+//!    that a language model would need, like special tokens.
+//!
 
 #[macro_use]
 extern crate lazy_static;
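
For readers skimming this patch, here is a minimal sketch of how the four-step pipeline described in the new README and crate docs might be wired together in Rust. It is illustrative only: the module paths and method names used here (`BPE::from_file`, `Tokenizer::new`, `with_normalizer`, `with_pre_tokenizer`, `encode`) are assumptions drawn from later versions of the crate and may not match the API at version 0.0.11.

```rust
// Illustrative sketch of the Normalizer -> PreTokenizer -> Model -> PostProcessor
// pipeline. NOTE: the names below are assumptions; check docs.rs/tokenizers for
// the exact API of the version you are using.
use tokenizers::models::bpe::BPE;
use tokenizers::normalizers::unicode::NFKC;
use tokenizers::pre_tokenizers::whitespace::Whitespace;
use tokenizers::Tokenizer;

fn main() -> Result<(), Box<dyn std::error::Error + Send + Sync>> {
    // 3. The Model does the actual tokenization, here BPE built from a trained
    //    vocab.json / merges.txt pair.
    let bpe = BPE::from_file("vocab.json", "merges.txt").build()?;

    let mut tokenizer = Tokenizer::new(bpe);
    // 1. The Normalizer applies a Unicode normalization standard (NFKC here).
    tokenizer.with_normalizer(NFKC::default());
    // 2. The PreTokenizer splits the raw text, most commonly on whitespace.
    tokenizer.with_pre_tokenizer(Whitespace::default());
    // 4. A PostProcessor (e.g. one adding model-specific special tokens) would be
    //    attached here in the same way.

    // The pipeline finally outputs an Encoding (tokens, ids, offsets, ...).
    let encoding = tokenizer.encode("Hello, how are you?", false)?;
    println!("{:?}", encoding.get_tokens());
    Ok(())
}
```

The numbered comments mirror the four pipeline steps listed in the README; swapping `BPE` for `WordPiece`, or `NFKC` for `NFD`, only changes which component is plugged into each slot.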