diff --git a/.gitignore b/.gitignore
index 68add400..6141202f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,7 +10,8 @@ Cargo.lock
 /data
 tokenizers/data
 bindings/python/tests/data
-/docs
+docs/build/
+docs/make.bat
 
 __pycache__
 pip-wheel-metadata
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 00000000..d0c3cbf1
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/source/conf.py b/docs/source/conf.py
new file mode 100644
index 00000000..7edca6fc
--- /dev/null
+++ b/docs/source/conf.py
@@ -0,0 +1,56 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# This file only contains a selection of the most common options. For a full
+# list see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+# import os
+# import sys
+# sys.path.insert(0, os.path.abspath('.'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = "tokenizers"
+copyright = "2020, huggingface"
+author = "huggingface"
+
+# The full version, including alpha/beta/rc tags
+release = "0.9.0"
+
+
+# -- General configuration ---------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    "sphinx_tabs.tabs",
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ["_templates"]
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+html_theme = "sphinx_rtd_theme"
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ["_static"]
diff --git a/docs/source/index.rst b/docs/source/index.rst
new file mode 100644
index 00000000..f564bf5e
--- /dev/null
+++ b/docs/source/index.rst
@@ -0,0 +1,79 @@
+.. tokenizers documentation master file, created by
+   sphinx-quickstart on Fri Sep 25 14:32:54 2020.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to the tokenizers documentation!
+========================================
+
+.. toctree::
+
+   tokenizer_blocks
+
+Getting started
+===============
+
+``tokenizers`` provides an implementation of today's most used tokenizers, with a focus on
+performance and versatility.
+
+Main features:
+--------------
+
+ - Train new vocabularies and tokenize, using today's most used tokenizers.
+ - Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes
+   less than 20 seconds to tokenize a GB of text on a server's CPU.
+ - Easy to use, but also extremely versatile.
+ - Designed for research and production.
+ - Normalization comes with alignment tracking. It's always possible to get the part of the
+   original sentence that corresponds to a given token.
+ - Does all the pre-processing: truncation, padding, and adding the special tokens your model needs.
+ - Bindings to Rust, Python and Node.
+
+Load an existing tokenizer:
+---------------------------
+
+.. tabs::
+
+   .. group-tab:: Rust
+
+      .. literalinclude:: ../../tokenizers/examples/load.rs
+         :language: rust
+         :emphasize-lines: 4
+
+   .. group-tab:: Python
+
+      .. literalinclude:: ../../bindings/python/tests/examples/test_load.py
+         :language: python
+         :emphasize-lines: 4
+
+   .. group-tab:: Node
+
+      .. literalinclude:: ../../bindings/node/examples/load.test.ts
+         :language: typescript
+         :emphasize-lines: 11
+
+
+Train a tokenizer:
+------------------
+
+A small guide on :ref:`how to create a Tokenizer <tokenizer_blocks>`.
+
+.. tabs::
+
+   .. group-tab:: Rust
+
+      .. literalinclude:: ../../tokenizers/examples/train.rs
+         :language: rust
+
+   .. group-tab:: Python
+
+      .. literalinclude:: ../../bindings/python/tests/examples/test_train.py
+         :language: python
+
+   .. group-tab:: Node
+
+      .. literalinclude:: ../../bindings/node/examples/train.test.ts
+         :language: typescript
diff --git a/docs/source/tokenizer_blocks.rst b/docs/source/tokenizer_blocks.rst
new file mode 100644
index 00000000..2f200a0a
--- /dev/null
+++ b/docs/source/tokenizer_blocks.rst
@@ -0,0 +1,135 @@
+.. _tokenizer_blocks:
+
+Models
+======
+
+Models are the core algorithms used by tokenizers to turn the input into tokens.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Name
+     - Description
+   * - BPE
+     - Works by looking at the most frequent pairs in a dataset and iteratively merging them into new tokens
+   * - Unigram
+     - Works by building a suffix array and using an EM algorithm to find the most suitable tokens
+   * - WordPiece
+     - ...
+   * - WordLevel
+     - ...
+
+
+Normalizers
+===========
+
+A normalizer takes a unicode string as input and modifies it to make it more uniform for the
+underlying algorithm, usually by fixing unicode quirks. The specificity of ``tokenizers`` is that
+we keep track of all offsets, so we know how a string was normalized, which is especially useful
+to debug a tokenizer.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Name
+     - Description
+     - Example
+   * - NFD
+     - NFD unicode normalization
+     -
+   * - NFKD
+     - NFKD unicode normalization
+     -
+   * - NFC
+     - NFC unicode normalization
+     -
+   * - NFKC
+     - NFKC unicode normalization
+     -
+   * - Lowercase
+     - Replaces all uppercase characters with their lowercase equivalent
+     - "HELLO ὈΔΥΣΣΕΎΣ" -> "hello ὀδυσσεύς"
+   * - Strip
+     - Removes all whitespace characters on both sides of the input
+     - " hi " -> "hi"
+   * - StripAccents
+     - Removes all accent symbols in unicode (to be used with NFD for consistency)
+     - "é" -> "e"
+   * - Nmt
+     - Removes some control characters and zero-width characters
+     - "\u200d" -> ""
+   * - Replace
+     - Replaces a custom string or regexp with the given content
+     - Replace("a", "e")("banana") -> "benene"
+   * - Sequence
+     - Composes multiple normalizers, running them in order
+     - Sequence([Nmt(), NFKC()])
+
+
+Pre tokenizers
+==============
+
+A pre-tokenizer splits the input string *before* it reaches the model, often for efficiency.
+It can also replace some characters.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Name
+     - Description
+     - Example
+   * - ByteLevel
+     - Splits on spaces while remapping all bytes to a set of visible characters (used in GPT-2)
+     - "Hello my friend, how are you?" -> "Hello", "Ġmy", "Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"
+   * - Whitespace
+     - Splits on word boundaries
+     - "Hello there!" -> "Hello", "there", "!"
+   * - WhitespaceSplit
+     - Splits on any whitespace character
+     - "Hello there!" -> "Hello", "there!"
+   * - Punctuation
+     - Isolates all punctuation characters
+     - "Hello?" -> "Hello", "?"
+   * - Metaspace
+     - Splits on whitespace and replaces it with a special character
+     - Metaspace("_", false)("Hello there") -> "Hello", "_there"
+   * - CharDelimiterSplit
+     - Splits on a given character
+     - CharDelimiterSplit("x")("Helloxthere") -> "Hello", "there"
+   * - Sequence
+     - Composes multiple pre-tokenizers, running them in order
+     - Sequence([Punctuation(), WhitespaceSplit()])
+
+
+Decoders
+========
+
+Because some normalizers and pre-tokenizers change characters, decoders revert those changes to
+get back a readable string.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Name
+     - Description
+   * - ByteLevel
+     - Reverts the ByteLevel pre-tokenizer
+   * - Metaspace
+     - Reverts the Metaspace pre-tokenizer
+
+
+PostProcessor
+=============
+
+After the whole pipeline, we sometimes want to insert some special markers before feeding the
+tokenized string to a model, like "``<s> My horse is amazing </s>``".
+
+.. list-table::
+   :header-rows: 1
+
+   * - Name
+     - Description
+     - Example
+   * - TemplateProcessing
+     - It should cover most needs. ``seq_a`` describes the output for a single sentence, ``seq_b`` is used when encoding a pair of sentences
+     - TemplateProcessing(seq_a = ["<s>", "$0", "</s>"], seq_b = ["$1", "</s>"]) ("I like this", "but not this") -> "<s> I like this </s> but not this </s>"
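
For reference, the building blocks documented in ``tokenizer_blocks.rst`` compose roughly as
follows with the Python bindings. This is a minimal sketch rather than part of the changeset:
it assumes a recent version of the bindings (where ``Tokenizer.train`` takes the file list first,
then an optional trainer), and the corpus path is hypothetical.

.. code-block:: python

   from tokenizers import Tokenizer, normalizers, pre_tokenizers, decoders
   from tokenizers.models import BPE
   from tokenizers.trainers import BpeTrainer

   # Model: the core algorithm (here BPE, with an explicit unknown token).
   tokenizer = Tokenizer(BPE(unk_token="[UNK]"))

   # Normalizer: several normalizers composed with Sequence, as in the table above.
   tokenizer.normalizer = normalizers.Sequence(
       [normalizers.Nmt(), normalizers.NFKC(), normalizers.Lowercase()]
   )

   # Pre-tokenizer and the matching decoder: ByteLevel remaps bytes to visible
   # characters, and the ByteLevel decoder reverts that mapping.
   tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel()
   tokenizer.decoder = decoders.ByteLevel()

   # Train on a plain-text corpus (hypothetical path), then save to disk.
   trainer = BpeTrainer(vocab_size=30000, special_tokens=["[UNK]", "<s>", "</s>"])
   tokenizer.train(["data/corpus.txt"], trainer)
   tokenizer.save("tokenizer.json")

   encoding = tokenizer.encode("Hello my friend, how are you?")
   print(encoding.tokens)                 # pre-tokenized, byte-level tokens
   print(tokenizer.decode(encoding.ids))  # the ByteLevel decoder restores readable text

Swapping in the Metaspace pre-tokenizer and decoder, or a WordPiece or Unigram model, follows the
same pattern.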