mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00

Actually adding docs.

committed by Anthony MOI
parent 655809c718
commit 81bb4f6da3

3  .gitignore  vendored
@@ -10,7 +10,8 @@ Cargo.lock
/data
tokenizers/data
bindings/python/tests/data
/docs
docs/build/
docs/make.bat

__pycache__
pip-wheel-metadata

20  docs/Makefile  Normal file
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = source
BUILDDIR      = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

56  docs/source/conf.py  Normal file
@@ -0,0 +1,56 @@
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
# import os
# import sys
# sys.path.insert(0, os.path.abspath('.'))


# -- Project information -----------------------------------------------------

project = "tokenizers"
copyright = "2020, huggingface"
author = "huggingface"

# The full version, including alpha/beta/rc tags
release = "0.9.0"


# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    "sphinx_tabs.tabs",
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ["_templates"]

# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = "sphinx_rtd_theme"

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ["_static"]

79  docs/source/index.rst  Normal file
@@ -0,0 +1,79 @@
.. tokenizers documentation master file, created by
   sphinx-quickstart on Fri Sep 25 14:32:54 2020.
   You can adapt this file completely to your liking, but it should at least
   contain the root `toctree` directive.

Welcome to tokenizers's documentation!
======================================

.. toctree::

   tokenizer_blocks

Getting started
===============

``tokenizers`` provides an implementation of today's most used tokenizers, with a focus on
performance and versatility.

Main features:
--------------

- Train new vocabularies and tokenize, using today's most used tokenizers.
- Extremely fast (both training and tokenization), thanks to the Rust implementation. Takes
  less than 20 seconds to tokenize a GB of text on a server's CPU.
- Easy to use, but also extremely versatile.
- Designed for research and production.
- Normalization comes with alignment tracking. It's always possible to get the part of the
  original sentence that corresponds to a given token.
- Does all the pre-processing: truncation, padding, and adding the special tokens your model needs.
- Bindings to Rust, Python and Node.

Load an existing tokenizer:
---------------------------

.. tabs::

   .. group-tab:: Rust

      .. literalinclude:: ../../tokenizers/examples/load.rs
         :language: rust
         :emphasize-lines: 4

   .. group-tab:: Python

      .. literalinclude:: ../../bindings/python/tests/examples/test_load.py
         :language: python
         :emphasize-lines: 4

   .. group-tab:: Node

      .. literalinclude:: ../../bindings/node/examples/load.test.ts
         :language: typescript
         :emphasize-lines: 11
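
For a quick idea of what the Python tab amounts to, here is a hedged inline sketch (the
``tokenizer.json`` file name is hypothetical; the included example files above are the
authoritative versions):

.. code-block:: python

   from tokenizers import Tokenizer

   # Load a previously serialized tokenizer from disk.
   tokenizer = Tokenizer.from_file("tokenizer.json")

   # Encode a sentence; tokens and their offsets into the original text are available.
   encoding = tokenizer.encode("Hello, how are you?")
   print(encoding.tokens)
   print(encoding.offsets)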

Train a tokenizer:
------------------

A small guide to the available :ref:`Tokenizer building blocks <tokenizer_blocks>`.

.. tabs::

   .. group-tab:: Rust

      .. literalinclude:: ../../tokenizers/examples/train.rs
         :language: rust

   .. group-tab:: Python

      .. literalinclude:: ../../bindings/python/tests/examples/test_train.py
         :language: python

   .. group-tab:: Node

      .. literalinclude:: ../../bindings/node/examples/train.test.ts
         :language: typescript
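
Again as an inline Python sketch (assuming a recent version of the bindings; the training file
path is hypothetical, and the exact argument order of ``train`` has varied across releases):

.. code-block:: python

   from tokenizers import Tokenizer
   from tokenizers.models import BPE
   from tokenizers.pre_tokenizers import Whitespace
   from tokenizers.trainers import BpeTrainer

   # Build a tokenizer around an untrained BPE model.
   tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
   tokenizer.pre_tokenizer = Whitespace()

   # Train it on raw text files, then serialize it.
   trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]"])
   tokenizer.train(files=["data/corpus.txt"], trainer=trainer)
   tokenizer.save("tokenizer.json")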

135  docs/source/tokenizer_blocks.rst  Normal file
@@ -0,0 +1,135 @@
.. _tokenizer_blocks:

Models
======

Models are the core algorithms that tokenizers are built around.

.. list-table::
   :header-rows: 1

   * - Name
     - Description
   * - BPE
     - Works by looking at the most frequent pairs of tokens in a dataset, and iteratively merging them into new tokens
   * - Unigram
     - Works by building a suffix array and using an EM algorithm to find the most suitable tokens
   * - WordPiece
     - ...
   * - WordLevel
     - ...
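
For illustration, a minimal sketch (assuming the Python bindings) of how a model is plugged
into a ``Tokenizer``; the model is the piece you pick from the table above:

.. code-block:: python

   from tokenizers import Tokenizer
   from tokenizers.models import BPE

   # The model is the core algorithm the Tokenizer is built around.
   # This BPE model is still untrained at this point.
   tokenizer = Tokenizer(BPE(unk_token="[UNK]"))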

Normalizers
===========

A normalizer takes a unicode string as input, and modifies it to make it more uniform for the
underlying algorithm. It usually fixes some unicode quirks. A specificity of ``tokenizers`` is
that we keep track of all offsets, to know how a string was normalized, which is especially
useful to debug a tokenizer.

.. list-table::
   :header-rows: 1

   * - Name
     - Description
     - Example
   * - NFD
     - NFD unicode normalization
     -
   * - NFKD
     - NFKD unicode normalization
     -
   * - NFC
     - NFC unicode normalization
     -
   * - NFKC
     - NFKC unicode normalization
     -
   * - Lowercase
     - Replaces all uppercase characters with lowercase
     - "HELLO ὈΔΥΣΣΕΎΣ" -> "hello ὀδυσσεύς"
   * - Strip
     - Removes all whitespace characters from both ends of the input
     - " hi " -> "hi"
   * - StripAccents
     - Removes all accent symbols in unicode (to be used with NFD for consistency)
     - "é" -> "e"
   * - Nmt
     - Removes some control characters and zero-width characters
     - "\u200d" -> ""
   * - Replace
     - Replaces a custom string or regexp with the given content
     - Replace("a", "e")("banana") -> "benene"
   * - Sequence
     - Composes multiple normalizers
     - Sequence([Nmt(), NFKC()])
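
As a rough sketch (assuming a recent version of the Python bindings, where normalizers expose a
``normalize_str`` helper), composing normalizers looks like this:

.. code-block:: python

   from tokenizers import normalizers
   from tokenizers.normalizers import NFD, StripAccents, Lowercase

   # Compose several normalizers into one; offsets are tracked through each step.
   normalizer = normalizers.Sequence([NFD(), StripAccents(), Lowercase()])
   print(normalizer.normalize_str("Héllo hôw are ü?"))  # -> "hello how are u?"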

Pre tokenizers
==============

A pre-tokenizer splits the input string *before* it reaches the model; it is often used for
efficiency. It can also replace some characters.

.. list-table::
   :header-rows: 1

   * - Name
     - Description
     - Example
   * - ByteLevel
     - Splits on spaces but remaps all bytes into a visible range (used in GPT-2)
     - "Hello my friend, how are you?" -> "Hello", "Ġmy", "Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"
   * - Whitespace
     - Splits on word boundaries
     - "Hello there!" -> "Hello", "there", "!"
   * - WhitespaceSplit
     - Splits on spaces
     - "Hello there!" -> "Hello", "there!"
   * - Punctuation
     - Isolates all punctuation characters
     - "Hello?" -> "Hello", "?"
   * - Metaspace
     - Splits on spaces and replaces them with a special character
     - Metaspace("_", false)("Hello there") -> "Hello", "_there"
   * - CharDelimiterSplit
     - Splits on a given character
     - CharDelimiterSplit("x")("Helloxthere") -> "Hello", "there"
   * - Sequence
     - Composes multiple pre_tokenizers
     - Sequence([Punctuation(), WhitespaceSplit()])
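
A short sketch of how this looks from the Python bindings (assuming the ``pre_tokenize_str``
helper available in recent versions); each piece comes back with its offsets in the original
string:

.. code-block:: python

   from tokenizers.pre_tokenizers import Whitespace

   # Split on word boundaries, keeping the offsets of every piece.
   pre_tokenizer = Whitespace()
   print(pre_tokenizer.pre_tokenize_str("Hello there!"))
   # -> [("Hello", (0, 5)), ("there", (6, 11)), ("!", (11, 12))]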

Decoders
========

Because some normalizers and pre_tokenizers change some characters, we sometimes want to revert
those changes to get readable strings back.

.. list-table::
   :header-rows: 1

   * - Name
     - Description
   * - ByteLevel
     - Reverts the ByteLevel pre_tokenizer
   * - Metaspace
     - Reverts the Metaspace pre_tokenizer
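
For example, a hedged sketch with the Python bindings, undoing the byte-level remapping shown
in the pre-tokenizer table:

.. code-block:: python

   from tokenizers.decoders import ByteLevel

   # Map byte-level tokens back to a readable string ("Ġ" becomes a space).
   decoder = ByteLevel()
   print(decoder.decode(["Hello", "Ġmy", "Ġfriend"]))  # -> "Hello my friend"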

PostProcessor
=============

After the whole pipeline, we sometimes want to insert some specific markers before feeding
a tokenized string into a model, like "`<cls>` My horse is amazing `<eos>`".

.. list-table::
   :header-rows: 1

   * - Name
     - Description
     - Example
   * - TemplateProcessing
     - Should cover most needs. `seq_a` is a list of the outputs for a single sentence, `seq_b` is used when encoding two sentences
     - TemplateProcessing(seq_a = ["<cls>", "$0", "<eos>"], seq_b = ["$1", "<eos>"]) ("I like this", "but not this") -> "<cls> I like this <eos> but not this <eos>"
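
A minimal sketch with the released Python bindings; note that there the templates are passed as
``single``/``pair`` strings with declared special tokens, rather than the ``seq_a``/``seq_b``
notation used above, and the token ids below are placeholders:

.. code-block:: python

   from tokenizers.processors import TemplateProcessing

   # Templates for single sentences and sentence pairs; special tokens are (token, id) pairs.
   processor = TemplateProcessing(
       single="<cls> $A <eos>",
       pair="<cls> $A <eos> $B <eos>",
       special_tokens=[("<cls>", 1), ("<eos>", 2)],  # ids are made up for the sketch
   )
   # Typically attached to a tokenizer: tokenizer.post_processor = processor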