diff --git a/docs/source/_ext/rust_doc.py b/docs/source/_ext/rust_doc.py
index 928c077d..699462d6 100644
--- a/docs/source/_ext/rust_doc.py
+++ b/docs/source/_ext/rust_doc.py
@@ -14,20 +14,21 @@ class RustRef:
         parts = text.split("::")

         if text.startswith("~"):
-            content = parts[-1]
+            title = parts[-1]
             parts[0] = parts[0][1:]
         else:
-            content = text
+            title = text

         link = self.base_link()
         if doctype == "struct":
-            link += self.make_struct_link(parts)
+            l, title = self.make_struct_link(parts, title)
         if doctype == "func":
-            link += self.make_func_link(parts)
+            l, title = self.make_func_link(parts, title)
         if doctype == "meth":
-            link += self.make_meth_link(parts)
+            l, title = self.make_meth_link(parts, title)
+        link += l

-        node = nodes.reference(internal=False, refuri=link, text=content)
+        node = nodes.reference(internal=False, refuri=link, text=title)
         wrapper = nodes.literal(classes=["xref"])
         wrapper += node

@@ -36,7 +37,7 @@ class RustRef:
     def base_link(self):
         return f"https://docs.rs/tokenizers/{rust_version}"

-    def make_struct_link(self, parts):
+    def make_struct_link(self, parts, title):
         link = ""
         struct_name = parts[-1]
         path = parts[:-1]
@@ -45,9 +46,9 @@ class RustRef:
             link += f"/{p}"
         link += f"/struct.{struct_name}.html"

-        return link
+        return link, title

-    def make_func_link(self, parts):
+    def make_func_link(self, parts, title):
         link = ""
         fn_name = parts[-1]

@@ -56,17 +57,20 @@ class RustRef:
             link += f"/{p}"
         link += f"/fn.{fn_name}.html"

-        return link
+        return link, title

-    def make_meth_link(self, parts):
+    def make_meth_link(self, parts, title):
         meth_name = parts[-1]
         if meth_name.endswith("()"):
             meth_name = meth_name[:-2]

-        link = self.make_struct_link(parts[:-1])
+        link, title = self.make_struct_link(parts[:-1], title)
         link += f"#method.{meth_name}"

-        return link
+        if not title.endswith(")"):
+            title += "()"
+
+        return link, title


 def setup(app):
diff --git a/docs/source/api/rust.inc b/docs/source/api/rust.inc
index 315a7b4b..181e1d58 100644
--- a/docs/source/api/rust.inc
+++ b/docs/source/api/rust.inc
@@ -1,4 +1,5 @@
 Documentation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The Rust API has not been documented yet.
+The Rust API Reference is available directly on the `Docs.rs <https://docs.rs/tokenizers>`__
+website.
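As a quick sanity check on the link format produced by the updated role helpers, here is a small standalone sketch (not part of the patch; ``"latest"`` is a hypothetical stand-in for the extension's real ``rust_version`` setting) that mirrors the ``make_struct_link``/``make_meth_link`` logic for a ``:rust:meth:`` reference:

.. code-block:: python

    # Standalone mirror of the method-link logic added above; "latest" stands in
    # for the extension's configured rust_version.
    rust_version = "latest"

    def meth_link(parts, title):
        meth_name = parts[-1].rstrip("()")           # "train" and "train()" both work
        path, struct_name = parts[:-2], parts[-2]    # e.g. ["tokenizers", "tokenizer"], "Tokenizer"
        link = f"https://docs.rs/tokenizers/{rust_version}"
        for p in path:
            link += f"/{p}"
        link += f"/struct.{struct_name}.html#method.{meth_name}"
        if not title.endswith(")"):
            title += "()"                            # methods are displayed with trailing parentheses
        return link, title

    print(meth_link("tokenizers::tokenizer::Tokenizer::train".split("::"), "train"))
    # -> (".../tokenizers/tokenizer/struct.Tokenizer.html#method.train", "train()")

This is why the ``()`` suffixes can be dropped from the ``:rust:meth:`` references in ``index.rst`` below: the role now appends the parentheses to the displayed title itself.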
diff --git a/docs/source/index.rst b/docs/source/index.rst
index ae163f91..19f24e8b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -55,6 +55,18 @@ Main features:
       :meth:`~tokenizers.Tokenizer.save`
     Tokenizer.from_file
       :meth:`~tokenizers.Tokenizer.from_file`
+    Tokenizer.encode
+      :meth:`~tokenizers.Tokenizer.encode`
+    Tokenizer.encode_batch
+      :meth:`~tokenizers.Tokenizer.encode_batch`
+    Tokenizer.token_to_id
+      :meth:`~tokenizers.Tokenizer.token_to_id`
+    Tokenizer.enable_padding
+      :meth:`~tokenizers.Tokenizer.enable_padding`
+    Encoding
+      :class:`~tokenizers.Encoding`
+    TemplateProcessing
+      :class:`~tokenizers.processors.TemplateProcessing`

 .. entities:: rust

@@ -67,8 +79,20 @@ Main features:
     Tokenizer
       :rust:struct:`~tokenizers::tokenizer::Tokenizer`
     Tokenizer.train
-      :rust:meth:`~tokenizers::tokenizer::Tokenizer::train()`
+      :rust:meth:`~tokenizers::tokenizer::Tokenizer::train`
     Tokenizer.save
-      :rust:meth:`~tokenizers::tokenizer::Tokenizer::save()`
+      :rust:meth:`~tokenizers::tokenizer::Tokenizer::save`
     Tokenizer.from_file
-      :rust:meth:`~tokenizers::tokenizer::Tokenizer::from_file()`
+      :rust:meth:`~tokenizers::tokenizer::Tokenizer::from_file`
+    Tokenizer.encode
+      :rust:meth:`~tokenizers::tokenizer::Tokenizer::encode`
+    Tokenizer.encode_batch
+      :rust:meth:`~tokenizers::tokenizer::Tokenizer::encode_batch`
+    Tokenizer.token_to_id
+      :rust:meth:`~tokenizers::tokenizer::Tokenizer::token_to_id`
+    Tokenizer.enable_padding
+      :rust:meth:`~tokenizers::tokenizer::Tokenizer::enable_padding`
+    Encoding
+      :rust:struct:`~tokenizers::tokenizer::Encoding`
+    TemplateProcessing
+      :rust:struct:`~tokenizers::processors::template::TemplateProcessing`
diff --git a/docs/source/quicktour.rst b/docs/source/quicktour.rst
index b05781ef..f46ca05d 100644
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -4,8 +4,10 @@ Quicktour
 Let's have a quick look at the 🤗 Tokenizers library features. The library provides an
 implementation of today's most used tokenizers that is both easy to use and blazing fast.

-It can be used to instantiate a :ref:`pretrained tokenizer <pretrained>` but we will start our
-quicktour by building one from scratch and see how we can train it.
+.. only:: python
+
+    It can be used to instantiate a :ref:`pretrained tokenizer <pretrained>` but we will start our
+    quicktour by building one from scratch and see how we can train it.


 Build a tokenizer from scratch
@@ -34,6 +36,10 @@ Training the tokenizer
       :obj:`min_frequency`
     special_tokens
       :obj:`special_tokens`
+    unk_token
+      :obj:`unk_token`
+    pad_token
+      :obj:`pad_token`

 .. entities:: rust

@@ -45,6 +51,10 @@ Training the tokenizer
       :obj:`min_frequency`
     special_tokens
       :obj:`special_tokens`
+    unk_token
+      :obj:`unk_token`
+    pad_token
+      :obj:`pad_token`

 .. entities:: node

@@ -56,6 +66,10 @@ Training the tokenizer
       :obj:`minFrequency`
     special_tokens
       :obj:`specialTokens`
+    unk_token
+      :obj:`unkToken`
+    pad_token
+      :obj:`padToken`

 In this tour, we will build and train a Byte-Pair Encoding (BPE) tokenizer. For more information
 about the different types of tokenizers, check out this `guide
@@ -158,7 +172,7 @@ to use:

 This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
 is done, we need to save the model and reinstantiate it with the unknown token, or this token won't
-be used. This will be simplified in a further release, to let you set the :obj:`unk_token` when
+be used. This will be simplified in a future release, to let you set the :entity:`unk_token` when
 first instantiating the model.

 .. only:: python
@@ -219,7 +233,7 @@ Using the tokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Now that we have trained a tokenizer, we can use it on any text we want with the
-:meth:`~tokenizers.Tokenizer.encode` method:
+:entity:`Tokenizer.encode` method:

 .. only:: python

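The snippets pulled in by the ``:start-after:``/``:end-before:`` include options in these hunks are not part of the patch itself. As a minimal, illustrative Python sketch of the call being described (the save path is an assumption, standing in for wherever the tokenizer trained earlier was saved):

.. code-block:: python

    from tokenizers import Tokenizer

    # Reload the tokenizer trained earlier in the quicktour (file name is illustrative).
    tokenizer = Tokenizer.from_file("tokenizer-wiki.json")

    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
    print(output.tokens)  # the segmentation of the text into tokens
    print(output.ids)     # the corresponding ids in the tokenizer's vocabulary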
@@ -238,10 +252,10 @@ Now that we have trained a tokenizer, we can use it on any text we want with the
         :dedent: 4

 This applied the full pipeline of the tokenizer on the text, returning an
-:class:`~tokenizers.Encoding` object. To learn more about this pipeline, and how to apply (or
+:entity:`Encoding` object. To learn more about this pipeline, and how to apply (or
 customize) parts of it, check out :doc:`this page <pipeline>`.

-This :class:`~tokenizers.Encoding` object then has all the attributes you need for your deep
+This :entity:`Encoding` object then has all the attributes you need for your deep
 learning model (or other). The :obj:`tokens` attribute contains the segmentation of your text in
 tokens:

@@ -282,7 +296,7 @@ tokenizer's vocabulary:

 An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
 meaning you can always get the part of your original sentence that corresponds to a given token.
-Those are stored in the :obj:`offsets` attribute of our :class:`~tokenizers.Encoding` object. For
+Those are stored in the :obj:`offsets` attribute of our :entity:`Encoding` object. For
 instance, let's assume we would want to find back what caused the :obj:`"[UNK]"` token to appear,
 which is the token at index 9 in the list, we can just ask for the offset at the index:

@@ -324,13 +338,13 @@ Post-processing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 We might want our tokenizer to automatically add special tokens, like :obj:`"[CLS]"` or
-:obj:`"[SEP]"`. To do this, we use a post-processor. :class:`~tokenizers.TemplateProcessing` is the
+:obj:`"[SEP]"`. To do this, we use a post-processor. :entity:`TemplateProcessing` is the
 most commonly used, you just have to specify a template for the processing of single sentences and
 pairs of sentences, along with the special tokens and their IDs.

 When we built our tokenizer, we set :obj:`"[CLS]"` and :obj:`"[SEP]"` in positions 1 and 2 of our
 list of special tokens, so this should be their IDs. To double-check, we can use the
-:meth:`~tokenizers.Tokenizer.token_to_id` method:
+:entity:`Tokenizer.token_to_id` method:

 .. only:: python

@@ -397,7 +411,7 @@ To check out this worked properly, let's try to encode the same sentence as befo
         :dedent: 4

 To check the results on a pair of sentences, we just pass the two sentences to
-:meth:`~tokenizers.Tokenizer.encode`:
+:entity:`Tokenizer.encode`:

 .. only:: python

@@ -433,14 +447,13 @@ You can then check the type IDs attributed to each token is correct with
         :end-before: END print_type_ids
         :dedent: 4

-If you save your tokenizer with :meth:`~tokenizers.Tokenizer.save`, the post-processor will be saved
-along.
+If you save your tokenizer with :entity:`Tokenizer.save`, the post-processor will be saved along.

 Encoding multiple sentences in a batch
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 To get the full speed of the 🤗 Tokenizers library, it's best to process your texts by batches by
-using the :meth:`~tokenizers.Tokenizer.encode_batch` method:
+using the :entity:`Tokenizer.encode_batch` method:

 .. only:: python
@@ -458,11 +471,11 @@ using the :entity:`Tokenizer.encode_batch` method:
         :end-before: END encode_batch
         :dedent: 4

-The output is then a list of :class:`~tokenizers.Encoding` objects like the ones we saw before. You
+The output is then a list of :entity:`Encoding` objects like the ones we saw before. You
 can process together as many texts as you like, as long as it fits in memory.

 To process a batch of sentence pairs, pass two lists to the
-:meth:`~tokenizers.Tokenizer.encode_batch` method: the list of sentences A and the list of sentences
+:entity:`Tokenizer.encode_batch` method: the list of sentences A and the list of sentences
 B:

 .. only:: python
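Again as an illustrative Python sketch rather than the included snippet (the saved-file path is assumed from earlier, and the exact pair-input form should be checked against the Python binding's included example), batch encoding looks roughly like this:

.. code-block:: python

    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_file("tokenizer-wiki.json")  # illustrative path

    # A batch of single sentences: one Encoding per input.
    encodings = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
    print(encodings[0].tokens)

    # A batch of sentence pairs, each item grouping sentence A with sentence B.
    pair_encodings = tokenizer.encode_batch([["Hello, y'all!", "How are you 😁 ?"]])
    print(pair_encodings[0].type_ids)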
@@ -482,9 +495,9 @@ B:
         :dedent: 4

 When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
-present by using :meth:`~tokenizers.Tokenizer.enable_padding`, with the :obj:`pad_token` and its ID
+present by using :entity:`Tokenizer.enable_padding`, with the :entity:`pad_token` and its ID
 (we can double-check the ID of the padding token with
-:meth:`~tokenizers.Tokenizer.token_to_id` like before):
+:entity:`Tokenizer.token_to_id` like before):

 .. only:: python

@@ -542,25 +555,27 @@ In this case, the `attention mask` generated by the tokenizer takes the padding

 .. _pretrained:

-Using a pretrained tokenizer
-----------------------------------------------------------------------------------------------------
+.. only:: python

-You can also use a pretrained tokenizer directly in, as long as you have its vocabulary file. For
-instance, here is how to get the classic pretrained BERT tokenizer:
+    Using a pretrained tokenizer
+    ----------------------------------------------------------------------------------------------------

-.. code-block:: python
+    You can also use a pretrained tokenizer directly, as long as you have its vocabulary file. For
+    instance, here is how to get the classic pretrained BERT tokenizer:

-    from tokenizers import ByteLevelBPETokenizer
+    .. code-block:: python

-    tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
+        from tokenizers import BertWordPieceTokenizer

-as long as you have downloaded the file `bert-base-uncased-vocab.txt` with
+        tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)

-.. code-block:: bash
+    as long as you have downloaded the file `bert-base-uncased-vocab.txt` with

-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
+    .. code-block:: bash

-.. note::
+        wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt

-    Better support for pretrained tokenizers is coming in a next release, so expect this API to
-    change soon.
+    .. note::
+
+        Better support for pretrained tokenizers is coming in a future release, so expect this API to
+        change soon.
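For completeness, a runnable version of the pretrained-tokenizer example above (assuming ``bert-base-uncased-vocab.txt`` has been downloaded with the ``wget`` command shown, and using the class that actually matches that vocabulary file) would look like:

.. code-block:: python

    from tokenizers import BertWordPieceTokenizer

    # Assumes bert-base-uncased-vocab.txt was fetched as shown in the patch above.
    tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)

    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
    print(output.tokens)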