Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)

Doc - Improve quicktour support for multi-lang
@@ -14,20 +14,21 @@ class RustRef:
         parts = text.split("::")
 
         if text.startswith("~"):
-            content = parts[-1]
+            title = parts[-1]
             parts[0] = parts[0][1:]
         else:
             content = text
         link = self.base_link()
 
         if doctype == "struct":
-            link += self.make_struct_link(parts)
+            l, title = self.make_struct_link(parts, title)
         if doctype == "func":
-            link += self.make_func_link(parts)
+            l, title = self.make_func_link(parts, title)
         if doctype == "meth":
-            link += self.make_meth_link(parts)
+            l, title = self.make_meth_link(parts, title)
+        link += l
 
-        node = nodes.reference(internal=False, refuri=link, text=content)
+        node = nodes.reference(internal=False, refuri=link, text=title)
         wrapper = nodes.literal(classes=["xref"])
         wrapper += node
 
@@ -36,7 +37,7 @@ class RustRef:
     def base_link(self):
         return f"https://docs.rs/tokenizers/{rust_version}"
 
-    def make_struct_link(self, parts):
+    def make_struct_link(self, parts, title):
         link = ""
         struct_name = parts[-1]
         path = parts[:-1]
@@ -45,9 +46,9 @@ class RustRef:
             link += f"/{p}"
         link += f"/struct.{struct_name}.html"
 
-        return link
+        return link, title
 
-    def make_func_link(self, parts):
+    def make_func_link(self, parts, title):
         link = ""
         fn_name = parts[-1]
 
@@ -56,17 +57,20 @@ class RustRef:
             link += f"/{p}"
         link += f"/fn.{fn_name}.html"
 
-        return link
+        return link, title
 
-    def make_meth_link(self, parts):
+    def make_meth_link(self, parts, title):
         meth_name = parts[-1]
         if meth_name.endswith("()"):
             meth_name = meth_name[:-2]
 
-        link = self.make_struct_link(parts[:-1])
+        link, title = self.make_struct_link(parts[:-1], title)
         link += f"#method.{meth_name}"
 
-        return link
+        if not title.endswith(")"):
+            title += "()"
+
+        return link, title
 
 
 def setup(app):
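For readers following the change above: a minimal standalone sketch (not part of the commit) of how the updated RustRef logic turns a method reference into a docs.rs URL plus a display title. The "latest" value stands in for the rust_version imported from the Sphinx conf.

    # Sketch of the new behaviour: the link is built from the struct path, and the
    # display title now always gets trailing "()" for methods.
    rust_version = "latest"

    def resolve_meth(text):
        parts = text.lstrip("~").split("::")
        title = parts[-1] if text.startswith("~") else text

        meth_name = parts[-1]
        if meth_name.endswith("()"):
            meth_name = meth_name[:-2]

        link = f"https://docs.rs/tokenizers/{rust_version}"
        for p in parts[:-2]:
            link += f"/{p}"
        link += f"/struct.{parts[-2]}.html#method.{meth_name}"

        if not title.endswith(")"):
            title += "()"
        return link, title

    # resolve_meth("~tokenizers::tokenizer::Tokenizer::encode") returns
    # ("https://docs.rs/tokenizers/latest/tokenizers/tokenizer/struct.Tokenizer.html#method.encode", "encode()")

This is why the role targets in the entities below can drop their trailing "()": the extension now appends it to the rendered title itself.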
@@ -1,4 +1,5 @@
 Documentation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-The Rust API has not been documented yet.
+The Rust API Reference is available directly on the `Docs.rs <https://docs.rs/tokenizers>`__
+website.
@@ -55,6 +55,18 @@ Main features:
         :meth:`~tokenizers.Tokenizer.save`
     Tokenizer.from_file
         :meth:`~tokenizers.Tokenizer.from_file`
+    Tokenizer.encode
+        :meth:`~tokenizers.Tokenizer.encode`
+    Tokenizer.encode_batch
+        :meth:`~tokenizers.Tokenizer.encode_batch`
+    Tokenizer.token_to_id
+        :meth:`~tokenizers.Tokenizer.token_to_id`
+    Tokenizer.enable_padding
+        :meth:`~tokenizers.Tokenizer.enable_padding`
+    Encoding
+        :class:`~tokenizers.Encoding`
+    TemplateProcessing
+        :class:`~tokenizers.processors.TemplateProcessing`
 
 .. entities:: rust
 
@@ -67,8 +79,20 @@ Main features:
     Tokenizer
         :rust:struct:`~tokenizers::tokenizer::Tokenizer`
     Tokenizer.train
-        :rust:meth:`~tokenizers::tokenizer::Tokenizer::train()`
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::train`
     Tokenizer.save
-        :rust:meth:`~tokenizers::tokenizer::Tokenizer::save()`
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::save`
     Tokenizer.from_file
-        :rust:meth:`~tokenizers::tokenizer::Tokenizer::from_file()`
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::from_file`
+    Tokenizer.encode
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::encode`
+    Tokenizer.encode_batch
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::encode_batch`
+    Tokenizer.token_to_id
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::token_to_id`
+    Tokenizer.enable_padding
+        :rust:meth:`~tokenizers::tokenizer::Tokenizer::enable_padding`
+    Encoding
+        :rust:struct:`~tokenizers::tokenizer::Encoding`
+    TemplateProcessing
+        :rust:struct:`~tokenizers::processors::template::TemplateProcessing`
@@ -4,8 +4,10 @@ Quicktour
 Let's have a quick look at the 🤗 Tokenizers library features. The library provides an
 implementation of today's most used tokenizers that is both easy to use and blazing fast.
 
-It can be used to instantiate a :ref:`pretrained tokenizer <pretrained>` but we will start our
-quicktour by building one from scratch and see how we can train it.
+.. only:: python
+
+    It can be used to instantiate a :ref:`pretrained tokenizer <pretrained>` but we will start our
+    quicktour by building one from scratch and see how we can train it.
 
 
 Build a tokenizer from scratch
@@ -34,6 +36,10 @@ Training the tokenizer
         :obj:`min_frequency`
     special_tokens
         :obj:`special_tokens`
+    unk_token
+        :obj:`unk_token`
+    pad_token
+        :obj:`pad_token`
 
 .. entities:: rust
 
@@ -45,6 +51,10 @@ Training the tokenizer
         :obj:`min_frequency`
     special_tokens
         :obj:`special_tokens`
+    unk_token
+        :obj:`unk_token`
+    pad_token
+        :obj:`pad_token`
 
 .. entities:: node
 
@@ -56,6 +66,10 @@ Training the tokenizer
         :obj:`minFrequency`
     special_tokens
         :obj:`specialTokens`
+    unk_token
+        :obj:`unkToken`
+    pad_token
+        :obj:`padToken`
 
 In this tour, we will build and train a Byte-Pair Encoding (BPE) tokenizer. For more information
 about the different type of tokenizers, check out this `guide
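To make the entities above concrete, here is a rough Python sketch of the training flow the quicktour walks through. The wikitext file paths are placeholders, and the train() argument order has changed across releases, so treat this as an assumption rather than the exact quicktour snippet.

    # Hedged sketch: build and train a BPE tokenizer with the special tokens
    # the quicktour uses; check the signature of train() for your installed version.
    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.trainers import BpeTrainer
    from tokenizers.pre_tokenizers import Whitespace

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()
    trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

    files = [f"wikitext-103-raw/wiki.{split}.raw" for split in ("train", "valid", "test")]
    tokenizer.train(files, trainer)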
@@ -158,7 +172,7 @@ to use:
 
 This should only take a few seconds to train our tokenizer on the full wikitext dataset! Once this
 is done, we need to save the model and reinstantiate it with the unknown token, or this token won't
-be used. This will be simplified in a further release, to let you set the :obj:`unk_token` when
+be used. This will be simplified in a further release, to let you set the :entity:`unk_token` when
 first instantiating the model.
 
 .. only:: python
@@ -219,7 +233,7 @@ Using the tokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 Now that we have trained a tokenizer, we can use it on any text we want with the
-:meth:`~tokenizers.Tokenizer.encode` method:
+:entity:`Tokenizer.encode` method:
 
 .. only:: python
 
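For reference, the encode call this passage points at looks roughly like this in the Python binding; the sample sentence mirrors the one used throughout the quicktour, and the printed values are illustrative.

    # Illustrative use of Tokenizer.encode; actual tokens depend on the trained vocabulary.
    output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
    print(output.tokens)
    # e.g. ["Hello", ",", "y", "'", "all", "!", "How", "are", "you", "[UNK]", "?"]
    print(output.offsets[9])
    # e.g. (26, 27), the span of the original text that produced the "[UNK]" token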
@@ -238,10 +252,10 @@ Now that we have trained a tokenizer, we can use it on any text we want with the
     :dedent: 4
 
 This applied the full pipeline of the tokenizer on the text, returning an
-:class:`~tokenizers.Encoding` object. To learn more about this pipeline, and how to apply (or
+:entity:`Encoding` object. To learn more about this pipeline, and how to apply (or
 customize) parts of it, check out :doc:`this page <pipeline>`.
 
-This :class:`~tokenizers.Encoding` object then has all the attributes you need for your deep
+This :entity:`Encoding` object then has all the attributes you need for your deep
 learning model (or other). The :obj:`tokens` attribute contains the segmentation of your text in
 tokens:
 
@@ -282,7 +296,7 @@ tokenizer's vocabulary:
 
 An important feature of the 🤗 Tokenizers library is that it comes with full alignment tracking,
 meaning you can always get the part of your original sentence that corresponds to a given token.
-Those are stored in the :obj:`offsets` attribute of our :class:`~tokenizers.Encoding` object. For
+Those are stored in the :obj:`offsets` attribute of our :entity:`Encoding` object. For
 instance, let's assume we would want to find back what caused the :obj:`"[UNK]"` token to appear,
 which is the token at index 9 in the list, we can just ask for the offset at the index:
 
@@ -324,13 +338,13 @@ Post-processing
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 We might want our tokenizer to automatically add special tokens, like :obj:`"[CLS]"` or
-:obj:`"[SEP]"`. To do this, we use a post-processor. :class:`~tokenizers.TemplateProcessing` is the
+:obj:`"[SEP]"`. To do this, we use a post-processor. :entity:`TemplateProcessing` is the
 most commonly used, you just have so specify a template for the processing of single sentences and
 pairs of sentences, along with the special tokens and their IDs.
 
 When we built our tokenizer, we set :obj:`"[CLS]"` and :obj:`"[SEP]"` in positions 1 and 2 of our
 list of special tokens, so this should be their IDs. To double-check, we can use the
-:meth:`~tokenizers.Tokenizer.token_to_id` method:
+:entity:`Tokenizer.token_to_id` method:
 
 .. only:: python
 
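As a concrete sketch of that post-processing setup in the Python API (token IDs looked up with token_to_id rather than hard-coded; the template strings match the ones the quicktour uses):

    # Hedged sketch: attach a TemplateProcessing post-processor.
    from tokenizers.processors import TemplateProcessing

    cls_id = tokenizer.token_to_id("[CLS]")   # expected to be 1 with the trainer above
    sep_id = tokenizer.token_to_id("[SEP]")   # expected to be 2

    tokenizer.post_processor = TemplateProcessing(
        single="[CLS] $A [SEP]",
        pair="[CLS] $A [SEP] $B:1 [SEP]:1",
        special_tokens=[("[CLS]", cls_id), ("[SEP]", sep_id)],
    )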
@@ -397,7 +411,7 @@ To check out this worked properly, let's try to encode the same sentence as before:
     :dedent: 4
 
 To check the results on a pair of sentences, we just pass the two sentences to
-:meth:`~tokenizers.Tokenizer.encode`:
+:entity:`Tokenizer.encode`:
 
 .. only:: python
 
@@ -433,14 +447,13 @@ You can then check the type IDs attributed to each token is correct with
     :end-before: END print_type_ids
     :dedent: 4
 
-If you save your tokenizer with :meth:`~tokenizers.Tokenizer.save`, the post-processor will be saved
-along.
+If you save your tokenizer with :entity:`Tokenizer.save`, the post-processor will be saved along.
 
 Encoding multiple sentences in a batch
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 To get the full speed of the 🤗 Tokenizers library, it's best to process your texts by batches by
-using the :meth:`~tokenizers.Tokenizer.encode_batch` method:
+using the :entity:`Tokenizer.encode_batch` method:
 
 .. only:: python
 
@@ -458,11 +471,11 @@ using the :meth:`~tokenizers.Tokenizer.encode_batch` method:
     :end-before: END encode_batch
     :dedent: 4
 
-The output is then a list of :class:`~tokenizers.Encoding` objects like the ones we saw before. You
+The output is then a list of :entity:`Encoding` objects like the ones we saw before. You
 can process together as many texts as you like, as long as it fits in memory.
 
 To process a batch of sentences pairs, pass two lists to the
-:meth:`~tokenizers.Tokenizer.encode_batch` method: the list of sentences A and the list of sentences
+:entity:`Tokenizer.encode_batch` method: the list of sentences A and the list of sentences
 B:
 
 .. only:: python
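A short, hedged illustration of both call patterns in the Python binding (the sentences are arbitrary; each item in the pair batch is an (A, B) pair):

    # Batch of single sentences
    outputs = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])

    # Batch of sentence pairs: sentences A and sentences B combined pairwise
    pair_outputs = tokenizer.encode_batch(
        [("Hello, y'all!", "How are you 😁 ?"), ("Hello to you too!", "I'm fine, thank you!")]
    )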
@@ -482,9 +495,9 @@ B:
     :dedent: 4
 
 When encoding multiple sentences, you can automatically pad the outputs to the longest sentence
-present by using :meth:`~tokenizers.Tokenizer.enable_padding`, with the :obj:`pad_token` and its ID
+present by using :entity:`Tokenizer.enable_padding`, with the :entity:`pad_token` and its ID
 (which we can double-check the id for the padding token with
-:meth:`~tokenizers.Tokenizer.token_to_id` like before):
+:entity:`Tokenizer.token_to_id` like before):
 
 .. only:: python
 
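In the Python binding, that padding setup looks roughly like this (illustrative values; the printed attention mask is an example):

    # Hedged sketch: enable padding with the [PAD] token's id, then batch-encode.
    pad_id = tokenizer.token_to_id("[PAD]")
    tokenizer.enable_padding(pad_id=pad_id, pad_token="[PAD]")

    outputs = tokenizer.encode_batch(["Hello, y'all!", "How are you 😁 ?"])
    print(outputs[1].attention_mask)
    # e.g. [1, 1, 1, 1, 1, 1, 1, 0] once the shorter sentence has been padded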
@@ -542,25 +555,27 @@ In this case, the `attention mask` generated by the tokenizer takes the padding
 
 .. _pretrained:
 
-Using a pretrained tokenizer
-----------------------------------------------------------------------------------------------------
-
-You can also use a pretrained tokenizer directly in, as long as you have its vocabulary file. For
-instance, here is how to get the classic pretrained BERT tokenizer:
-
-.. code-block:: python
-
-    from tokenizers import ByteLevelBPETokenizer
-
-    tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
-
-as long as you have downloaded the file `bert-base-uncased-vocab.txt` with
-
-.. code-block:: bash
-
-    wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
-
-.. note::
-
-    Better support for pretrained tokenizers is coming in a next release, so expect this API to
-    change soon.
+.. only:: python
+
+    Using a pretrained tokenizer
+    ----------------------------------------------------------------------------------------------------
+
+    You can also use a pretrained tokenizer directly in, as long as you have its vocabulary file. For
+    instance, here is how to get the classic pretrained BERT tokenizer:
+
+    .. code-block:: python
+
+        from tokenizers import ByteLevelBPETokenizer
+
+        tokenizer = BertWordPieceTokenizer("bert-base-uncased-vocab.txt", lowercase=True)
+
+    as long as you have downloaded the file `bert-base-uncased-vocab.txt` with
+
+    .. code-block:: bash
+
+        wget https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt
+
+    .. note::
+
+        Better support for pretrained tokenizers is coming in a next release, so expect this API to
+        change soon.