Python - Improve pre-tokenizers docs

This commit is contained in:
Anthony MOI
2020-11-20 17:17:46 -05:00
committed by Anthony MOI
parent 5842b3db73
commit 933a2a9c99
3 changed files with 375 additions and 89 deletions

View File

@@ -9,12 +9,37 @@ class PreTokenizer:
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
@@ -30,12 +55,37 @@ class BertPreTokenizer(PreTokenizer):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
@@ -47,11 +97,9 @@ class ByteLevel(PreTokenizer):
with a corresponding representation, as well as splitting into words.
Args:
add_prefix_space: (`optional`) boolean:
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
Returns:
PreTokenizer
"""
def __init__(self, add_prefix_space=True):
@@ -62,18 +110,46 @@ class ByteLevel(PreTokenizer):
Returns the alphabet used by this PreTokenizer.
Since the ByteLevel works as its name suggests, at the byte level, it
encodes any byte to one visible character. This means that there is a
encodes each byte value to a unique visible character. This means that there is a
total of 256 different characters composing this alphabet.
Returns:
:obj:`List[str]`: A list of characters that compose the alphabet
"""
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
@@ -88,34 +164,90 @@ class CharDelimiterSplit(PreTokenizer):
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
class Digits(PreTokenizer):
"""
This pre-tokenizer simply splits using the digits in separate tokens
Args:
individual_digits: bool:
If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please"
If set to False, digits will grouped "Call 123 please" -> "Call ", "123", " please"
individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
If set to True, digits will each be separated as follows::
"Call 123 please" -> "Call ", "1", "2", "3", " please"
If set to False, digits will grouped as follows::
"Call 123 please" -> "Call ", "123", " please"
"""
def __init__(self, individual_digits=False):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
@@ -125,26 +257,52 @@ class Metaspace(PreTokenizer):
This pre-tokenizer replaces any whitespace by the provided replacement character.
It then tries to split on these spaces.
Args:
replacement: str:
replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
The replacement character. Must be exactly one character. By default we
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
add_prefix_space: boolean:
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
Whether to add a space to the first word if there isn't already one. This
lets us treat `hello` exactly like `say hello`.
"""
def __init__(self, replacement="", add_prefix_space=True):
def __init__(self, replacement="_", add_prefix_space=True):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
@@ -157,12 +315,37 @@ class Punctuation(PreTokenizer):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
@@ -175,12 +358,37 @@ class Sequence(PreTokenizer):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
@@ -196,12 +404,37 @@ class UnicodeScripts(PreTokenizer):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
@@ -214,12 +447,37 @@ class Whitespace(PreTokenizer):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass
@@ -232,11 +490,36 @@ class WhitespaceSplit(PreTokenizer):
pass
def pre_tokenize(self, pretok):
"""
Pre tokenize the given PreTokenizedString in-place
Pre-tokenize a :class:`~tokenizers.PyPreTokenizedString` in-place
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
keep track of the pre-tokenization, and leverage the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
the pre-tokenization of a raw string, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
Args:
pretok (:class:`~tokenizers.PreTokenizedString):
The pre-tokenized string on which to apply this
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
"""
pass
def pre_tokenize_str(self, sequence):
"""
Pre tokenize the given sequence
Pre tokenize the given string
This method provides a way to visualize the effect of a
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
alignment, nor does it provide all the capabilities of the
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
Args:
sequence (:obj:`str`):
A string to pre-tokeize
Returns:
:obj:`List[Tuple[str, Offsets]]`:
A list of tuple with the pre-tokenized parts and their offsets
"""
pass