mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-08 05:38:23 +00:00
Python - Improve pre-tokenizers docs
This commit is contained in:
@@ -9,12 +9,37 @@ class PreTokenizer:
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -30,12 +55,37 @@ class BertPreTokenizer(PreTokenizer):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -47,11 +97,9 @@ class ByteLevel(PreTokenizer):
|
||||
with a corresponding representation, as well as splitting into words.
|
||||
|
||||
Args:
|
||||
add_prefix_space: (`optional`) boolean:
|
||||
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
Returns:
|
||||
PreTokenizer
|
||||
"""
|
||||
|
||||
def __init__(self, add_prefix_space=True):
|
||||
@@ -62,18 +110,46 @@ class ByteLevel(PreTokenizer):
|
||||
Returns the alphabet used by this PreTokenizer.
|
||||
|
||||
Since the ByteLevel works as its name suggests, at the byte level, it
|
||||
encodes any byte to one visible character. This means that there is a
|
||||
encodes each byte value to a unique visible character. This means that there is a
|
||||
total of 256 different characters composing this alphabet.
|
||||
|
||||
Returns:
|
||||
:obj:`List[str]`: A list of characters that compose the alphabet
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -88,34 +164,90 @@ class CharDelimiterSplit(PreTokenizer):
|
||||
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
class Digits(PreTokenizer):
|
||||
"""
|
||||
This pre-tokenizer simply splits using the digits in separate tokens
|
||||
|
||||
Args:
|
||||
individual_digits: bool:
|
||||
If set to True, digits will each be separated "Call 123 please" -> "Call ", "1", "2", "3", " please"
|
||||
If set to False, digits will grouped "Call 123 please" -> "Call ", "123", " please"
|
||||
individual_digits (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
||||
If set to True, digits will each be separated as follows::
|
||||
|
||||
"Call 123 please" -> "Call ", "1", "2", "3", " please"
|
||||
|
||||
If set to False, digits will be grouped as follows::
|
||||
|
||||
"Call 123 please" -> "Call ", "123", " please"
|
||||
"""
|
||||
|
||||
def __init__(self, individual_digits=False):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -125,26 +257,52 @@ class Metaspace(PreTokenizer):
|
||||
|
||||
This pre-tokenizer replaces any whitespace by the provided replacement character.
|
||||
It then tries to split on these spaces.
|
||||
|
||||
Args:
|
||||
replacement: str:
|
||||
replacement (:obj:`str`, `optional`, defaults to :obj:`▁`):
|
||||
The replacement character. Must be exactly one character. By default we
|
||||
use the `▁` (U+2581) meta symbol (Same as in SentencePiece).
|
||||
|
||||
add_prefix_space: boolean:
|
||||
add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`True`):
|
||||
Whether to add a space to the first word if there isn't already one. This
|
||||
lets us treat `hello` exactly like `say hello`.
|
||||
"""
|
||||
|
||||
def __init__(self, replacement="▁", add_prefix_space=True):
|
||||
def __init__(self, replacement="_", add_prefix_space=True):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -157,12 +315,37 @@ class Punctuation(PreTokenizer):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -175,12 +358,37 @@ class Sequence(PreTokenizer):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -196,12 +404,37 @@ class UnicodeScripts(PreTokenizer):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -214,12 +447,37 @@ class Whitespace(PreTokenizer):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
@@ -232,11 +490,36 @@ class WhitespaceSplit(PreTokenizer):
|
||||
pass
|
||||
def pre_tokenize(self, pretok):
|
||||
"""
|
||||
Pre tokenize the given PreTokenizedString in-place
|
||||
Pre-tokenize a :class:`~tokenizers.PreTokenizedString` in-place
|
||||
|
||||
This method allows to modify a :class:`~tokenizers.PreTokenizedString` to
|
||||
keep track of the pre-tokenization, and leverage the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you just want to see the result of
|
||||
the pre-tokenization of a raw string, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize_str`
|
||||
|
||||
Args:
|
||||
pretok (:class:`~tokenizers.PreTokenizedString`):
|
||||
The pre-tokenized string on which to apply this
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer`
|
||||
"""
|
||||
pass
|
||||
def pre_tokenize_str(self, sequence):
|
||||
"""
|
||||
Pre tokenize the given sequence
|
||||
Pre tokenize the given string
|
||||
|
||||
This method provides a way to visualize the effect of a
|
||||
:class:`~tokenizers.pre_tokenizers.PreTokenizer` but it does not keep track of the
|
||||
alignment, nor does it provide all the capabilities of the
|
||||
:class:`~tokenizers.PreTokenizedString`. If you need some of these, you can use
|
||||
:meth:`~tokenizers.pre_tokenizers.PreTokenizer.pre_tokenize`
|
||||
|
||||
Args:
|
||||
sequence (:obj:`str`):
|
||||
A string to pre-tokenize
|
||||
|
||||
Returns:
|
||||
:obj:`List[Tuple[str, Offsets]]`:
|
||||
A list of tuples with the pre-tokenized parts and their offsets
|
||||
"""
|
||||
pass
|
||||
|
||||
Reference in New Issue
Block a user