Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 08:15:49 +00:00)
Fix typos (#1715)
* Fix typos

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>

* Update docs/source/quicktour.rst

* Update docs/source-doc-builder/quicktour.mdx

---------

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
@@ -49,7 +49,7 @@ class CustomNormalizer:
 def normalize(self, normalized: NormalizedString):
 # Most of these can be replaced by a `Sequence` combining some provided Normalizer,
 # (ie Sequence([ NFKC(), Replace(Regex("\s+"), " "), Lowercase() ])
-# and it should be the prefered way. That being said, here is an example of the kind
+# and it should be the preferred way. That being said, here is an example of the kind
 # of things that can be done here:
 normalized.nfkc()
 normalized.filter(lambda char: not char.isnumeric())
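For reference, a minimal sketch of the `Sequence` normalizer mentioned in the comment above, assuming the `tokenizers` Python API (the numeric filter from the custom example has no built-in counterpart, so it is left out):

```python
from tokenizers import Regex, normalizers

# Roughly the "preferred way" referenced in the comment: compose provided
# normalizers instead of writing a custom one.
norm = normalizers.Sequence([
    normalizers.NFKC(),
    normalizers.Replace(Regex(r"\s+"), " "),
    normalizers.Lowercase(),
])
print(norm.normalize_str("Ｈello   WORLD"))  # "hello world"
```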
@@ -57,7 +57,7 @@ class ByteFallback(Decoder):
 ByteFallback Decoder
 ByteFallback is a simple trick which converts tokens looking like `<0x61>`
 to pure bytes, and attempts to make them into a string. If the tokens
-cannot be decoded you will get � instead for each inconvertable byte token
+cannot be decoded you will get � instead for each inconvertible byte token

 """
 def __init__(self):
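As an illustration of the behavior this docstring describes (a sketch, assuming the `tokenizers.decoders` Python API):

```python
from tokenizers import decoders

decoder = decoders.ByteFallback()
# <0x61>, <0x62>, <0x63> are the bytes for "a", "b", "c"
print(decoder.decode(["<0x61>", "<0x62>", "<0x63>"]))  # "abc"
# A lone invalid byte cannot form a valid string and decodes to the
# replacement character instead
print(decoder.decode(["<0xE2>"]))  # "�"
```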
@@ -389,7 +389,7 @@ class Nmt(Normalizer):
 class Precompiled(Normalizer):
 """
 Precompiled normalizer
-Don't use manually it is used for compatiblity for SentencePiece.
+Don't use manually it is used for compatibility for SentencePiece.
 """
 def __init__(self, precompiled_charsmap):
 pass
@@ -48,7 +48,7 @@ class BertPreTokenizer(PreTokenizer):
 BertPreTokenizer

 This pre-tokenizer splits tokens on spaces, and also on punctuation.
-Each occurence of a punctuation character will be treated separately.
+Each occurrence of a punctuation character will be treated separately.
 """
 def __init__(self):
 pass
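A quick sketch of the splitting behavior the docstring above describes, assuming the `tokenizers.pre_tokenizers` Python API:

```python
from tokenizers import pre_tokenizers

pre_tok = pre_tokenizers.BertPreTokenizer()
# Splits on whitespace, and isolates each punctuation character
print(pre_tok.pre_tokenize_str("Hey friend!"))
# [('Hey', (0, 3)), ('friend', (4, 10)), ('!', (10, 11))]
```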
@@ -325,7 +325,7 @@ class EncodingVisualizer:

 Returns:
 A list of length len(text) whose entry at index i is None if there is no annotation on
-charachter i or k, the index of the annotation that covers index i where k is with
+character i or k, the index of the annotation that covers index i where k is with
 respect to the list of annotations
 """
 annotation_map = [None] * len(text)
@@ -263,7 +263,7 @@ impl PyWordPieceDec {
 /// ByteFallback Decoder
 /// ByteFallback is a simple trick which converts tokens looking like `<0x61>`
 /// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get � instead for each inconvertable byte token
+/// cannot be decoded you will get � instead for each inconvertible byte token
 ///
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")]
 pub struct PyByteFallbackDec {}
@@ -23,7 +23,7 @@ use pyo3::wrap_pymodule;
 pub const VERSION: &str = env!("CARGO_PKG_VERSION");

 // For users using multiprocessing in python, it is quite easy to fork the process running
-// tokenizers, ending up with a deadlock because we internaly make use of multithreading. So
+// tokenizers, ending up with a deadlock because we internally make use of multithreading. So
 // we register a callback to be called in the event of a fork so that we can warn the user.
 #[cfg(target_family = "unix")]
 static mut REGISTERED_FORK_CALLBACK: bool = false;
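On the user side, the usual way to avoid the warning this comment describes is to disable the Rust-side parallelism before forking; a hedged sketch (`TOKENIZERS_PARALLELISM` is the switch named in the library's warning message, the rest is illustrative):

```python
import os

# Disable the internal thread pool before any tokenization happens in the
# parent process, so forked workers (e.g. a multiprocessing.Pool) cannot
# deadlock on it.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import multiprocessing as mp  # fork-based start method on Linux by default
# ... build the tokenizer and spawn workers as usual
```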
@@ -534,7 +534,7 @@ impl PyNmt {
 }

 /// Precompiled normalizer
-/// Don't use manually it is used for compatiblity for SentencePiece.
+/// Don't use manually it is used for compatibility for SentencePiece.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
 pub struct PyPrecompiled {}
 #[pymethods]
@@ -430,7 +430,7 @@ impl PyCharDelimiterSplit {
 /// BertPreTokenizer
 ///
 /// This pre-tokenizer splits tokens on spaces, and also on punctuation.
-/// Each occurence of a punctuation character will be treated separately.
+/// Each occurrence of a punctuation character will be treated separately.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
 pub struct PyBertPreTokenizer {}
 #[pymethods]
@@ -100,7 +100,7 @@ def pyi_file(obj, indent=""):
 string += function(obj, indent)

 elif inspect.isgetsetdescriptor(obj):
-# TODO it would be interesing to add the setter maybe ?
+# TODO it would be interesting to add the setter maybe ?
 string += f"{indent}@property\n"
 string += function(obj, indent, text_signature="(self)")
 else:
@@ -287,7 +287,7 @@ class TestUnigram:
 trainer.initial_alphabet = ["d", "z"]
 assert sorted(trainer.initial_alphabet) == ["d", "z"]

-def test_continuing_prefix_trainer_mistmatch(self):
+def test_continuing_prefix_trainer_mismatch(self):
 UNK = "[UNK]"
 special_tokens = [UNK]
 tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
@@ -25,8 +25,8 @@ The `Normalizer` is optional.
 | NFKC | NFKC unicode normalization | |
 | Lowercase | Replaces all uppercase to lowercase | Input: `HELLO ὈΔΥΣΣΕΎΣ` <br> Output: `hello`ὀδυσσεύς` |
 | Strip | Removes all whitespace characters on the specified sides (left, right or both) of the input | Input: `"`hi`"` <br> Output: `"hi"` |
-| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Ouput: `e` |
-| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Ouput: `"benene"` |
+| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Output: `e` |
+| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Output: `"benene"` |
 | BertNormalizer | Provides an implementation of the Normalizer used in the original BERT. Options that can be set are: <ul> <li>clean_text</li> <li>handle_chinese_chars</li> <li>strip_accents</li> <li>lowercase</li> </ul> | |
 | Sequence | Composes multiple normalizers that will run in the provided order | `Sequence([NFKC(), Lowercase()])` |
 </python>
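A few of the rows above, exercised directly (a sketch assuming the `tokenizers.normalizers` Python API):

```python
from tokenizers import normalizers

print(normalizers.Lowercase().normalize_str("HELLO THERE"))  # "hello there"
print(normalizers.Strip().normalize_str("  hi  "))           # "hi"
# StripAccents is meant to be combined with NFD, as the table notes
nfd_strip = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])
print(nfd_strip.normalize_str("é"))                          # "e"
```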
@@ -39,8 +39,8 @@ The `Normalizer` is optional.
 | NFKC | NFKC unicode normalization | |
 | Lowercase | Replaces all uppercase to lowercase | Input: `HELLO ὈΔΥΣΣΕΎΣ` <br> Output: `hello`ὀδυσσεύς` |
 | Strip | Removes all whitespace characters on the specified sides (left, right or both) of the input | Input: `"`hi`"` <br> Output: `"hi"` |
-| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Ouput: `e` |
-| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Ouput: `"benene"` |
+| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Output: `e` |
+| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Output: `"benene"` |
 | BertNormalizer | Provides an implementation of the Normalizer used in the original BERT. Options that can be set are: <ul> <li>clean_text</li> <li>handle_chinese_chars</li> <li>strip_accents</li> <li>lowercase</li> </ul> | |
 | Sequence | Composes multiple normalizers that will run in the provided order | `Sequence::new(vec![NFKC, Lowercase])` |
 </rust>
@@ -53,8 +53,8 @@ The `Normalizer` is optional.
 | NFKC | NFKC unicode normalization | |
 | Lowercase | Replaces all uppercase to lowercase | Input: `HELLO ὈΔΥΣΣΕΎΣ` <br> Output: `hello`ὀδυσσεύς` |
 | Strip | Removes all whitespace characters on the specified sides (left, right or both) of the input | Input: `"`hi`"` <br> Output: `"hi"` |
-| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Ouput: `e` |
-| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Ouput: `"benene"` |
+| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Output: `e` |
+| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Output: `"benene"` |
 | BertNormalizer | Provides an implementation of the Normalizer used in the original BERT. Options that can be set are: <ul> <li>cleanText</li> <li>handleChineseChars</li> <li>stripAccents</li> <li>lowercase</li> </ul> | |
 | Sequence | Composes multiple normalizers that will run in the provided order | |
 </node>
@@ -78,12 +78,12 @@ the ByteLevel)
 <python>
 | Name | Description | Example |
 | :--- | :--- | :--- |
-| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Ouput: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
+| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Output: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
 | Whitespace | Splits on word boundaries (using the following regular expression: `\w+|[^\w\s]+` | Input: `"Hello there!"` <br> Output: `"Hello", "there", "!"` |
 | WhitespaceSplit | Splits on any whitespace character | Input: `"Hello there!"` <br> Output: `"Hello", "there!"` |
-| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Ouput: `"Hello", "?"` |
-| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Ouput: `"Hello", "▁there"` |
-| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Ouput: `"Hello", "there"` |
+| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Output: `"Hello", "?"` |
+| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Output: `"Hello", "▁there"` |
+| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Output: `"Hello", "there"` |
 | Digits | Splits the numbers from any other characters. | Input: `"Hello123there"` <br> Output: ``"Hello", "123", "there"`` |
 | Split | Versatile pre-tokenizer that splits on provided pattern and according to provided behavior. The pattern can be inverted if necessary. <ul> <li>pattern should be either a custom string or regexp.</li> <li>behavior should be one of: <ul><li>removed</li><li>isolated</li><li>merged_with_previous</li><li>merged_with_next</li><li>contiguous</li></ul></li> <li>invert should be a boolean flag.</li> </ul> | Example with pattern = ` `, behavior = `"isolated"`, invert = `False`: <br> Input: `"Hello, how are you?"` <br> Output: `"Hello,", " ", "how", " ", "are", " ", "you?"` |
 | Sequence | Lets you compose multiple `PreTokenizer` that will be run in the given order | `Sequence([Punctuation(), WhitespaceSplit()])` |
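And a couple of the pre-tokenizers listed above, run directly (a sketch assuming the `tokenizers.pre_tokenizers` Python API):

```python
from tokenizers import pre_tokenizers

print(pre_tokenizers.Whitespace().pre_tokenize_str("Hello there!"))
# [('Hello', (0, 5)), ('there', (6, 11)), ('!', (11, 12))]
print(pre_tokenizers.Digits(individual_digits=False).pre_tokenize_str("Hello123there"))
# [('Hello', (0, 5)), ('123', (5, 8)), ('there', (8, 13))]
```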
@@ -91,12 +91,12 @@ the ByteLevel)
 <rust>
 | Name | Description | Example |
 | :--- | :--- | :--- |
-| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Ouput: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
+| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Output: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
 | Whitespace | Splits on word boundaries (using the following regular expression: `\w+|[^\w\s]+` | Input: `"Hello there!"` <br> Output: `"Hello", "there", "!"` |
 | WhitespaceSplit | Splits on any whitespace character | Input: `"Hello there!"` <br> Output: `"Hello", "there!"` |
-| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Ouput: `"Hello", "?"` |
-| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Ouput: `"Hello", "▁there"` |
-| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Ouput: `"Hello", "there"` |
+| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Output: `"Hello", "?"` |
+| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Output: `"Hello", "▁there"` |
+| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Output: `"Hello", "there"` |
 | Digits | Splits the numbers from any other characters. | Input: `"Hello123there"` <br> Output: ``"Hello", "123", "there"`` |
 | Split | Versatile pre-tokenizer that splits on provided pattern and according to provided behavior. The pattern can be inverted if necessary. <ul> <li>pattern should be either a custom string or regexp.</li> <li>behavior should be one of: <ul><li>Removed</li><li>Isolated</li><li>MergedWithPrevious</li><li>MergedWithNext</li><li>Contiguous</li></ul></li> <li>invert should be a boolean flag.</li> </ul> | Example with pattern = ` `, behavior = `"isolated"`, invert = `False`: <br> Input: `"Hello, how are you?"` <br> Output: `"Hello,", " ", "how", " ", "are", " ", "you?"` |
 | Sequence | Lets you compose multiple `PreTokenizer` that will be run in the given order | `Sequence::new(vec![Punctuation, WhitespaceSplit])` |
@@ -104,12 +104,12 @@ the ByteLevel)
 <node>
 | Name | Description | Example |
 | :--- | :--- | :--- |
-| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Ouput: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
+| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Output: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
 | Whitespace | Splits on word boundaries (using the following regular expression: `\w+|[^\w\s]+` | Input: `"Hello there!"` <br> Output: `"Hello", "there", "!"` |
 | WhitespaceSplit | Splits on any whitespace character | Input: `"Hello there!"` <br> Output: `"Hello", "there!"` |
-| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Ouput: `"Hello", "?"` |
-| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Ouput: `"Hello", "▁there"` |
-| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Ouput: `"Hello", "there"` |
+| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Output: `"Hello", "?"` |
+| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Output: `"Hello", "▁there"` |
+| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Output: `"Hello", "there"` |
 | Digits | Splits the numbers from any other characters. | Input: `"Hello123there"` <br> Output: ``"Hello", "123", "there"`` |
 | Split | Versatile pre-tokenizer that splits on provided pattern and according to provided behavior. The pattern can be inverted if necessary. <ul> <li>pattern should be either a custom string or regexp.</li> <li>behavior should be one of: <ul><li>removed</li><li>isolated</li><li>mergedWithPrevious</li><li>mergedWithNext</li><li>contiguous</li></ul></li> <li>invert should be a boolean flag.</li> </ul> | Example with pattern = ` `, behavior = `"isolated"`, invert = `False`: <br> Input: `"Hello, how are you?"` <br> Output: `"Hello,", " ", "how", " ", "are", " ", "you?"` |
 | Sequence | Lets you compose multiple `PreTokenizer` that will be run in the given order | |
@@ -148,5 +148,5 @@ special characters or identifiers that need to be reverted for example.
 | Name | Description |
 | :--- | :--- |
 | ByteLevel | Reverts the ByteLevel PreTokenizer. This PreTokenizer encodes at the byte-level, using a set of visible Unicode characters to represent each byte, so we need a Decoder to revert this process and get something readable again. |
-| Metaspace | Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifer `▁` to identify whitespaces, and so this Decoder helps with decoding these. |
+| Metaspace | Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifier `▁` to identify whitespaces, and so this Decoder helps with decoding these. |
 | WordPiece | Reverts the WordPiece Model. This model uses a special identifier `##` for continuing subwords, and so this Decoder helps with decoding these. |
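For completeness, a sketch of the Metaspace and WordPiece decoders from this table, assuming the `tokenizers.decoders` Python API:

```python
from tokenizers import decoders

# Metaspace decoder: turns the "▁" markers back into spaces
print(decoders.Metaspace().decode(["▁Hello", "▁there"]))      # "Hello there"
# WordPiece decoder: merges "##"-prefixed continuation pieces
print(decoders.WordPiece().decode(["Hel", "##lo", "there"]))  # "Hello there"
```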
@@ -32,7 +32,7 @@ as running:
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 ```

-Or you can easiy update it with the following command:
+Or you can easily update it with the following command:

 ```bash
 rustup update
@@ -290,7 +290,7 @@ The role of the model is to split your "words" into tokens, using the
 rules it has learned. It's also responsible for mapping those tokens to
 their corresponding IDs in the vocabulary of the model.

-This model is passed along when intializing the
+This model is passed along when initializing the
 `Tokenizer` so you already know how to
 customize this part. Currently, the 🤗 Tokenizers library supports:

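For context, a minimal sketch of passing a model when initializing the `Tokenizer` (Python API; `"[UNK]"` is just an illustrative choice of unknown token):

```python
from tokenizers import Tokenizer, models

tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))
print(type(tokenizer.model).__name__)  # "BPE"
```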
@@ -132,14 +132,14 @@ The ``Normalizer`` is optional.
 - Removes all accent symbols in unicode (to be used with NFD for consistency)
 - Input: ``é``

-Ouput: ``e``
+Output: ``e``

 * - Replace
 - Replaces a custom string or regexp and changes it with given content
 - ``Replace("a", "e")`` will behave like this:

 Input: ``"banana"``
-Ouput: ``"benene"``
+Output: ``"benene"``

 * - BertNormalizer
 - Provides an implementation of the Normalizer used in the original BERT. Options
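The ``Replace("a", "e")`` example from this table, run directly (a sketch assuming the Python bindings):

```python
from tokenizers import normalizers

print(normalizers.Replace("a", "e").normalize_str("banana"))  # "benene"
```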
@@ -193,7 +193,7 @@ the ByteLevel)

 - Input: ``"Hello my friend, how are you?"``

-Ouput: ``"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"``
+Output: ``"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"``

 * - Whitespace
 - Splits on word boundaries (using the following regular expression: ``\w+|[^\w\s]+``
@@ -211,13 +211,13 @@ the ByteLevel)
 - Will isolate all punctuation characters
 - Input: ``"Hello?"``

-Ouput: ``"Hello", "?"``
+Output: ``"Hello", "?"``

 * - Metaspace
 - Splits on whitespaces and replaces them with a special char "▁" (U+2581)
 - Input: ``"Hello there"``

-Ouput: ``"Hello", "▁there"``
+Output: ``"Hello", "▁there"``

 * - CharDelimiterSplit
 - Splits on a given character
@@ -225,7 +225,7 @@ the ByteLevel)

 Input: ``"Helloxthere"``

-Ouput: ``"Hello", "there"``
+Output: ``"Hello", "there"``

 * - Digits
 - Splits the numbers from any other characters.
@@ -361,7 +361,7 @@ reverted for example.
 a set of visible Unicode characters to represent each byte, so we need a Decoder to
 revert this process and get something readable again.
 * - Metaspace
-- Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifer ``▁`` to
+- Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifier ``▁`` to
 identify whitespaces, and so this Decoder helps with decoding these.
 * - WordPiece
 - Reverts the WordPiece Model. This model uses a special identifier ``##`` for continuing
@@ -24,7 +24,7 @@ If you are using a unix based OS, the installation should be as simple as runnin

 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

-Or you can easiy update it with the following command::
+Or you can easily update it with the following command::

 rustup update

@@ -253,7 +253,7 @@ been trained if you are using a pretrained tokenizer).
 The role of the model is to split your "words" into tokens, using the rules it has learned. It's
 also responsible for mapping those tokens to their corresponding IDs in the vocabulary of the model.

-This model is passed along when intializing the :entity:`Tokenizer` so you already know
+This model is passed along when initializing the :entity:`Tokenizer` so you already know
 how to customize this part. Currently, the 🤗 Tokenizers library supports:

 - :entity:`models.BPE`
@@ -62,7 +62,7 @@ special tokens and/or added tokens in the sequence).
 - [#363]: Fix panic from unwrapping `File::open` in `count_words`

 ### Changed
-- [#234]: Completely changed the alignement mappings available on `Encoding`. Previous mappings
+- [#234]: Completely changed the alignment mappings available on `Encoding`. Previous mappings
 were misleading and only providing offsets. New ones provide methods to easily convert between
 `char` or `word` (input space) and `token` (output space)
 - [#236]: `AddedToken` with special options like `rstrip` will keep the matched whitespaces
@@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize};
 #[derive(Deserialize, Clone, Debug, Serialize, Default)]
 /// ByteFallback is a simple trick which converts tokens looking like `<0x61>`
 /// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get � instead for each inconvertable byte token
+/// cannot be decoded you will get � instead for each inconvertible byte token
 #[non_exhaustive]
 pub struct ByteFallback {
 #[serde(rename = "type")]
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
 #[derive(Debug, Clone, Serialize, Deserialize)]
 /// The CTC (Connectionist Temporal Classification) decoder takes care
 /// of sanitizing a list of inputs token.
-/// Due to some alignement problem the output of some models can come
+/// Due to some alignment problem the output of some models can come
 /// with duplicated token.
 #[serde(tag = "type")]
 #[non_exhaustive]
@@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize};
 #[derive(Deserialize, Clone, Debug, Serialize, Default)]
 /// Strip is a simple trick which converts tokens looking like `<0x61>`
 /// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get � instead for each inconvertable byte token
+/// cannot be decoded you will get � instead for each inconvertible byte token
 #[serde(tag = "type")]
 #[non_exhaustive]
 pub struct Strip {
@@ -508,7 +508,7 @@ impl TemplateProcessing {
 }
 Piece::SpecialToken { id, type_id } => {
 if add_special_tokens {
-let tok = &self.special_tokens.0[id]; // We already checked existance above
+let tok = &self.special_tokens.0[id]; // We already checked existence above
 let len = tok.ids.len();

 let encoding = Encoding::new(
@@ -195,9 +195,9 @@ impl NormalizedString {
 });

 match (start, end) {
-// Targetting inexistant beginning
+// Targeting inexistant beginning
 (Some(s), None) => Some(s..s),
-// Targetting inexistant end
+// Targeting inexistant end
 (None, Some(e)) => Some(e..e),
 // Found the range
 (Some(s), Some(e)) => Some(s..e),
@@ -3,7 +3,7 @@ use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
 use std::collections::HashMap;
 use std::path::PathBuf;

-/// Defines the aditional parameters available for the `from_pretrained` function
+/// Defines the additional parameters available for the `from_pretrained` function
 #[derive(Debug, Clone)]
 pub struct FromPretrainedParameters {
 pub revision: String,
@@ -136,7 +136,7 @@ pub fn truncate_encodings(
 n2 = n1 + params.max_length % 2;
 }

-// Swap lengths if we swapped previosuly
+// Swap lengths if we swapped previously
 if swap {
 mem::swap(&mut n1, &mut n2);
 }