Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 08:15:49 +00:00)
Fix typos (#1715)
* Fix typos

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>

* Update docs/source/quicktour.rst

* Update docs/source-doc-builder/quicktour.mdx

---------

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
@@ -49,7 +49,7 @@ class CustomNormalizer:
     def normalize(self, normalized: NormalizedString):
         # Most of these can be replaced by a `Sequence` combining some provided Normalizer,
         # (ie Sequence([ NFKC(), Replace(Regex("\s+"), " "), Lowercase() ])
-        # and it should be the prefered way. That being said, here is an example of the kind
+        # and it should be the preferred way. That being said, here is an example of the kind
         # of things that can be done here:
         normalized.nfkc()
         normalized.filter(lambda char: not char.isnumeric())
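The comment fixed in this hunk points to the `Sequence` alternative. As a minimal sketch (using the `tokenizers` Python API; nothing here is added by the commit), that normalizer can be built and tried directly:

```python
# Hypothetical usage of the Sequence normalizer named in the comment above.
from tokenizers import Regex
from tokenizers.normalizers import NFKC, Lowercase, Replace, Sequence

normalizer = Sequence([NFKC(), Replace(Regex(r"\s+"), " "), Lowercase()])
print(normalizer.normalize_str("HELLO\t  WORLD"))  # "hello world"
```

Unlike the custom class in the example file, this sequence does not drop numeric characters; that part still needs a custom normalizer.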
@@ -57,7 +57,7 @@ class ByteFallback(Decoder):
     ByteFallback Decoder
     ByteFallback is a simple trick which converts tokens looking like `<0x61>`
     to pure bytes, and attempts to make them into a string. If the tokens
-    cannot be decoded you will get <20> instead for each inconvertable byte token
+    cannot be decoded you will get <20> instead for each inconvertible byte token

     """
     def __init__(self):
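For context on what the corrected docstring describes, a small usage sketch with the Python bindings (assumed behaviour of recent releases, not part of this diff):

```python
from tokenizers.decoders import ByteFallback

decoder = ByteFallback()
print(decoder.decode(["<0x61>", "<0x62>", "<0x63>"]))  # "abc"
# Tokens forming an invalid UTF-8 sequence should fall back to the
# replacement character, as the docstring above explains.
```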
@@ -389,7 +389,7 @@ class Nmt(Normalizer):
 class Precompiled(Normalizer):
     """
     Precompiled normalizer
-    Don't use manually it is used for compatiblity for SentencePiece.
+    Don't use manually it is used for compatibility for SentencePiece.
     """
     def __init__(self, precompiled_charsmap):
         pass
@@ -48,7 +48,7 @@ class BertPreTokenizer(PreTokenizer):
     BertPreTokenizer

     This pre-tokenizer splits tokens on spaces, and also on punctuation.
-    Each occurence of a punctuation character will be treated separately.
+    Each occurrence of a punctuation character will be treated separately.
     """
     def __init__(self):
         pass
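The behaviour described by the fixed docstring, as a quick illustration with the Python bindings:

```python
from tokenizers.pre_tokenizers import BertPreTokenizer

pre_tok = BertPreTokenizer()
print(pre_tok.pre_tokenize_str("Hello, world!!"))
# [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13)), ('!', (13, 14))]
```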
@@ -325,7 +325,7 @@ class EncodingVisualizer:

         Returns:
             A list of length len(text) whose entry at index i is None if there is no annotation on
-            charachter i or k, the index of the annotation that covers index i where k is with
+            character i or k, the index of the annotation that covers index i where k is with
             respect to the list of annotations
         """
         annotation_map = [None] * len(text)
@@ -263,7 +263,7 @@ impl PyWordPieceDec {
 /// ByteFallback Decoder
 /// ByteFallback is a simple trick which converts tokens looking like `<0x61>`
 /// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get <20> instead for each inconvertable byte token
+/// cannot be decoded you will get <20> instead for each inconvertible byte token
 ///
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")]
 pub struct PyByteFallbackDec {}
@@ -23,7 +23,7 @@ use pyo3::wrap_pymodule;
 pub const VERSION: &str = env!("CARGO_PKG_VERSION");

 // For users using multiprocessing in python, it is quite easy to fork the process running
-// tokenizers, ending up with a deadlock because we internaly make use of multithreading. So
+// tokenizers, ending up with a deadlock because we internally make use of multithreading. So
 // we register a callback to be called in the event of a fork so that we can warn the user.
 #[cfg(target_family = "unix")]
 static mut REGISTERED_FORK_CALLBACK: bool = false;
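The comment fixed here is about the fork warning emitted on Unix. A sketch of how a user typically avoids the warned-about deadlock from Python (the environment variable is the one the library reads; the checkpoint name is only an example):

```python
# Either disable/silence the internal parallelism, or use "spawn" so that no
# thread-using process gets forked.
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

import multiprocessing as mp
from tokenizers import Tokenizer

def count_tokens(text: str) -> int:
    tok = Tokenizer.from_pretrained("bert-base-uncased")  # example checkpoint
    return len(tok.encode(text).ids)

if __name__ == "__main__":
    mp.set_start_method("spawn")  # avoid forking a process that already spawned threads
    with mp.Pool(processes=2) as pool:
        print(pool.map(count_tokens, ["hello world", "how are you?"]))
```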
@@ -534,7 +534,7 @@ impl PyNmt {
 }

 /// Precompiled normalizer
-/// Don't use manually it is used for compatiblity for SentencePiece.
+/// Don't use manually it is used for compatibility for SentencePiece.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
 pub struct PyPrecompiled {}
 #[pymethods]
@@ -430,7 +430,7 @@ impl PyCharDelimiterSplit {
 /// BertPreTokenizer
 ///
 /// This pre-tokenizer splits tokens on spaces, and also on punctuation.
-/// Each occurence of a punctuation character will be treated separately.
+/// Each occurrence of a punctuation character will be treated separately.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
 pub struct PyBertPreTokenizer {}
 #[pymethods]
@@ -100,7 +100,7 @@ def pyi_file(obj, indent=""):
         string += function(obj, indent)

     elif inspect.isgetsetdescriptor(obj):
-        # TODO it would be interesing to add the setter maybe ?
+        # TODO it would be interesting to add the setter maybe ?
         string += f"{indent}@property\n"
         string += function(obj, indent, text_signature="(self)")
     else:
@@ -287,7 +287,7 @@ class TestUnigram:
         trainer.initial_alphabet = ["d", "z"]
         assert sorted(trainer.initial_alphabet) == ["d", "z"]

-    def test_continuing_prefix_trainer_mistmatch(self):
+    def test_continuing_prefix_trainer_mismatch(self):
         UNK = "[UNK]"
         special_tokens = [UNK]
         tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
@@ -25,8 +25,8 @@ The `Normalizer` is optional.
 | NFKC | NFKC unicode normalization | |
 | Lowercase | Replaces all uppercase to lowercase | Input: `HELLO ὈΔΥΣΣΕΎΣ` <br> Output: `hello`ὀδυσσεύς` |
 | Strip | Removes all whitespace characters on the specified sides (left, right or both) of the input | Input: `"`hi`"` <br> Output: `"hi"` |
-| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Ouput: `e` |
-| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Ouput: `"benene"` |
+| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Output: `e` |
+| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Output: `"benene"` |
 | BertNormalizer | Provides an implementation of the Normalizer used in the original BERT. Options that can be set are: <ul> <li>clean_text</li> <li>handle_chinese_chars</li> <li>strip_accents</li> <li>lowercase</li> </ul> | |
 | Sequence | Composes multiple normalizers that will run in the provided order | `Sequence([NFKC(), Lowercase()])` |
 </python>
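A quick way to check the StripAccents and Replace rows of the table above, assuming the Python bindings:

```python
from tokenizers.normalizers import NFD, Replace, Sequence, StripAccents

print(Sequence([NFD(), StripAccents()]).normalize_str("é"))  # "e"
print(Replace("a", "e").normalize_str("banana"))             # "benene"
```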
@@ -39,8 +39,8 @@ The `Normalizer` is optional.
 | NFKC | NFKC unicode normalization | |
 | Lowercase | Replaces all uppercase to lowercase | Input: `HELLO ὈΔΥΣΣΕΎΣ` <br> Output: `hello`ὀδυσσεύς` |
 | Strip | Removes all whitespace characters on the specified sides (left, right or both) of the input | Input: `"`hi`"` <br> Output: `"hi"` |
-| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Ouput: `e` |
-| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Ouput: `"benene"` |
+| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Output: `e` |
+| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Output: `"benene"` |
 | BertNormalizer | Provides an implementation of the Normalizer used in the original BERT. Options that can be set are: <ul> <li>clean_text</li> <li>handle_chinese_chars</li> <li>strip_accents</li> <li>lowercase</li> </ul> | |
 | Sequence | Composes multiple normalizers that will run in the provided order | `Sequence::new(vec![NFKC, Lowercase])` |
 </rust>
@@ -53,8 +53,8 @@ The `Normalizer` is optional.
 | NFKC | NFKC unicode normalization | |
 | Lowercase | Replaces all uppercase to lowercase | Input: `HELLO ὈΔΥΣΣΕΎΣ` <br> Output: `hello`ὀδυσσεύς` |
 | Strip | Removes all whitespace characters on the specified sides (left, right or both) of the input | Input: `"`hi`"` <br> Output: `"hi"` |
-| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Ouput: `e` |
-| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Ouput: `"benene"` |
+| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Output: `e` |
+| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Output: `"benene"` |
 | BertNormalizer | Provides an implementation of the Normalizer used in the original BERT. Options that can be set are: <ul> <li>cleanText</li> <li>handleChineseChars</li> <li>stripAccents</li> <li>lowercase</li> </ul> | |
 | Sequence | Composes multiple normalizers that will run in the provided order | |
 </node>
@@ -78,12 +78,12 @@ the ByteLevel)
 <python>
 | Name | Description | Example |
 | :--- | :--- | :--- |
-| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Ouput: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
+| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Output: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
 | Whitespace | Splits on word boundaries (using the following regular expression: `\w+|[^\w\s]+` | Input: `"Hello there!"` <br> Output: `"Hello", "there", "!"` |
 | WhitespaceSplit | Splits on any whitespace character | Input: `"Hello there!"` <br> Output: `"Hello", "there!"` |
-| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Ouput: `"Hello", "?"` |
-| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Ouput: `"Hello", "▁there"` |
-| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Ouput: `"Hello", "there"` |
+| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Output: `"Hello", "?"` |
+| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Output: `"Hello", "▁there"` |
+| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Output: `"Hello", "there"` |
 | Digits | Splits the numbers from any other characters. | Input: `"Hello123there"` <br> Output: ``"Hello", "123", "there"`` |
 | Split | Versatile pre-tokenizer that splits on provided pattern and according to provided behavior. The pattern can be inverted if necessary. <ul> <li>pattern should be either a custom string or regexp.</li> <li>behavior should be one of: <ul><li>removed</li><li>isolated</li><li>merged_with_previous</li><li>merged_with_next</li><li>contiguous</li></ul></li> <li>invert should be a boolean flag.</li> </ul> | Example with pattern = ` `, behavior = `"isolated"`, invert = `False`: <br> Input: `"Hello, how are you?"` <br> Output: `"Hello,", " ", "how", " ", "are", " ", "you?"` |
 | Sequence | Lets you compose multiple `PreTokenizer` that will be run in the given order | `Sequence([Punctuation(), WhitespaceSplit()])` |
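A few of the rows above, run through the Python bindings (an illustration of the documented behaviour, not part of the diff):

```python
from tokenizers.pre_tokenizers import Digits, Whitespace, WhitespaceSplit

print(Whitespace().pre_tokenize_str("Hello there!"))
# [('Hello', (0, 5)), ('there', (6, 11)), ('!', (11, 12))]
print(WhitespaceSplit().pre_tokenize_str("Hello there!"))
# [('Hello', (0, 5)), ('there!', (6, 12))]
print(Digits(individual_digits=False).pre_tokenize_str("Hello123there"))
# [('Hello', (0, 5)), ('123', (5, 8)), ('there', (8, 13))]
```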
@@ -91,12 +91,12 @@ the ByteLevel)
 <rust>
 | Name | Description | Example |
 | :--- | :--- | :--- |
-| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Ouput: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
+| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Output: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
 | Whitespace | Splits on word boundaries (using the following regular expression: `\w+|[^\w\s]+` | Input: `"Hello there!"` <br> Output: `"Hello", "there", "!"` |
 | WhitespaceSplit | Splits on any whitespace character | Input: `"Hello there!"` <br> Output: `"Hello", "there!"` |
-| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Ouput: `"Hello", "?"` |
-| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Ouput: `"Hello", "▁there"` |
-| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Ouput: `"Hello", "there"` |
+| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Output: `"Hello", "?"` |
+| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Output: `"Hello", "▁there"` |
+| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Output: `"Hello", "there"` |
 | Digits | Splits the numbers from any other characters. | Input: `"Hello123there"` <br> Output: ``"Hello", "123", "there"`` |
 | Split | Versatile pre-tokenizer that splits on provided pattern and according to provided behavior. The pattern can be inverted if necessary. <ul> <li>pattern should be either a custom string or regexp.</li> <li>behavior should be one of: <ul><li>Removed</li><li>Isolated</li><li>MergedWithPrevious</li><li>MergedWithNext</li><li>Contiguous</li></ul></li> <li>invert should be a boolean flag.</li> </ul> | Example with pattern = ` `, behavior = `"isolated"`, invert = `False`: <br> Input: `"Hello, how are you?"` <br> Output: `"Hello,", " ", "how", " ", "are", " ", "you?"` |
 | Sequence | Lets you compose multiple `PreTokenizer` that will be run in the given order | `Sequence::new(vec![Punctuation, WhitespaceSplit])` |
@@ -104,12 +104,12 @@ the ByteLevel)
 <node>
 | Name | Description | Example |
 | :--- | :--- | :--- |
-| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Ouput: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
+| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Output: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
 | Whitespace | Splits on word boundaries (using the following regular expression: `\w+|[^\w\s]+` | Input: `"Hello there!"` <br> Output: `"Hello", "there", "!"` |
 | WhitespaceSplit | Splits on any whitespace character | Input: `"Hello there!"` <br> Output: `"Hello", "there!"` |
-| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Ouput: `"Hello", "?"` |
-| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Ouput: `"Hello", "▁there"` |
-| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Ouput: `"Hello", "there"` |
+| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Output: `"Hello", "?"` |
+| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Output: `"Hello", "▁there"` |
+| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Output: `"Hello", "there"` |
 | Digits | Splits the numbers from any other characters. | Input: `"Hello123there"` <br> Output: ``"Hello", "123", "there"`` |
 | Split | Versatile pre-tokenizer that splits on provided pattern and according to provided behavior. The pattern can be inverted if necessary. <ul> <li>pattern should be either a custom string or regexp.</li> <li>behavior should be one of: <ul><li>removed</li><li>isolated</li><li>mergedWithPrevious</li><li>mergedWithNext</li><li>contiguous</li></ul></li> <li>invert should be a boolean flag.</li> </ul> | Example with pattern = ` `, behavior = `"isolated"`, invert = `False`: <br> Input: `"Hello, how are you?"` <br> Output: `"Hello,", " ", "how", " ", "are", " ", "you?"` |
 | Sequence | Lets you compose multiple `PreTokenizer` that will be run in the given order | |
@@ -148,5 +148,5 @@ special characters or identifiers that need to be reverted for example.
 | Name | Description |
 | :--- | :--- |
 | ByteLevel | Reverts the ByteLevel PreTokenizer. This PreTokenizer encodes at the byte-level, using a set of visible Unicode characters to represent each byte, so we need a Decoder to revert this process and get something readable again. |
-| Metaspace | Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifer `▁` to identify whitespaces, and so this Decoder helps with decoding these. |
+| Metaspace | Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifier `▁` to identify whitespaces, and so this Decoder helps with decoding these. |
 | WordPiece | Reverts the WordPiece Model. This model uses a special identifier `##` for continuing subwords, and so this Decoder helps with decoding these. |
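The decoders summarized in this table, sketched with the Python bindings (the token sequences are made up for illustration):

```python
from tokenizers.decoders import Metaspace, WordPiece

print(WordPiece(prefix="##").decode(["un", "##believ", "##able"]))  # "unbelievable"
print(Metaspace().decode(["▁Hello", "▁there"]))                     # "Hello there"
```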
@@ -32,7 +32,7 @@ as running:
 curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
 ```

-Or you can easiy update it with the following command:
+Or you can easily update it with the following command:

 ```bash
 rustup update
@@ -290,7 +290,7 @@ The role of the model is to split your "words" into tokens, using the
 rules it has learned. It's also responsible for mapping those tokens to
 their corresponding IDs in the vocabulary of the model.

-This model is passed along when intializing the
+This model is passed along when initializing the
 `Tokenizer` so you already know how to
 customize this part. Currently, the 🤗 Tokenizers library supports:

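For readers of this quicktour hunk, the sentence being fixed refers to the standard initialization pattern:

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

# The model is passed to the Tokenizer at construction time.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
```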
@@ -132,14 +132,14 @@ The ``Normalizer`` is optional.
     - Removes all accent symbols in unicode (to be used with NFD for consistency)
     - Input: ``é``

-      Ouput: ``e``
+      Output: ``e``

   * - Replace
     - Replaces a custom string or regexp and changes it with given content
     - ``Replace("a", "e")`` will behave like this:

       Input: ``"banana"``
-      Ouput: ``"benene"``
+      Output: ``"benene"``

   * - BertNormalizer
     - Provides an implementation of the Normalizer used in the original BERT. Options
@@ -193,7 +193,7 @@ the ByteLevel)

     - Input: ``"Hello my friend, how are you?"``

-      Ouput: ``"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"``
+      Output: ``"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"``

   * - Whitespace
     - Splits on word boundaries (using the following regular expression: ``\w+|[^\w\s]+``
@@ -211,13 +211,13 @@ the ByteLevel)
     - Will isolate all punctuation characters
     - Input: ``"Hello?"``

-      Ouput: ``"Hello", "?"``
+      Output: ``"Hello", "?"``

   * - Metaspace
     - Splits on whitespaces and replaces them with a special char "▁" (U+2581)
     - Input: ``"Hello there"``

-      Ouput: ``"Hello", "▁there"``
+      Output: ``"Hello", "▁there"``

   * - CharDelimiterSplit
     - Splits on a given character
@@ -225,7 +225,7 @@ the ByteLevel)

       Input: ``"Helloxthere"``

-      Ouput: ``"Hello", "there"``
+      Output: ``"Hello", "there"``

   * - Digits
     - Splits the numbers from any other characters.
@@ -361,7 +361,7 @@ reverted for example.
       a set of visible Unicode characters to represent each byte, so we need a Decoder to
       revert this process and get something readable again.
   * - Metaspace
-    - Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifer ``▁`` to
+    - Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifier ``▁`` to
       identify whitespaces, and so this Decoder helps with decoding these.
   * - WordPiece
     - Reverts the WordPiece Model. This model uses a special identifier ``##`` for continuing
@@ -24,7 +24,7 @@ If you are using a unix based OS, the installation should be as simple as runnin

     curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh

-Or you can easiy update it with the following command::
+Or you can easily update it with the following command::

     rustup update

@@ -253,7 +253,7 @@ been trained if you are using a pretrained tokenizer).
 The role of the model is to split your "words" into tokens, using the rules it has learned. It's
 also responsible for mapping those tokens to their corresponding IDs in the vocabulary of the model.

-This model is passed along when intializing the :entity:`Tokenizer` so you already know
+This model is passed along when initializing the :entity:`Tokenizer` so you already know
 how to customize this part. Currently, the 🤗 Tokenizers library supports:

 - :entity:`models.BPE`
@@ -62,7 +62,7 @@ special tokens and/or added tokens in the sequence).
 - [#363]: Fix panic from unwrapping `File::open` in `count_words`

 ### Changed
-- [#234]: Completely changed the alignement mappings available on `Encoding`. Previous mappings
+- [#234]: Completely changed the alignment mappings available on `Encoding`. Previous mappings
   were misleading and only providing offsets. New ones provide methods to easily convert between
   `char` or `word` (input space) and `token` (output space)
 - [#236]: `AddedToken` with special options like `rstrip` will keep the matched whitespaces
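The alignment mappings mentioned in #234 are the per-`Encoding` conversion helpers; a usage sketch (the checkpoint name is only an example):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer.encode("Hello there")

print(encoding.char_to_token(1))   # token index covering input character 1
print(encoding.word_to_tokens(1))  # (start, end) token span for word 1
print(encoding.token_to_chars(1))  # (start, end) character span for token 1
```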
@@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize};
 #[derive(Deserialize, Clone, Debug, Serialize, Default)]
 /// ByteFallback is a simple trick which converts tokens looking like `<0x61>`
 /// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get <20> instead for each inconvertable byte token
+/// cannot be decoded you will get <20> instead for each inconvertible byte token
 #[non_exhaustive]
 pub struct ByteFallback {
     #[serde(rename = "type")]
@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
 #[derive(Debug, Clone, Serialize, Deserialize)]
 /// The CTC (Connectionist Temporal Classification) decoder takes care
 /// of sanitizing a list of inputs token.
-/// Due to some alignement problem the output of some models can come
+/// Due to some alignment problem the output of some models can come
 /// with duplicated token.
 #[serde(tag = "type")]
 #[non_exhaustive]
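What the corrected comment means in practice: consecutive duplicates and pad tokens coming out of a CTC model are collapsed by the decoder. A sketch with a made-up CTC output:

```python
from tokenizers.decoders import CTC

decoder = CTC(pad_token="<pad>")
print(decoder.decode(["<pad>", "h", "h", "e", "l", "l", "<pad>", "l", "o"]))  # "hello"
```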
@@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize};
 #[derive(Deserialize, Clone, Debug, Serialize, Default)]
 /// Strip is a simple trick which converts tokens looking like `<0x61>`
 /// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get <20> instead for each inconvertable byte token
+/// cannot be decoded you will get <20> instead for each inconvertible byte token
 #[serde(tag = "type")]
 #[non_exhaustive]
 pub struct Strip {
@@ -508,7 +508,7 @@ impl TemplateProcessing {
                 }
                 Piece::SpecialToken { id, type_id } => {
                     if add_special_tokens {
-                        let tok = &self.special_tokens.0[id]; // We already checked existance above
+                        let tok = &self.special_tokens.0[id]; // We already checked existence above
                         let len = tok.ids.len();

                         let encoding = Encoding::new(
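The special tokens whose existence is checked above are declared by the user when building the template; the usual Python-side declaration looks like this (the ids are placeholders):

```python
from tokenizers.processors import TemplateProcessing

post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],  # (token, id) pairs; ids are examples
)
```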
@@ -195,9 +195,9 @@ impl NormalizedString {
         });

         match (start, end) {
-            // Targetting inexistant beginning
+            // Targeting inexistant beginning
             (Some(s), None) => Some(s..s),
-            // Targetting inexistant end
+            // Targeting inexistant end
             (None, Some(e)) => Some(e..e),
             // Found the range
             (Some(s), Some(e)) => Some(s..e),
@@ -3,7 +3,7 @@ use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
 use std::collections::HashMap;
 use std::path::PathBuf;

-/// Defines the aditional parameters available for the `from_pretrained` function
+/// Defines the additional parameters available for the `from_pretrained` function
 #[derive(Debug, Clone)]
 pub struct FromPretrainedParameters {
     pub revision: String,
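`FromPretrainedParameters` is the Rust counterpart of the optional arguments accepted when loading a pretrained tokenizer from Python; a sketch (the repository name is only an example):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased", revision="main")
```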
@@ -136,7 +136,7 @@ pub fn truncate_encodings(
         n2 = n1 + params.max_length % 2;
     }

-    // Swap lengths if we swapped previosuly
+    // Swap lengths if we swapped previously
     if swap {
         mem::swap(&mut n1, &mut n2);
     }
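This helper sits behind the truncation a user configures on the tokenizer; the sketch below shows how that code path is typically exercised from Python (a usage sketch, not a claim about the exact strategy handled by this branch):

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # example checkpoint
tokenizer.enable_truncation(max_length=10, strategy="longest_first")
encoding = tokenizer.encode("a fairly long question", "an equally long context sentence")
print(len(encoding.ids))  # at most 10
```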