* Fix typos

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>

* Update docs/source/quicktour.rst

* Update docs/source-doc-builder/quicktour.mdx

---------

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Commit bdfc38b78d (parent 6945933829)
Author: tinyboxvk
Date: 2025-01-09 06:53:20 -04:00
Committed by: GitHub
25 changed files with 50 additions and 50 deletions

@@ -49,7 +49,7 @@ class CustomNormalizer:
     def normalize(self, normalized: NormalizedString):
         # Most of these can be replaced by a `Sequence` combining some provided Normalizer,
         # (ie Sequence([ NFKC(), Replace(Regex("\s+"), " "), Lowercase() ])
-        # and it should be the prefered way. That being said, here is an example of the kind
+        # and it should be the preferred way. That being said, here is an example of the kind
         # of things that can be done here:
         normalized.nfkc()
         normalized.filter(lambda char: not char.isnumeric())
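
As an aside (not part of this commit): a minimal sketch of the `Sequence` combination the comment above recommends, assuming a recent `tokenizers` release.

```python
from tokenizers import Regex, normalizers

normalizer = normalizers.Sequence([
    normalizers.NFKC(),                       # unicode normalization
    normalizers.Replace(Regex(r"\s+"), " "),  # collapse whitespace runs
    normalizers.Lowercase(),
])

# Should print something like "héllo world !"
print(normalizer.normalize_str("Héllo   WORLD\u00a0!"))
```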

@@ -57,7 +57,7 @@ class ByteFallback(Decoder):
    ByteFallback Decoder
    ByteFallback is a simple trick which converts tokens looking like `<0x61>`
    to pure bytes, and attempts to make them into a string. If the tokens
-    cannot be decoded you will get � instead for each inconvertable byte token
+    cannot be decoded you will get � instead for each inconvertible byte token
    """
    def __init__(self):
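
For context (not part of this commit): a minimal sketch of the behaviour this docstring describes, assuming a `tokenizers` build that exposes `decoders.ByteFallback`.

```python
from tokenizers import decoders

decoder = decoders.ByteFallback()
# "<0x61>" and "<0x62>" are the UTF-8 bytes for "a" and "b", so this should print "ab";
# byte tokens that do not form valid UTF-8 come back as the U+FFFD replacement character.
print(decoder.decode(["<0x61>", "<0x62>"]))
```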

@@ -389,7 +389,7 @@ class Nmt(Normalizer):
class Precompiled(Normalizer):
    """
    Precompiled normalizer
-    Don't use manually it is used for compatiblity for SentencePiece.
+    Don't use manually it is used for compatibility for SentencePiece.
    """
    def __init__(self, precompiled_charsmap):
        pass

@@ -48,7 +48,7 @@ class BertPreTokenizer(PreTokenizer):
    BertPreTokenizer
    This pre-tokenizer splits tokens on spaces, and also on punctuation.
-    Each occurence of a punctuation character will be treated separately.
+    Each occurrence of a punctuation character will be treated separately.
    """
    def __init__(self):
        pass
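
For context (not part of this commit): a minimal sketch of the splitting behaviour described above, assuming a recent `tokenizers` release.

```python
from tokenizers import pre_tokenizers

pre_tok = pre_tokenizers.BertPreTokenizer()
# Expected output, roughly:
# [('Hello', (0, 5)), (',', (5, 6)), ('world', (7, 12)), ('!', (12, 13))]
print(pre_tok.pre_tokenize_str("Hello, world!"))
```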

@@ -325,7 +325,7 @@ class EncodingVisualizer:
        Returns:
            A list of length len(text) whose entry at index i is None if there is no annotation on
-            charachter i or k, the index of the annotation that covers index i where k is with
+            character i or k, the index of the annotation that covers index i where k is with
            respect to the list of annotations
        """
        annotation_map = [None] * len(text)

@@ -263,7 +263,7 @@ impl PyWordPieceDec {
/// ByteFallback Decoder
/// ByteFallback is a simple trick which converts tokens looking like `<0x61>`
/// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get � instead for each inconvertable byte token
+/// cannot be decoded you will get � instead for each inconvertible byte token
///
#[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")]
pub struct PyByteFallbackDec {}

@@ -23,7 +23,7 @@ use pyo3::wrap_pymodule;
pub const VERSION: &str = env!("CARGO_PKG_VERSION");
// For users using multiprocessing in python, it is quite easy to fork the process running
-// tokenizers, ending up with a deadlock because we internaly make use of multithreading. So
+// tokenizers, ending up with a deadlock because we internally make use of multithreading. So
// we register a callback to be called in the event of a fork so that we can warn the user.
#[cfg(target_family = "unix")]
static mut REGISTERED_FORK_CALLBACK: bool = false;
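
For context (not part of this commit): the warning this callback emits points users at the `TOKENIZERS_PARALLELISM` environment variable. A minimal sketch of setting it before forking; the model name and `from_pretrained` download are assumptions for illustration.

```python
import os

# Must be set before the tokenizer does any parallel work, otherwise the fork warning fires.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from multiprocessing import Pool

from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")

def encode_ids(text):
    return tokenizer.encode(text).ids

if __name__ == "__main__":
    with Pool(2) as pool:
        print(pool.map(encode_ids, ["hello world", "goodbye world"]))
```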

@@ -534,7 +534,7 @@ impl PyNmt {
}
/// Precompiled normalizer
-/// Don't use manually it is used for compatiblity for SentencePiece.
+/// Don't use manually it is used for compatibility for SentencePiece.
#[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
pub struct PyPrecompiled {}
#[pymethods]

@@ -430,7 +430,7 @@ impl PyCharDelimiterSplit {
/// BertPreTokenizer
///
/// This pre-tokenizer splits tokens on spaces, and also on punctuation.
-/// Each occurence of a punctuation character will be treated separately.
+/// Each occurrence of a punctuation character will be treated separately.
#[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
pub struct PyBertPreTokenizer {}
#[pymethods]

@@ -100,7 +100,7 @@ def pyi_file(obj, indent=""):
        string += function(obj, indent)
    elif inspect.isgetsetdescriptor(obj):
-        # TODO it would be interesing to add the setter maybe ?
+        # TODO it would be interesting to add the setter maybe ?
        string += f"{indent}@property\n"
        string += function(obj, indent, text_signature="(self)")
    else:

@@ -287,7 +287,7 @@ class TestUnigram:
        trainer.initial_alphabet = ["d", "z"]
        assert sorted(trainer.initial_alphabet) == ["d", "z"]
-    def test_continuing_prefix_trainer_mistmatch(self):
+    def test_continuing_prefix_trainer_mismatch(self):
        UNK = "[UNK]"
        special_tokens = [UNK]
        tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))

@@ -25,8 +25,8 @@ The `Normalizer` is optional.
| NFKC | NFKC unicode normalization | |
| Lowercase | Replaces all uppercase to lowercase | Input: `HELLO ὈΔΥΣΣΕΎΣ` <br> Output: `hello`ὀδυσσεύς` |
| Strip | Removes all whitespace characters on the specified sides (left, right or both) of the input | Input: `"`hi`"` <br> Output: `"hi"` |
-| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Ouput: `e` |
+| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Output: `e` |
-| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Ouput: `"benene"` |
+| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Output: `"benene"` |
| BertNormalizer | Provides an implementation of the Normalizer used in the original BERT. Options that can be set are: <ul> <li>clean_text</li> <li>handle_chinese_chars</li> <li>strip_accents</li> <li>lowercase</li> </ul> | |
| Sequence | Composes multiple normalizers that will run in the provided order | `Sequence([NFKC(), Lowercase()])` |
</python>
@@ -39,8 +39,8 @@ The `Normalizer` is optional.
| NFKC | NFKC unicode normalization | |
| Lowercase | Replaces all uppercase to lowercase | Input: `HELLO ὈΔΥΣΣΕΎΣ` <br> Output: `hello`ὀδυσσεύς` |
| Strip | Removes all whitespace characters on the specified sides (left, right or both) of the input | Input: `"`hi`"` <br> Output: `"hi"` |
-| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Ouput: `e` |
+| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Output: `e` |
-| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Ouput: `"benene"` |
+| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Output: `"benene"` |
| BertNormalizer | Provides an implementation of the Normalizer used in the original BERT. Options that can be set are: <ul> <li>clean_text</li> <li>handle_chinese_chars</li> <li>strip_accents</li> <li>lowercase</li> </ul> | |
| Sequence | Composes multiple normalizers that will run in the provided order | `Sequence::new(vec![NFKC, Lowercase])` |
</rust>
@@ -53,8 +53,8 @@ The `Normalizer` is optional.
| NFKC | NFKC unicode normalization | |
| Lowercase | Replaces all uppercase to lowercase | Input: `HELLO ὈΔΥΣΣΕΎΣ` <br> Output: `hello`ὀδυσσεύς` |
| Strip | Removes all whitespace characters on the specified sides (left, right or both) of the input | Input: `"`hi`"` <br> Output: `"hi"` |
-| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Ouput: `e` |
+| StripAccents | Removes all accent symbols in unicode (to be used with NFD for consistency) | Input: `é` <br> Output: `e` |
-| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Ouput: `"benene"` |
+| Replace | Replaces a custom string or regexp and changes it with given content | `Replace("a", "e")` will behave like this: <br> Input: `"banana"` <br> Output: `"benene"` |
| BertNormalizer | Provides an implementation of the Normalizer used in the original BERT. Options that can be set are: <ul> <li>cleanText</li> <li>handleChineseChars</li> <li>stripAccents</li> <li>lowercase</li> </ul> | |
| Sequence | Composes multiple normalizers that will run in the provided order | |
</node>
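
As an aside (not part of this commit): a minimal Python sketch of the two table rows fixed above, assuming a recent `tokenizers` release.

```python
from tokenizers import normalizers

# StripAccents is meant to be combined with NFD, as the table notes: "é" -> "e"
strip_accents = normalizers.Sequence([normalizers.NFD(), normalizers.StripAccents()])
print(strip_accents.normalize_str("é"))

# Replace("a", "e"): "banana" -> "benene"
print(normalizers.Replace("a", "e").normalize_str("banana"))
```
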
@@ -78,12 +78,12 @@ the ByteLevel)
<python>
| Name | Description | Example |
| :--- | :--- | :--- |
-| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Ouput: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
+| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Output: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
| Whitespace | Splits on word boundaries (using the following regular expression: `\w+&#124;[^\w\s]+` | Input: `"Hello there!"` <br> Output: `"Hello", "there", "!"` |
| WhitespaceSplit | Splits on any whitespace character | Input: `"Hello there!"` <br> Output: `"Hello", "there!"` |
-| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Ouput: `"Hello", "?"` |
+| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Output: `"Hello", "?"` |
-| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Ouput: `"Hello", "▁there"` |
+| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Output: `"Hello", "▁there"` |
-| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Ouput: `"Hello", "there"` |
+| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Output: `"Hello", "there"` |
| Digits | Splits the numbers from any other characters. | Input: `"Hello123there"` <br> Output: ``"Hello", "123", "there"`` |
| Split | Versatile pre-tokenizer that splits on provided pattern and according to provided behavior. The pattern can be inverted if necessary. <ul> <li>pattern should be either a custom string or regexp.</li> <li>behavior should be one of: <ul><li>removed</li><li>isolated</li><li>merged_with_previous</li><li>merged_with_next</li><li>contiguous</li></ul></li> <li>invert should be a boolean flag.</li> </ul> | Example with pattern = ` `, behavior = `"isolated"`, invert = `False`: <br> Input: `"Hello, how are you?"` <br> Output: `"Hello,", " ", "how", " ", "are", " ", "you?"` |
| Sequence | Lets you compose multiple `PreTokenizer` that will be run in the given order | `Sequence([Punctuation(), WhitespaceSplit()])` |
@@ -91,12 +91,12 @@ the ByteLevel)
<rust>
| Name | Description | Example |
| :--- | :--- | :--- |
-| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Ouput: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
+| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Output: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
| Whitespace | Splits on word boundaries (using the following regular expression: `\w+&#124;[^\w\s]+` | Input: `"Hello there!"` <br> Output: `"Hello", "there", "!"` |
| WhitespaceSplit | Splits on any whitespace character | Input: `"Hello there!"` <br> Output: `"Hello", "there!"` |
-| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Ouput: `"Hello", "?"` |
+| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Output: `"Hello", "?"` |
-| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Ouput: `"Hello", "▁there"` |
+| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Output: `"Hello", "▁there"` |
-| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Ouput: `"Hello", "there"` |
+| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Output: `"Hello", "there"` |
| Digits | Splits the numbers from any other characters. | Input: `"Hello123there"` <br> Output: ``"Hello", "123", "there"`` |
| Split | Versatile pre-tokenizer that splits on provided pattern and according to provided behavior. The pattern can be inverted if necessary. <ul> <li>pattern should be either a custom string or regexp.</li> <li>behavior should be one of: <ul><li>Removed</li><li>Isolated</li><li>MergedWithPrevious</li><li>MergedWithNext</li><li>Contiguous</li></ul></li> <li>invert should be a boolean flag.</li> </ul> | Example with pattern = ` `, behavior = `"isolated"`, invert = `False`: <br> Input: `"Hello, how are you?"` <br> Output: `"Hello,", " ", "how", " ", "are", " ", "you?"` |
| Sequence | Lets you compose multiple `PreTokenizer` that will be run in the given order | `Sequence::new(vec![Punctuation, WhitespaceSplit])` |
@@ -104,12 +104,12 @@ the ByteLevel)
<node>
| Name | Description | Example |
| :--- | :--- | :--- |
-| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Ouput: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
+| ByteLevel | Splits on whitespaces while remapping all the bytes to a set of visible characters. This technique as been introduced by OpenAI with GPT-2 and has some more or less nice properties: <ul> <li>Since it maps on bytes, a tokenizer using this only requires **256** characters as initial alphabet (the number of values a byte can have), as opposed to the 130,000+ Unicode characters.</li> <li>A consequence of the previous point is that it is absolutely unnecessary to have an unknown token using this since we can represent anything with 256 tokens (Youhou!! 🎉🎉)</li> <li>For non ascii characters, it gets completely unreadable, but it works nonetheless!</li> </ul> | Input: `"Hello my friend, how are you?"` <br> Output: `"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"` |
| Whitespace | Splits on word boundaries (using the following regular expression: `\w+&#124;[^\w\s]+` | Input: `"Hello there!"` <br> Output: `"Hello", "there", "!"` |
| WhitespaceSplit | Splits on any whitespace character | Input: `"Hello there!"` <br> Output: `"Hello", "there!"` |
-| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Ouput: `"Hello", "?"` |
+| Punctuation | Will isolate all punctuation characters | Input: `"Hello?"` <br> Output: `"Hello", "?"` |
-| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Ouput: `"Hello", "▁there"` |
+| Metaspace | Splits on whitespaces and replaces them with a special char “▁” (U+2581) | Input: `"Hello there"` <br> Output: `"Hello", "▁there"` |
-| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Ouput: `"Hello", "there"` |
+| CharDelimiterSplit | Splits on a given character | Example with `x`: <br> Input: `"Helloxthere"` <br> Output: `"Hello", "there"` |
| Digits | Splits the numbers from any other characters. | Input: `"Hello123there"` <br> Output: ``"Hello", "123", "there"`` |
| Split | Versatile pre-tokenizer that splits on provided pattern and according to provided behavior. The pattern can be inverted if necessary. <ul> <li>pattern should be either a custom string or regexp.</li> <li>behavior should be one of: <ul><li>removed</li><li>isolated</li><li>mergedWithPrevious</li><li>mergedWithNext</li><li>contiguous</li></ul></li> <li>invert should be a boolean flag.</li> </ul> | Example with pattern = ` `, behavior = `"isolated"`, invert = `False`: <br> Input: `"Hello, how are you?"` <br> Output: `"Hello,", " ", "how", " ", "are", " ", "you?"` |
| Sequence | Lets you compose multiple `PreTokenizer` that will be run in the given order | |
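
As an aside (not part of this commit): a minimal Python sketch of a few pre-tokenizers from the rows fixed above; the exact offsets shown are approximate.

```python
from tokenizers import pre_tokenizers

# Punctuation: "Hello?" -> roughly [('Hello', (0, 5)), ('?', (5, 6))]
print(pre_tokenizers.Punctuation().pre_tokenize_str("Hello?"))

# Metaspace: "Hello there" -> roughly [('▁Hello', ...), ('▁there', ...)]
print(pre_tokenizers.Metaspace().pre_tokenize_str("Hello there"))

# CharDelimiterSplit on "x": "Helloxthere" -> roughly [('Hello', (0, 5)), ('there', (6, 11))]
print(pre_tokenizers.CharDelimiterSplit("x").pre_tokenize_str("Helloxthere"))
```
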
@@ -148,5 +148,5 @@ special characters or identifiers that need to be reverted for example.
| Name | Description |
| :--- | :--- |
| ByteLevel | Reverts the ByteLevel PreTokenizer. This PreTokenizer encodes at the byte-level, using a set of visible Unicode characters to represent each byte, so we need a Decoder to revert this process and get something readable again. |
-| Metaspace | Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifer `▁` to identify whitespaces, and so this Decoder helps with decoding these. |
+| Metaspace | Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifier `▁` to identify whitespaces, and so this Decoder helps with decoding these. |
| WordPiece | Reverts the WordPiece Model. This model uses a special identifier `##` for continuing subwords, and so this Decoder helps with decoding these. |
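
As an aside (not part of this commit): a minimal Python sketch of the Metaspace and WordPiece decoders from this table, assuming their default settings.

```python
from tokenizers import decoders

print(decoders.Metaspace().decode(["▁Hello", "▁there"]))      # expected: "Hello there"
print(decoders.WordPiece().decode(["Hel", "##lo", "there"]))  # expected: "Hello there"
```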

@@ -32,7 +32,7 @@ as running:
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
```
-Or you can easiy update it with the following command:
+Or you can easily update it with the following command:
```bash
rustup update

@@ -290,7 +290,7 @@ The role of the model is to split your "words" into tokens, using the
rules it has learned. It's also responsible for mapping those tokens to
their corresponding IDs in the vocabulary of the model.
-This model is passed along when intializing the
+This model is passed along when initializing the
`Tokenizer` so you already know how to
customize this part. Currently, the 🤗 Tokenizers library supports:
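
For context (not part of this commit): a minimal sketch of passing a model when initializing the `Tokenizer`, as the quicktour paragraph above describes.

```python
from tokenizers import Tokenizer
from tokenizers.models import BPE

# The model is handed to the Tokenizer at construction time.
tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
```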

@@ -132,14 +132,14 @@ The ``Normalizer`` is optional.
- Removes all accent symbols in unicode (to be used with NFD for consistency)
- Input: ``é``
-  Ouput: ``e``
+  Output: ``e``
* - Replace
- Replaces a custom string or regexp and changes it with given content
- ``Replace("a", "e")`` will behave like this:
  Input: ``"banana"``
-  Ouput: ``"benene"``
+  Output: ``"benene"``
* - BertNormalizer
- Provides an implementation of the Normalizer used in the original BERT. Options
@@ -193,7 +193,7 @@ the ByteLevel)
- Input: ``"Hello my friend, how are you?"``
-  Ouput: ``"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"``
+  Output: ``"Hello", "Ġmy", Ġfriend", ",", "Ġhow", "Ġare", "Ġyou", "?"``
* - Whitespace
- Splits on word boundaries (using the following regular expression: ``\w+|[^\w\s]+``
@@ -211,13 +211,13 @@ the ByteLevel)
- Will isolate all punctuation characters
- Input: ``"Hello?"``
-  Ouput: ``"Hello", "?"``
+  Output: ``"Hello", "?"``
* - Metaspace
- Splits on whitespaces and replaces them with a special char "▁" (U+2581)
- Input: ``"Hello there"``
-  Ouput: ``"Hello", "▁there"``
+  Output: ``"Hello", "▁there"``
* - CharDelimiterSplit
- Splits on a given character
@@ -225,7 +225,7 @@ the ByteLevel)
  Input: ``"Helloxthere"``
-  Ouput: ``"Hello", "there"``
+  Output: ``"Hello", "there"``
* - Digits
- Splits the numbers from any other characters.
@@ -361,7 +361,7 @@ reverted for example.
  a set of visible Unicode characters to represent each byte, so we need a Decoder to
  revert this process and get something readable again.
* - Metaspace
-  - Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifer ``▁`` to
+  - Reverts the Metaspace PreTokenizer. This PreTokenizer uses a special identifier ``▁`` to
  identify whitespaces, and so this Decoder helps with decoding these.
* - WordPiece
- Reverts the WordPiece Model. This model uses a special identifier ``##`` for continuing

@@ -24,7 +24,7 @@ If you are using a unix based OS, the installation should be as simple as runnin
    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh
-Or you can easiy update it with the following command::
+Or you can easily update it with the following command::
    rustup update

@@ -253,7 +253,7 @@ been trained if you are using a pretrained tokenizer).
The role of the model is to split your "words" into tokens, using the rules it has learned. It's
also responsible for mapping those tokens to their corresponding IDs in the vocabulary of the model.
-This model is passed along when intializing the :entity:`Tokenizer` so you already know
+This model is passed along when initializing the :entity:`Tokenizer` so you already know
how to customize this part. Currently, the 🤗 Tokenizers library supports:
- :entity:`models.BPE`

@@ -62,7 +62,7 @@ special tokens and/or added tokens in the sequence).
- [#363]: Fix panic from unwrapping `File::open` in `count_words`
### Changed
-- [#234]: Completely changed the alignement mappings available on `Encoding`. Previous mappings
+- [#234]: Completely changed the alignment mappings available on `Encoding`. Previous mappings
  were misleading and only providing offsets. New ones provide methods to easily convert between
  `char` or `word` (input space) and `token` (output space)
- [#236]: `AddedToken` with special options like `rstrip` will keep the matched whitespaces
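
For context (not part of this commit): a rough sketch of the kind of alignment methods on `Encoding` that the [#234] entry refers to; the method names are assumed from the current Python API, and the model name is only an example.

```python
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")
encoding = tokenizer.encode("Hello, how are you?")

print(encoding.tokens)
print(encoding.token_to_word(2))  # which input word the token at position 2 belongs to
print(encoding.char_to_token(7))  # which token covers character 7 of the input
print(encoding.word_to_chars(1))  # character span of word 1
```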

@@ -6,7 +6,7 @@ use serde::{Deserialize, Serialize};
#[derive(Deserialize, Clone, Debug, Serialize, Default)]
/// ByteFallback is a simple trick which converts tokens looking like `<0x61>`
/// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get � instead for each inconvertable byte token
+/// cannot be decoded you will get � instead for each inconvertible byte token
#[non_exhaustive]
pub struct ByteFallback {
    #[serde(rename = "type")]

@@ -7,7 +7,7 @@ use serde::{Deserialize, Serialize};
#[derive(Debug, Clone, Serialize, Deserialize)]
/// The CTC (Connectionist Temporal Classification) decoder takes care
/// of sanitizing a list of inputs token.
-/// Due to some alignement problem the output of some models can come
+/// Due to some alignment problem the output of some models can come
/// with duplicated token.
#[serde(tag = "type")]
#[non_exhaustive]
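
For context (not part of this commit): a minimal sketch of the de-duplication this doc comment describes, using the Python `decoders.CTC` binding with its default settings (assumed to be pad_token="<pad>", word_delimiter_token="|", cleanup=True).

```python
from tokenizers import decoders

decoder = decoders.CTC()
tokens = ["<pad>", "h", "h", "e", "l", "l", "<pad>", "l", "o"]
print(decoder.decode(tokens))  # expected to collapse repeats and drop pads, yielding "hello"
```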

@@ -5,7 +5,7 @@ use serde::{Deserialize, Serialize};
#[derive(Deserialize, Clone, Debug, Serialize, Default)]
/// Strip is a simple trick which converts tokens looking like `<0x61>`
/// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get � instead for each inconvertable byte token
+/// cannot be decoded you will get � instead for each inconvertible byte token
#[serde(tag = "type")]
#[non_exhaustive]
pub struct Strip {

@@ -508,7 +508,7 @@ impl TemplateProcessing {
            }
            Piece::SpecialToken { id, type_id } => {
                if add_special_tokens {
-                    let tok = &self.special_tokens.0[id]; // We already checked existance above
+                    let tok = &self.special_tokens.0[id]; // We already checked existence above
                    let len = tok.ids.len();
                    let encoding = Encoding::new(
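
For context (not part of this commit): a typical `TemplateProcessing` configuration from the library's documented Python API; the `special_tokens` list here is what the existence check in the hunk above refers to.

```python
from tokenizers.processors import TemplateProcessing

# Each special token used in the templates must be declared with its id,
# otherwise building the processor fails before this code path is reached.
post_processor = TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[("[CLS]", 1), ("[SEP]", 2)],
)
```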

@@ -195,9 +195,9 @@ impl NormalizedString {
        });
        match (start, end) {
-            // Targetting inexistant beginning
+            // Targeting inexistant beginning
            (Some(s), None) => Some(s..s),
-            // Targetting inexistant end
+            // Targeting inexistant end
            (None, Some(e)) => Some(e..e),
            // Found the range
            (Some(s), Some(e)) => Some(s..e),

@@ -3,7 +3,7 @@ use hf_hub::{api::sync::ApiBuilder, Repo, RepoType};
use std::collections::HashMap;
use std::path::PathBuf;
-/// Defines the aditional parameters available for the `from_pretrained` function
+/// Defines the additional parameters available for the `from_pretrained` function
#[derive(Debug, Clone)]
pub struct FromPretrainedParameters {
    pub revision: String,

@@ -136,7 +136,7 @@ pub fn truncate_encodings(
        n2 = n1 + params.max_length % 2;
    }
-    // Swap lengths if we swapped previosuly
+    // Swap lengths if we swapped previously
    if swap {
        mem::swap(&mut n1, &mut n2);
    }