mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Fix typos (#1715)
* Fix typos

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>

* Update docs/source/quicktour.rst

* Update docs/source-doc-builder/quicktour.mdx

---------

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
@@ -49,7 +49,7 @@ class CustomNormalizer:
     def normalize(self, normalized: NormalizedString):
         # Most of these can be replaced by a `Sequence` combining some provided Normalizer,
         # (ie Sequence([ NFKC(), Replace(Regex("\s+"), " "), Lowercase() ])
-        # and it should be the prefered way. That being said, here is an example of the kind
+        # and it should be the preferred way. That being said, here is an example of the kind
         # of things that can be done here:
         normalized.nfkc()
         normalized.filter(lambda char: not char.isnumeric())
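
The comment fixed above recommends the built-in `Sequence` normalizer over a custom class. A minimal sketch of that preferred route with the Python bindings (the sample string is only for illustration):

    from tokenizers import Regex
    from tokenizers.normalizers import NFKC, Lowercase, Replace, Sequence

    # NFKC normalization, whitespace collapsing, then lowercasing, as in the comment above
    normalizer = Sequence([NFKC(), Replace(Regex(r"\s+"), " "), Lowercase()])
    print(normalizer.normalize_str("HÉLLO   WORLD\t42"))  # -> "héllo world 42"
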
@@ -57,7 +57,7 @@ class ByteFallback(Decoder):
     ByteFallback Decoder
     ByteFallback is a simple trick which converts tokens looking like `<0x61>`
     to pure bytes, and attempts to make them into a string. If the tokens
-    cannot be decoded you will get � instead for each inconvertable byte token
+    cannot be decoded you will get � instead for each inconvertible byte token
 
     """
     def __init__(self):
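
The docstring corrected here describes how the decoder behaves; a small usage sketch, assuming `ByteFallback` is exposed in the installed `tokenizers` version (it is in recent releases):

    from tokenizers.decoders import ByteFallback

    decoder = ByteFallback()
    # <0xNN> tokens are converted back to raw bytes and decoded as UTF-8
    print(decoder.decode(["<0x61>", "<0x62>", "<0x63>"]))  # -> "abc"
    # a byte sequence that is not valid UTF-8 falls back to the replacement character
    print(decoder.decode(["<0xE5>"]))  # -> "�"
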
@@ -389,7 +389,7 @@ class Nmt(Normalizer):
 class Precompiled(Normalizer):
     """
     Precompiled normalizer
-    Don't use manually it is used for compatiblity for SentencePiece.
+    Don't use manually it is used for compatibility for SentencePiece.
     """
     def __init__(self, precompiled_charsmap):
         pass
@@ -48,7 +48,7 @@ class BertPreTokenizer(PreTokenizer):
     BertPreTokenizer
 
     This pre-tokenizer splits tokens on spaces, and also on punctuation.
-    Each occurence of a punctuation character will be treated separately.
+    Each occurrence of a punctuation character will be treated separately.
     """
     def __init__(self):
         pass
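
A short sketch of the splitting behaviour that docstring describes, using the public `pre_tokenize_str` helper (the output comment is abbreviated):

    from tokenizers.pre_tokenizers import BertPreTokenizer

    pre_tok = BertPreTokenizer()
    print(pre_tok.pre_tokenize_str("Hey friend!  How are you?"))
    # [('Hey', (0, 3)), ('friend', (4, 10)), ('!', (10, 11)), ('How', (13, 16)), ...]
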
@@ -325,7 +325,7 @@ class EncodingVisualizer:
 
         Returns:
             A list of length len(text) whose entry at index i is None if there is no annotation on
-            charachter i or k, the index of the annotation that covers index i where k is with
+            character i or k, the index of the annotation that covers index i where k is with
             respect to the list of annotations
         """
        annotation_map = [None] * len(text)
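
The return value described in that docstring is a per-character index into the annotation list. A hedged sketch of how such a map can be built, assuming each annotation carries `start`/`end` character offsets (end exclusive), as the visualizer's `Annotation` objects do:

    def make_annotation_map(text, annotations):
        # entry i is None when no annotation covers character i, otherwise the
        # index k of the annotation that covers it
        annotation_map = [None] * len(text)
        for k, anno in enumerate(annotations):
            for i in range(anno.start, anno.end):
                annotation_map[i] = k
        return annotation_map
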
@@ -263,7 +263,7 @@ impl PyWordPieceDec {
 /// ByteFallback Decoder
 /// ByteFallback is a simple trick which converts tokens looking like `<0x61>`
 /// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get � instead for each inconvertable byte token
+/// cannot be decoded you will get � instead for each inconvertible byte token
 ///
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")]
 pub struct PyByteFallbackDec {}
@@ -23,7 +23,7 @@ use pyo3::wrap_pymodule;
 pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 
 // For users using multiprocessing in python, it is quite easy to fork the process running
-// tokenizers, ending up with a deadlock because we internaly make use of multithreading. So
+// tokenizers, ending up with a deadlock because we internally make use of multithreading. So
 // we register a callback to be called in the event of a fork so that we can warn the user.
 #[cfg(target_family = "unix")]
 static mut REGISTERED_FORK_CALLBACK: bool = false;
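
On the Python side, the usual way to avoid the fork/deadlock scenario this comment warns about is to disable the library's internal parallelism before forking workers; `TOKENIZERS_PARALLELISM` is the environment variable the bindings check. A minimal sketch:

    import os

    # must be set before the tokenizer does any parallel work and before forking
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    from multiprocessing import Pool

    def worker(text):
        # each worker would load/build its own tokenizer here and encode `text`
        return len(text)

    if __name__ == "__main__":
        with Pool(2) as pool:
            print(pool.map(worker, ["hello", "world"]))
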
@@ -534,7 +534,7 @@ impl PyNmt {
 }
 
 /// Precompiled normalizer
-/// Don't use manually it is used for compatiblity for SentencePiece.
+/// Don't use manually it is used for compatibility for SentencePiece.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
 pub struct PyPrecompiled {}
 #[pymethods]
@@ -430,7 +430,7 @@ impl PyCharDelimiterSplit {
 /// BertPreTokenizer
 ///
 /// This pre-tokenizer splits tokens on spaces, and also on punctuation.
-/// Each occurence of a punctuation character will be treated separately.
+/// Each occurrence of a punctuation character will be treated separately.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
 pub struct PyBertPreTokenizer {}
 #[pymethods]
@@ -100,7 +100,7 @@ def pyi_file(obj, indent=""):
         string += function(obj, indent)
 
     elif inspect.isgetsetdescriptor(obj):
-        # TODO it would be interesing to add the setter maybe ?
+        # TODO it would be interesting to add the setter maybe ?
         string += f"{indent}@property\n"
         string += function(obj, indent, text_signature="(self)")
     else:
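
For context on the branch touched here: properties defined by the pyo3 extension are seen by `inspect` as getset descriptors, which is what makes `stub.py` emit an `@property` stub for them. A small, hedged illustration (the exact attribute names depend on the installed version):

    import inspect
    from tokenizers import Tokenizer

    # list the extension-defined properties that stub.py would render with @property
    getset_attrs = [name for name, obj in vars(Tokenizer).items() if inspect.isgetsetdescriptor(obj)]
    print(getset_attrs)  # e.g. property-like attributes such as padding/truncation
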
@@ -287,7 +287,7 @@ class TestUnigram:
         trainer.initial_alphabet = ["d", "z"]
         assert sorted(trainer.initial_alphabet) == ["d", "z"]
 
-    def test_continuing_prefix_trainer_mistmatch(self):
+    def test_continuing_prefix_trainer_mismatch(self):
         UNK = "[UNK]"
         special_tokens = [UNK]
         tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
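
The renamed test exercises the interaction between the model's and the trainer's `continuing_subword_prefix`. A hedged sketch of the matching (non-mismatched) setup, with a made-up toy corpus:

    from tokenizers import Tokenizer, models, trainers

    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]", continuing_subword_prefix="##"))
    trainer = trainers.BpeTrainer(
        special_tokens=["[UNK]"],
        continuing_subword_prefix="##",  # keep this aligned with the model to avoid the mismatch
    )
    tokenizer.train_from_iterator(["a tiny toy corpus", "for illustration only"], trainer=trainer)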