* Fix typos

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>

* Update docs/source/quicktour.rst

* Update docs/source-doc-builder/quicktour.mdx

---------

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
Author: tinyboxvk
Date: 2025-01-09 06:53:20 -04:00
Committed by: GitHub
Parent: 6945933829
Commit: bdfc38b78d

25 changed files with 50 additions and 50 deletions


@@ -49,7 +49,7 @@ class CustomNormalizer:
     def normalize(self, normalized: NormalizedString):
         # Most of these can be replaced by a `Sequence` combining some provided Normalizer,
         # (ie Sequence([ NFKC(), Replace(Regex("\s+"), " "), Lowercase() ])
-        # and it should be the prefered way. That being said, here is an example of the kind
+        # and it should be the preferred way. That being said, here is an example of the kind
         # of things that can be done here:
         normalized.nfkc()
         normalized.filter(lambda char: not char.isnumeric())
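
Note: the built-in pipeline this comment points to can be assembled directly from provided normalizers. A minimal sketch using the public `tokenizers` API (it covers only the `Sequence` part, not the numeric filter above):

    from tokenizers import Regex
    from tokenizers.normalizers import Lowercase, NFKC, Replace, Sequence

    # NFKC-normalize, collapse runs of whitespace, then lowercase.
    normalizer = Sequence([NFKC(), Replace(Regex(r"\s+"), " "), Lowercase()])
    print(normalizer.normalize_str("Héllo\t\tWORLD"))  # "héllo world"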


@@ -57,7 +57,7 @@ class ByteFallback(Decoder):
     ByteFallback Decoder
     ByteFallback is a simple trick which converts tokens looking like `<0x61>`
     to pure bytes, and attempts to make them into a string. If the tokens
-    cannot be decoded you will get � instead for each inconvertable byte token
+    cannot be decoded you will get � instead for each inconvertible byte token
     """
     def __init__(self):
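
Note: a quick illustration of the behavior this docstring describes, using the public `tokenizers` API:

    from tokenizers.decoders import ByteFallback

    decoder = ByteFallback()
    # "<0x61>" and "<0x62>" are the UTF-8 bytes for "a" and "b".
    print(decoder.decode(["<0x61>", "<0x62>"]))  # "ab"
    # A lone "<0xE9>" is not valid UTF-8 by itself, so it falls back to "�".
    print(decoder.decode(["<0xE9>"]))  # "�"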


@@ -389,7 +389,7 @@ class Nmt(Normalizer):
 class Precompiled(Normalizer):
     """
     Precompiled normalizer
-    Don't use manually it is used for compatiblity for SentencePiece.
+    Don't use manually it is used for compatibility for SentencePiece.
     """
     def __init__(self, precompiled_charsmap):
         pass


@@ -48,7 +48,7 @@ class BertPreTokenizer(PreTokenizer):
     BertPreTokenizer
     This pre-tokenizer splits tokens on spaces, and also on punctuation.
-    Each occurence of a punctuation character will be treated separately.
+    Each occurrence of a punctuation character will be treated separately.
     """
     def __init__(self):
         pass
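
Note: the split behavior described here is easy to see on a small example (public `tokenizers` API):

    from tokenizers.pre_tokenizers import BertPreTokenizer

    pre_tokenizer = BertPreTokenizer()
    # Punctuation is split off, one piece per punctuation character.
    print(pre_tokenizer.pre_tokenize_str("Hey friend!?"))
    # [('Hey', (0, 3)), ('friend', (4, 10)), ('!', (10, 11)), ('?', (11, 12))]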


@@ -325,7 +325,7 @@ class EncodingVisualizer:
         Returns:
             A list of length len(text) whose entry at index i is None if there is no annotation on
-            charachter i or k, the index of the annotation that covers index i where k is with
+            character i or k, the index of the annotation that covers index i where k is with
             respect to the list of annotations
         """
         annotation_map = [None] * len(text)
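
Note: for intuition, such a map can be filled with a single pass over the annotations. A hedged sketch; the `start`/`end` attribute names here are assumptions for illustration, not necessarily the visualizer's actual annotation fields:

    annotation_map = [None] * len(text)
    # Later annotations win ties, since they overwrite earlier indices.
    for k, annotation in enumerate(annotations):
        for i in range(annotation.start, annotation.end):
            annotation_map[i] = k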


@@ -263,7 +263,7 @@ impl PyWordPieceDec {
 /// ByteFallback Decoder
 /// ByteFallback is a simple trick which converts tokens looking like `<0x61>`
 /// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get � instead for each inconvertable byte token
+/// cannot be decoded you will get � instead for each inconvertible byte token
 ///
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")]
 pub struct PyByteFallbackDec {}


@@ -23,7 +23,7 @@ use pyo3::wrap_pymodule;
 pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 // For users using multiprocessing in python, it is quite easy to fork the process running
-// tokenizers, ending up with a deadlock because we internaly make use of multithreading. So
+// tokenizers, ending up with a deadlock because we internally make use of multithreading. So
 // we register a callback to be called in the event of a fork so that we can warn the user.
 #[cfg(target_family = "unix")]
 static mut REGISTERED_FORK_CALLBACK: bool = false;
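
Note: the deadlock this comment guards against usually appears with multiprocessing's default "fork" start method on Linux. Two common mitigations on the Python side (a sketch of usage patterns, not the library's prescribed fix):

    import multiprocessing
    import os

    # Disable tokenizers' internal thread pool before any fork happens...
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # ...or sidestep fork() entirely with freshly spawned interpreters.
    if __name__ == "__main__":
        multiprocessing.set_start_method("spawn")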


@@ -534,7 +534,7 @@ impl PyNmt {
 }
 /// Precompiled normalizer
-/// Don't use manually it is used for compatiblity for SentencePiece.
+/// Don't use manually it is used for compatibility for SentencePiece.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
 pub struct PyPrecompiled {}
 #[pymethods]


@@ -430,7 +430,7 @@ impl PyCharDelimiterSplit {
 /// BertPreTokenizer
 ///
 /// This pre-tokenizer splits tokens on spaces, and also on punctuation.
-/// Each occurence of a punctuation character will be treated separately.
+/// Each occurrence of a punctuation character will be treated separately.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
 pub struct PyBertPreTokenizer {}
 #[pymethods]


@@ -100,7 +100,7 @@ def pyi_file(obj, indent=""):
         string += function(obj, indent)
     elif inspect.isgetsetdescriptor(obj):
-        # TODO it would be interesing to add the setter maybe ?
+        # TODO it would be interesting to add the setter maybe ?
         string += f"{indent}@property\n"
         string += function(obj, indent, text_signature="(self)")
     else:
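
Note: getset descriptors are attributes that extension types define in C (via PyGetSetDef), which is why the stub generator renders them as @property entries. A quick way to see one:

    import inspect
    import types

    # f_locals on frame objects is defined in C, so it shows up as a
    # getset descriptor rather than a regular Python property.
    print(inspect.isgetsetdescriptor(types.FrameType.f_locals))  # True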


@@ -287,7 +287,7 @@ class TestUnigram:
         trainer.initial_alphabet = ["d", "z"]
         assert sorted(trainer.initial_alphabet) == ["d", "z"]
-    def test_continuing_prefix_trainer_mistmatch(self):
+    def test_continuing_prefix_trainer_mismatch(self):
         UNK = "[UNK]"
         special_tokens = [UNK]
         tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))