mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
Fix typos (#1715)
* Fix typos

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>

* Update docs/source/quicktour.rst

* Update docs/source-doc-builder/quicktour.mdx

---------

Signed-off-by: tinyboxvk <13696594+tinyboxvk@users.noreply.github.com>
Co-authored-by: Nicolas Patry <patry.nicolas@protonmail.com>
@@ -49,7 +49,7 @@ class CustomNormalizer:
     def normalize(self, normalized: NormalizedString):
         # Most of these can be replaced by a `Sequence` combining some provided Normalizer,
         # (ie Sequence([ NFKC(), Replace(Regex("\s+"), " "), Lowercase() ])
-        # and it should be the prefered way. That being said, here is an example of the kind
+        # and it should be the preferred way. That being said, here is an example of the kind
         # of things that can be done here:
         normalized.nfkc()
         normalized.filter(lambda char: not char.isnumeric())
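
The comment fixed above recommends the built-in `Sequence` normalizer over a custom class. A minimal sketch of that preferred route with the Python bindings (the sample string is only for illustration):

    from tokenizers import Regex
    from tokenizers.normalizers import NFKC, Lowercase, Replace, Sequence

    # NFKC normalization, whitespace collapsing, then lowercasing, as in the comment above
    normalizer = Sequence([NFKC(), Replace(Regex(r"\s+"), " "), Lowercase()])
    print(normalizer.normalize_str("HÉLLO   WORLD\t42"))  # -> "héllo world 42"
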
@@ -57,7 +57,7 @@ class ByteFallback(Decoder):
     ByteFallback Decoder
     ByteFallback is a simple trick which converts tokens looking like `<0x61>`
     to pure bytes, and attempts to make them into a string. If the tokens
-    cannot be decoded you will get � instead for each inconvertable byte token
+    cannot be decoded you will get � instead for each inconvertible byte token
 
     """
     def __init__(self):
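
The docstring corrected here describes how the decoder behaves; a small usage sketch, assuming `ByteFallback` is exposed in the installed `tokenizers` version (it is in recent releases):

    from tokenizers.decoders import ByteFallback

    decoder = ByteFallback()
    # <0xNN> tokens are converted back to raw bytes and decoded as UTF-8
    print(decoder.decode(["<0x61>", "<0x62>", "<0x63>"]))  # -> "abc"
    # a byte sequence that is not valid UTF-8 falls back to the replacement character
    print(decoder.decode(["<0xE5>"]))  # -> "�"
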
@@ -389,7 +389,7 @@ class Nmt(Normalizer):
 class Precompiled(Normalizer):
     """
     Precompiled normalizer
-    Don't use manually it is used for compatiblity for SentencePiece.
+    Don't use manually it is used for compatibility for SentencePiece.
     """
     def __init__(self, precompiled_charsmap):
         pass
@@ -48,7 +48,7 @@ class BertPreTokenizer(PreTokenizer):
     BertPreTokenizer
 
     This pre-tokenizer splits tokens on spaces, and also on punctuation.
-    Each occurence of a punctuation character will be treated separately.
+    Each occurrence of a punctuation character will be treated separately.
     """
     def __init__(self):
         pass
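
A short sketch of the splitting behaviour that docstring describes, using the public `pre_tokenize_str` helper (the output comment is abbreviated):

    from tokenizers.pre_tokenizers import BertPreTokenizer

    pre_tok = BertPreTokenizer()
    print(pre_tok.pre_tokenize_str("Hey friend!  How are you?"))
    # [('Hey', (0, 3)), ('friend', (4, 10)), ('!', (10, 11)), ('How', (13, 16)), ...]
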
@@ -325,7 +325,7 @@ class EncodingVisualizer:
 
         Returns:
             A list of length len(text) whose entry at index i is None if there is no annotation on
-            charachter i or k, the index of the annotation that covers index i where k is with
+            character i or k, the index of the annotation that covers index i where k is with
             respect to the list of annotations
         """
        annotation_map = [None] * len(text)
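
The return value described in that docstring is a per-character index into the annotation list. A hedged sketch of how such a map can be built, assuming each annotation carries `start`/`end` character offsets (end exclusive), as the visualizer's `Annotation` objects do:

    def make_annotation_map(text, annotations):
        # entry i is None when no annotation covers character i, otherwise the
        # index k of the annotation that covers it
        annotation_map = [None] * len(text)
        for k, anno in enumerate(annotations):
            for i in range(anno.start, anno.end):
                annotation_map[i] = k
        return annotation_map
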
@@ -263,7 +263,7 @@ impl PyWordPieceDec {
 /// ByteFallback Decoder
 /// ByteFallback is a simple trick which converts tokens looking like `<0x61>`
 /// to pure bytes, and attempts to make them into a string. If the tokens
-/// cannot be decoded you will get � instead for each inconvertable byte token
+/// cannot be decoded you will get � instead for each inconvertible byte token
 ///
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "ByteFallback")]
 pub struct PyByteFallbackDec {}
@@ -23,7 +23,7 @@ use pyo3::wrap_pymodule;
 pub const VERSION: &str = env!("CARGO_PKG_VERSION");
 
 // For users using multiprocessing in python, it is quite easy to fork the process running
-// tokenizers, ending up with a deadlock because we internaly make use of multithreading. So
+// tokenizers, ending up with a deadlock because we internally make use of multithreading. So
 // we register a callback to be called in the event of a fork so that we can warn the user.
 #[cfg(target_family = "unix")]
 static mut REGISTERED_FORK_CALLBACK: bool = false;
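
On the Python side, the usual way to avoid the fork/deadlock scenario this comment warns about is to disable the library's internal parallelism before forking workers; `TOKENIZERS_PARALLELISM` is the environment variable the bindings check. A minimal sketch:

    import os

    # must be set before the tokenizer does any parallel work and before forking
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    from multiprocessing import Pool

    def worker(text):
        # each worker would load/build its own tokenizer here and encode `text`
        return len(text)

    if __name__ == "__main__":
        with Pool(2) as pool:
            print(pool.map(worker, ["hello", "world"]))
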
@@ -534,7 +534,7 @@ impl PyNmt {
 }
 
 /// Precompiled normalizer
-/// Don't use manually it is used for compatiblity for SentencePiece.
+/// Don't use manually it is used for compatibility for SentencePiece.
 #[pyclass(extends=PyNormalizer, module = "tokenizers.normalizers", name = "Precompiled")]
 pub struct PyPrecompiled {}
 #[pymethods]
@@ -430,7 +430,7 @@ impl PyCharDelimiterSplit {
 /// BertPreTokenizer
 ///
 /// This pre-tokenizer splits tokens on spaces, and also on punctuation.
-/// Each occurence of a punctuation character will be treated separately.
+/// Each occurrence of a punctuation character will be treated separately.
 #[pyclass(extends=PyPreTokenizer, module = "tokenizers.pre_tokenizers", name = "BertPreTokenizer")]
 pub struct PyBertPreTokenizer {}
 #[pymethods]
@@ -100,7 +100,7 @@ def pyi_file(obj, indent=""):
         string += function(obj, indent)
 
     elif inspect.isgetsetdescriptor(obj):
-        # TODO it would be interesing to add the setter maybe ?
+        # TODO it would be interesting to add the setter maybe ?
         string += f"{indent}@property\n"
         string += function(obj, indent, text_signature="(self)")
     else:
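
For context on the branch touched here: properties defined by the pyo3 extension are seen by `inspect` as getset descriptors, which is what makes `stub.py` emit an `@property` stub for them. A small, hedged illustration (the exact attribute names depend on the installed version):

    import inspect
    from tokenizers import Tokenizer

    # list the extension-defined properties that stub.py would render with @property
    getset_attrs = [name for name, obj in vars(Tokenizer).items() if inspect.isgetsetdescriptor(obj)]
    print(getset_attrs)  # e.g. property-like attributes such as padding/truncation
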
@@ -287,7 +287,7 @@ class TestUnigram:
         trainer.initial_alphabet = ["d", "z"]
         assert sorted(trainer.initial_alphabet) == ["d", "z"]
 
-    def test_continuing_prefix_trainer_mistmatch(self):
+    def test_continuing_prefix_trainer_mismatch(self):
         UNK = "[UNK]"
         special_tokens = [UNK]
         tokenizer = Tokenizer(models.BPE(unk_token=UNK, continuing_subword_prefix="##"))
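
The renamed test exercises the interaction between the model's and the trainer's `continuing_subword_prefix`. A hedged sketch of the matching (non-mismatched) setup, with a made-up toy corpus:

    from tokenizers import Tokenizer, models, trainers

    tokenizer = Tokenizer(models.BPE(unk_token="[UNK]", continuing_subword_prefix="##"))
    trainer = trainers.BpeTrainer(
        special_tokens=["[UNK]"],
        continuing_subword_prefix="##",  # keep this aligned with the model to avoid the mismatch
    )
    tokenizer.train_from_iterator(["a tiny toy corpus", "for illustration only"], trainer=trainer)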