mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00

Fix typos in strings and comments (#1770)

@@ -122,7 +122,7 @@ describe('Encoding', () => {
     expect(indexes).toEqual([3, 5])
   })

-  it('returns the corrent indexes with pair sequences', () => {
+  it('returns the correct indexes with pair sequences', () => {
     expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5])
     expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9])
   })

@@ -27,4 +27,4 @@ tempfile = "3.10"
 pyo3 = { version = "0.23", features = ["auto-initialize"] }

 [features]
-defaut = ["pyo3/extension-module"]
+default = ["pyo3/extension-module"]

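Note on the fix above: Cargo features are plain named keys, so the misspelled `defaut` was silently accepted as a separate feature that nothing ever requested, leaving the real `default` feature empty. Until this fix, `pyo3/extension-module` was therefore not enabled by default.
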
@@ -397,7 +397,7 @@ def main():
         "--models",
         type=lambda s: s.split(","),
         default=pretraineds,
-        help=f"The pretrained tokenizers you want to test agains, (default: {pretraineds})",
+        help=f"The pretrained tokenizers you want to test against, (default: {pretraineds})",
     )
     args = parser.parse_args()

@@ -404,7 +404,7 @@ impl PyMetaspaceDec {
 ///
 /// Args:
 ///     suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-///         The suffix that was used to caracterize an end-of-word. This suffix will
+///         The suffix that was used to characterize an end-of-word. This suffix will
 ///         be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
 pub struct PyBPEDecoder {}

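To make the corrected doc-comment concrete, here is a minimal sketch (not the library's actual decoder) of what "replaced by whitespaces during the decoding" means for an end-of-word suffix such as `</w>`:

    // Toy decoder: swap the end-of-word suffix for a space, then trim.
    fn decode_with_suffix(tokens: &[&str], suffix: &str) -> String {
        tokens
            .iter()
            .map(|t| t.replace(suffix, " "))
            .collect::<String>()
            .trim_end()
            .to_string()
    }

    fn main() {
        // "hello</w>" + "wor" + "ld</w>" -> "hello world"
        let tokens = ["hello</w>", "wor", "ld</w>"];
        assert_eq!(decode_with_suffix(&tokens, "</w>"), "hello world");
    }
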
@@ -221,7 +221,7 @@ pub struct BPE {
     pub unk_token: Option<String>,
     /// An optional prefix to use on any subword that exist only behind another one
     pub continuing_subword_prefix: Option<String>,
-    /// An optional suffix to caracterize and end-of-word subword
+    /// An optional suffix to characterize and end-of-word subword
     pub end_of_word_suffix: Option<String>,
     /// Do multiple unk tokens get fused
     pub fuse_unk: bool,

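For readers skimming the struct: a hedged illustration (the helper below is hypothetical, not library code) of what `continuing_subword_prefix` and `end_of_word_suffix` do to the pieces of one split word:

    // Decorate the subwords of a single word the way the two options describe.
    fn decorate(pieces: &[&str], prefix: Option<&str>, suffix: Option<&str>) -> Vec<String> {
        let last = pieces.len() - 1;
        pieces
            .iter()
            .enumerate()
            .map(|(i, p)| {
                let mut s = String::new();
                if i > 0 {
                    // every subword except the first sits "behind another one"
                    s.push_str(prefix.unwrap_or(""));
                }
                s.push_str(p);
                if i == last {
                    // only the final subword marks the end of the word
                    s.push_str(suffix.unwrap_or(""));
                }
                s
            })
            .collect()
    }

    fn main() {
        // WordPiece-style continuing prefix
        assert_eq!(
            decorate(&["un", "believ", "able"], Some("##"), None),
            vec!["un", "##believ", "##able"]
        );
        // BPE-style end-of-word suffix
        assert_eq!(
            decorate(&["un", "believ", "able"], None, Some("</w>")),
            vec!["un", "believ", "able</w>"]
        );
    }
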
@@ -190,7 +190,7 @@ pub struct BpeTrainer {
     pub initial_alphabet: HashSet<char>,
     /// An optional prefix to use on any subword that exist only behind another one
     pub continuing_subword_prefix: Option<String>,
-    /// An optional suffix to caracterize and end-of-word subword
+    /// An optional suffix to characterize and end-of-word subword
     pub end_of_word_suffix: Option<String>,
     /// An optional parameter to limit the max length of any single token
     pub max_token_length: Option<usize>,

@@ -401,7 +401,7 @@ impl UnigramTrainer {

             let logsum_alt = (sum + freq[id] * (alternatives.len() - 1) as f64).ln();

-            // The frequencies of altenatives are increased by freq[i].
+            // The frequencies of alternatives are increased by freq[i].
             let mut logprob_alt = 0.0;
             for n in &alternatives[id] {
                 logprob_alt += (freq[*n] + freq[id]).ln() - logsum_alt;

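Transcribing those two lines into math (my reading of the code, with S = `sum`, f = `freq`, A_id = `alternatives[id]`, and L = `alternatives.len()`): dropping piece `id` shifts its frequency onto each of its alternatives, so the log-probability mass of the alternatives becomes

    \log Z_{\text{alt}} = \ln\bigl(S + f_{id}\,(L - 1)\bigr),
    \qquad
    \log P_{\text{alt}} = \sum_{n \in A_{id}} \Bigl(\ln(f_n + f_{id}) - \log Z_{\text{alt}}\Bigr)
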
@@ -73,7 +73,7 @@ impl WordLevelBuilder {
         self
     }

-    /// Contructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
+    /// Constructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordLevel> {
         if let Some(vocab) = self.config.files {
             self.config.vocab = WordLevel::read_file(&vocab)?;

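A hypothetical usage sketch of the builder; the `files` setter name is inferred from the `self.config.files` read above and is not verified against the full source:

    // Build a WordLevel model from a vocab file via the builder (assumed setter).
    let model = WordLevel::builder()
        .files("vocab.json".into())
        .build()?;
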
@@ -93,7 +93,7 @@ impl WordPieceBuilder {
         self
     }

-    /// Contructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
+    /// Constructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordPiece> {
         if let Some(vocab) = self.config.files {
             self.config.vocab = WordPiece::read_file(&vocab)?;

@@ -170,7 +170,7 @@ impl WordPieceTrainer {
         // Transfer the vocab
         model.vocab = new_wordpiece.vocab;
         model.vocab_r = new_wordpiece.vocab_r;
-        // The continuing_subword_prefix is the only other option to be overriden by the trainer
+        // The continuing_subword_prefix is the only other option to be overridden by the trainer
         model.continuing_subword_prefix = new_wordpiece.continuing_subword_prefix;

         Ok(special_tokens)

@@ -12,7 +12,7 @@ fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &
     transformations.extend(new_part.chars().map(|c| (c, 0)));

     match diff.cmp(&0) {
-        // If we are adding some characters, the last DIFF characters shoud be == 1
+        // If we are adding some characters, the last DIFF characters should be == 1
         Ordering::Greater => {
             transformations
                 .iter_mut()

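A toy version of the rule the corrected comment states, assuming the convention that each `(char, isize)` pair records how much that output character changes the string's length:

    use std::cmp::Ordering;

    // Append the replacement characters with a neutral `0` change, then,
    // if the replacement is longer than what it replaces (diff > 0),
    // mark the last `diff` entries with `1`.
    fn replace_toy(transformations: &mut Vec<(char, isize)>, old_len: usize, new_part: &str) {
        let diff = new_part.chars().count() as isize - old_len as isize;
        transformations.extend(new_part.chars().map(|c| (c, 0)));
        if let Ordering::Greater = diff.cmp(&0) {
            let n = transformations.len();
            for t in &mut transformations[n - diff as usize..] {
                t.1 = 1; // each extra character grows the string by one
            }
        }
    }

    fn main() {
        let mut tr: Vec<(char, isize)> = Vec::new();
        // Replace a 1-char span with "abc": two characters were added.
        replace_toy(&mut tr, 1, "abc");
        assert_eq!(tr, vec![('a', 0), ('b', 1), ('c', 1)]);
    }
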
@@ -29,7 +29,7 @@ pub struct AddedToken {
 }

 impl AddedToken {
-    /// Build this token from the given content, specifying if it is intented to be a
+    /// Build this token from the given content, specifying if it is intended to be a
     /// special token. Special tokens are not normalized by default.
     pub fn from<S: Into<String>>(content: S, special: bool) -> Self {
         Self {
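Usage follows directly from the signature shown in the hunk, for example:

    // A special token (not normalized by default) and an ordinary one.
    let mask = AddedToken::from("[MASK]", true);
    let word = AddedToken::from("hello", false);
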
@@ -389,7 +389,7 @@ where
         self
     }

-    /// Set the trunaction parameters.
+    /// Set the truncation parameters.
     #[must_use]
     pub fn with_truncation(mut self, trunc: Option<TruncationParams>) -> Self {
         self.truncation = trunc;
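A hedged usage sketch, assuming `TruncationParams` implements `Default`; the `max_length` field name is an assumption here, taken from the library's params struct rather than from this hunk:

    // Enable truncation at 128 tokens; pass `None` to disable it entirely.
    let tokenizer = tokenizer.with_truncation(Some(TruncationParams {
        max_length: 128, // assumed field name
        ..Default::default()
    }));
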
@@ -201,9 +201,9 @@ impl NormalizedString {
         });

         match (start, end) {
-            // Targeting inexistant beginning
+            // Targeting inexistent beginning
             (Some(s), None) => Some(s..s),
-            // Targeting inexistant end
+            // Targeting inexistent end
             (None, Some(e)) => Some(e..e),
             // Found the range
             (Some(s), Some(e)) => Some(s..e),

@@ -122,7 +122,7 @@ where
     }
 }

-/// Invert the `is_match` flags for the wrapped Pattern. This is usefull
+/// Invert the `is_match` flags for the wrapped Pattern. This is useful
 /// for example when we use a regex that matches words instead of a delimiter,
 /// and we want to match the delimiter.
 pub struct Invert<P: Pattern>(pub P);

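A simplified sketch of the inversion described above; the `Pattern` trait here is hypothetical (the real trait's signature is not shown in this hunk), but it captures the flag-flipping idea:

    // A match is an offset range plus an `is_match` flag. Inverting flips
    // every flag, so a word-matching regex can be reused to mark the
    // delimiters between words instead.
    trait Pattern {
        fn find_matches(&self, inside: &str) -> Vec<((usize, usize), bool)>;
    }

    struct Invert<P: Pattern>(P);

    impl<P: Pattern> Pattern for Invert<P> {
        fn find_matches(&self, inside: &str) -> Vec<((usize, usize), bool)> {
            self.0
                .find_matches(inside)
                .into_iter()
                .map(|(offsets, is_match)| (offsets, !is_match))
                .collect()
        }
    }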