Fix typos in strings and comments (#1770)

co63oc
2025-05-27 14:17:36 +08:00
committed by GitHub
parent 67db0cd1dd
commit f1faec1756
15 changed files with 16 additions and 16 deletions

@@ -122,7 +122,7 @@ describe('Encoding', () => {
     expect(indexes).toEqual([3, 5])
   })
-  it('returns the corrent indexes with pair sequences', () => {
+  it('returns the correct indexes with pair sequences', () => {
     expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5])
     expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9])
   })

@@ -27,4 +27,4 @@ tempfile = "3.10"
 pyo3 = { version = "0.23", features = ["auto-initialize"] }
 [features]
-defaut = ["pyo3/extension-module"]
+default = ["pyo3/extension-module"]

@@ -397,7 +397,7 @@ def main():
         "--models",
         type=lambda s: s.split(","),
         default=pretraineds,
-        help=f"The pretrained tokenizers you want to test agains, (default: {pretraineds})",
+        help=f"The pretrained tokenizers you want to test against, (default: {pretraineds})",
     )
     args = parser.parse_args()

@@ -404,7 +404,7 @@ impl PyMetaspaceDec {
 ///
 /// Args:
 ///     suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-///         The suffix that was used to caracterize an end-of-word. This suffix will
+///         The suffix that was used to characterize an end-of-word. This suffix will
 ///         be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
 pub struct PyBPEDecoder {}

@@ -221,7 +221,7 @@ pub struct BPE {
     pub unk_token: Option<String>,
     /// An optional prefix to use on any subword that exist only behind another one
     pub continuing_subword_prefix: Option<String>,
-    /// An optional suffix to caracterize and end-of-word subword
+    /// An optional suffix to characterize and end-of-word subword
     pub end_of_word_suffix: Option<String>,
     /// Do multiple unk tokens get fused
     pub fuse_unk: bool,
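
For context, these two fields only change how subword pieces are written out: pieces that continue a word get the prefix, and the piece that closes a word gets the suffix. A rough, self-contained illustration with made-up pieces (not the actual BPE code; the same pair of fields shows up on BpeTrainer below):

    /// Decorate the pieces of one word the way the two options above describe.
    fn decorate(pieces: &[&str], prefix: Option<&str>, suffix: Option<&str>) -> Vec<String> {
        let last = pieces.len().saturating_sub(1);
        pieces
            .iter()
            .enumerate()
            .map(|(i, p)| {
                let mut piece = String::new();
                if i > 0 {
                    // Every piece after the first continues a word.
                    piece.push_str(prefix.unwrap_or(""));
                }
                piece.push_str(p);
                if i == last {
                    // The last piece characterizes the end of the word.
                    piece.push_str(suffix.unwrap_or(""));
                }
                piece
            })
            .collect()
    }

    fn main() {
        // Prints ["hug", "##gi", "##ng</w>"]
        println!("{:?}", decorate(&["hug", "gi", "ng"], Some("##"), Some("</w>")));
    }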

@@ -190,7 +190,7 @@ pub struct BpeTrainer {
     pub initial_alphabet: HashSet<char>,
     /// An optional prefix to use on any subword that exist only behind another one
     pub continuing_subword_prefix: Option<String>,
-    /// An optional suffix to caracterize and end-of-word subword
+    /// An optional suffix to characterize and end-of-word subword
     pub end_of_word_suffix: Option<String>,
     /// An optional parameter to limit the max length of any single token
     pub max_token_length: Option<usize>,

@@ -401,7 +401,7 @@ impl UnigramTrainer {
         let logsum_alt = (sum + freq[id] * (alternatives.len() - 1) as f64).ln();
-        // The frequencies of altenatives are increased by freq[i].
+        // The frequencies of alternatives are increased by freq[i].
         let mut logprob_alt = 0.0;
         for n in &alternatives[id] {
             logprob_alt += (freq[*n] + freq[id]).ln() - logsum_alt;
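
To make the fixed comment concrete: when a piece is considered for pruning, its occurrences would be absorbed by its alternatives, and the expressions above score those alternatives under the adjusted total. A toy rendition with made-up frequencies, mirroring the hunk's expressions (illustrative only, not the trainer itself):

    fn main() {
        // Toy frequencies for three pieces, plus the pieces that would replace
        // piece 0 if it were pruned from the vocabulary.
        let freq = [10.0_f64, 6.0, 4.0];
        let alternatives: Vec<Vec<usize>> = vec![vec![1, 2], vec![], vec![]];
        let sum: f64 = freq.iter().sum();
        let id = 0;

        // Same expressions as in the hunk, applied to the toy data.
        let logsum_alt = (sum + freq[id] * (alternatives.len() - 1) as f64).ln();

        // Each alternative's frequency is increased by freq[id], i.e. the
        // occurrences it would absorb if piece `id` disappeared.
        let mut logprob_alt = 0.0;
        for n in &alternatives[id] {
            logprob_alt += (freq[*n] + freq[id]).ln() - logsum_alt;
        }
        println!("logprob_alt = {logprob_alt:.4}");
    }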

@@ -73,7 +73,7 @@ impl WordLevelBuilder {
         self
     }
-    /// Contructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
+    /// Constructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordLevel> {
         if let Some(vocab) = self.config.files {
             self.config.vocab = WordLevel::read_file(&vocab)?;

@@ -93,7 +93,7 @@ impl WordPieceBuilder {
         self
     }
-    /// Contructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
+    /// Constructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordPiece> {
         if let Some(vocab) = self.config.files {
             self.config.vocab = WordPiece::read_file(&vocab)?;

@@ -170,7 +170,7 @@ impl WordPieceTrainer {
         // Transfer the vocab
         model.vocab = new_wordpiece.vocab;
         model.vocab_r = new_wordpiece.vocab_r;
-        // The continuing_subword_prefix is the only other option to be overriden by the trainer
+        // The continuing_subword_prefix is the only other option to be overridden by the trainer
         model.continuing_subword_prefix = new_wordpiece.continuing_subword_prefix;
         Ok(special_tokens)

@@ -12,7 +12,7 @@ fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &
     transformations.extend(new_part.chars().map(|c| (c, 0)));
     match diff.cmp(&0) {
-        // If we are adding some characters, the last DIFF characters shoud be == 1
+        // If we are adding some characters, the last DIFF characters should be == 1
         Ordering::Greater => {
             transformations
                 .iter_mut()
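
For context, `transformations` pairs every output character with an offset change, and the fixed comment is about marking the trailing added characters with 1 when the replacement is longer than the original. A self-contained toy version of that bookkeeping (the shrinking branch is an assumption added for symmetry, not something this hunk shows):

    /// Pair every char of `new_part` with an offset change: unchanged positions
    /// get 0, and the trailing characters that were added get 1.
    fn char_changes(old_part: &str, new_part: &str) -> Vec<(char, isize)> {
        let diff = new_part.chars().count() as isize - old_part.chars().count() as isize;
        let mut out: Vec<(char, isize)> = new_part.chars().map(|c| (c, 0)).collect();
        if diff > 0 {
            // Adding characters: the last `diff` ones should be == 1.
            for entry in out.iter_mut().rev().take(diff as usize) {
                entry.1 = 1;
            }
        } else if diff < 0 {
            // Removing characters: record the shrinkage on the last surviving one.
            if let Some(last) = out.last_mut() {
                last.1 = diff;
            }
        }
        out
    }

    fn main() {
        // "ab" -> "abcd": prints [('a', 0), ('b', 0), ('c', 1), ('d', 1)]
        println!("{:?}", char_changes("ab", "abcd"));
    }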

@@ -29,7 +29,7 @@ pub struct AddedToken {
 }
 impl AddedToken {
-    /// Build this token from the given content, specifying if it is intented to be a
+    /// Build this token from the given content, specifying if it is intended to be a
     /// special token. Special tokens are not normalized by default.
     pub fn from<S: Into<String>>(content: S, special: bool) -> Self {
         Self {
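
Since the signature is visible right in the hunk, a short usage sketch of this constructor (assumes the `tokenizers` crate as a dependency, its usual root re-export of `AddedToken`, and a public `content` field; illustrative only):

    use tokenizers::AddedToken;

    fn main() {
        // Second argument `true` marks a special token, which is not normalized
        // by default, exactly as the doc comment above states.
        let mask = AddedToken::from("<mask>", true);
        // A plain added token that goes through normalization like regular text.
        let word = AddedToken::from("hello", false);
        println!("{} / {}", mask.content, word.content);
    }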

@@ -389,7 +389,7 @@ where
         self
     }
-    /// Set the trunaction parameters.
+    /// Set the truncation parameters.
     #[must_use]
     pub fn with_truncation(mut self, trunc: Option<TruncationParams>) -> Self {
         self.truncation = trunc;
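
For reference, a hypothetical sketch of what this setter receives (assumes `TruncationParams` is re-exported at the crate root and implements `Default` with a `max_length` field; names may differ between versions):

    use tokenizers::TruncationParams;

    fn main() {
        // Truncate to 512 tokens and keep the remaining fields at their defaults.
        let trunc = TruncationParams {
            max_length: 512,
            ..Default::default()
        };
        println!("max_length = {}", trunc.max_length);
        // The builder above would take it as `.with_truncation(Some(trunc))`.
    }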

@@ -201,9 +201,9 @@ impl NormalizedString {
         });
         match (start, end) {
-            // Targeting inexistant beginning
+            // Targeting inexistent beginning
             (Some(s), None) => Some(s..s),
-            // Targeting inexistant end
+            // Targeting inexistent end
             (None, Some(e)) => Some(e..e),
             // Found the range
             (Some(s), Some(e)) => Some(s..e),

@@ -122,7 +122,7 @@ where
     }
 }
-/// Invert the `is_match` flags for the wrapped Pattern. This is usefull
+/// Invert the `is_match` flags for the wrapped Pattern. This is useful
 /// for example when we use a regex that matches words instead of a delimiter,
 /// and we want to match the delimiter.
 pub struct Invert<P: Pattern>(pub P);
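
To illustrate the fixed sentence: if a pattern reports `(range, is_match)` pairs in which the words are the matches, flipping every flag turns the delimiters into the matches, which is what a splitter needs. A self-contained toy version (not the crate's `Invert` implementation):

    /// Flip the `is_match` flag on every (range, is_match) pair.
    fn invert(found: Vec<((usize, usize), bool)>) -> Vec<((usize, usize), bool)> {
        found.into_iter().map(|(range, is_match)| (range, !is_match)).collect()
    }

    fn main() {
        // "hi, you": a word-matching pattern marks "hi" and "you" as matches.
        let found = vec![((0, 2), true), ((2, 4), false), ((4, 7), true)];
        // After inversion only ", " is a match, i.e. the delimiter to split on.
        println!("{:?}", invert(found));
    }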