mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00

Fix typos in strings and comments (#1770)

@@ -122,7 +122,7 @@ describe('Encoding', () => {
     expect(indexes).toEqual([3, 5])
   })

-  it('returns the corrent indexes with pair sequences', () => {
+  it('returns the correct indexes with pair sequences', () => {
     expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5])
     expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9])
   })

@@ -27,4 +27,4 @@ tempfile = "3.10"
 pyo3 = { version = "0.23", features = ["auto-initialize"] }

 [features]
-defaut = ["pyo3/extension-module"]
+default = ["pyo3/extension-module"]

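Note on the fix above: Cargo features are plain named keys, so the misspelled `defaut` was silently accepted as a separate feature that nothing ever requested, leaving the real `default` feature empty. Until this fix, `pyo3/extension-module` was therefore not enabled by default.
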
@@ -397,7 +397,7 @@ def main():
         "--models",
         type=lambda s: s.split(","),
         default=pretraineds,
-        help=f"The pretrained tokenizers you want to test agains, (default: {pretraineds})",
+        help=f"The pretrained tokenizers you want to test against, (default: {pretraineds})",
     )
     args = parser.parse_args()

@@ -404,7 +404,7 @@ impl PyMetaspaceDec {
 ///
 /// Args:
 ///     suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-///         The suffix that was used to caracterize an end-of-word. This suffix will
+///         The suffix that was used to characterize an end-of-word. This suffix will
 ///         be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
 pub struct PyBPEDecoder {}

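To make the corrected doc-comment concrete, here is a minimal sketch (not the library's actual decoder) of what "replaced by whitespaces during the decoding" means for an end-of-word suffix such as `</w>`:

    // Toy decoder: swap the end-of-word suffix for a space, then trim.
    fn decode_with_suffix(tokens: &[&str], suffix: &str) -> String {
        tokens
            .iter()
            .map(|t| t.replace(suffix, " "))
            .collect::<String>()
            .trim_end()
            .to_string()
    }

    fn main() {
        // "hello</w>" + "wor" + "ld</w>" -> "hello world"
        let tokens = ["hello</w>", "wor", "ld</w>"];
        assert_eq!(decode_with_suffix(&tokens, "</w>"), "hello world");
    }
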
@@ -221,7 +221,7 @@ pub struct BPE {
     pub unk_token: Option<String>,
     /// An optional prefix to use on any subword that exist only behind another one
     pub continuing_subword_prefix: Option<String>,
-    /// An optional suffix to caracterize and end-of-word subword
+    /// An optional suffix to characterize and end-of-word subword
     pub end_of_word_suffix: Option<String>,
     /// Do multiple unk tokens get fused
     pub fuse_unk: bool,

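For readers skimming the struct: a hedged illustration (the helper below is hypothetical, not library code) of what `continuing_subword_prefix` and `end_of_word_suffix` do to the pieces of one split word:

    // Decorate the subwords of a single word the way the two options describe.
    fn decorate(pieces: &[&str], prefix: Option<&str>, suffix: Option<&str>) -> Vec<String> {
        let last = pieces.len() - 1;
        pieces
            .iter()
            .enumerate()
            .map(|(i, p)| {
                let mut s = String::new();
                if i > 0 {
                    // every subword except the first sits "behind another one"
                    s.push_str(prefix.unwrap_or(""));
                }
                s.push_str(p);
                if i == last {
                    // only the final subword marks the end of the word
                    s.push_str(suffix.unwrap_or(""));
                }
                s
            })
            .collect()
    }

    fn main() {
        // WordPiece-style continuing prefix
        assert_eq!(
            decorate(&["un", "believ", "able"], Some("##"), None),
            vec!["un", "##believ", "##able"]
        );
        // BPE-style end-of-word suffix
        assert_eq!(
            decorate(&["un", "believ", "able"], None, Some("</w>")),
            vec!["un", "believ", "able</w>"]
        );
    }
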
@@ -190,7 +190,7 @@ pub struct BpeTrainer {
     pub initial_alphabet: HashSet<char>,
     /// An optional prefix to use on any subword that exist only behind another one
     pub continuing_subword_prefix: Option<String>,
-    /// An optional suffix to caracterize and end-of-word subword
+    /// An optional suffix to characterize and end-of-word subword
     pub end_of_word_suffix: Option<String>,
     /// An optional parameter to limit the max length of any single token
     pub max_token_length: Option<usize>,

@@ -401,7 +401,7 @@ impl UnigramTrainer {

             let logsum_alt = (sum + freq[id] * (alternatives.len() - 1) as f64).ln();

-            // The frequencies of altenatives are increased by freq[i].
+            // The frequencies of alternatives are increased by freq[i].
             let mut logprob_alt = 0.0;
             for n in &alternatives[id] {
                 logprob_alt += (freq[*n] + freq[id]).ln() - logsum_alt;

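Transcribing those two lines into math (my reading of the code, with S = `sum`, f = `freq`, A_id = `alternatives[id]`, and L = `alternatives.len()`): dropping piece `id` shifts its frequency onto each of its alternatives, so the log-probability mass of the alternatives becomes

    \log Z_{\text{alt}} = \ln\bigl(S + f_{id}\,(L - 1)\bigr),
    \qquad
    \log P_{\text{alt}} = \sum_{n \in A_{id}} \Bigl(\ln(f_n + f_{id}) - \log Z_{\text{alt}}\Bigr)
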
@@ -73,7 +73,7 @@ impl WordLevelBuilder {
         self
     }

-    /// Contructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
+    /// Constructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordLevel> {
         if let Some(vocab) = self.config.files {
             self.config.vocab = WordLevel::read_file(&vocab)?;

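A hypothetical usage sketch of the builder; the `files` setter name is inferred from the `self.config.files` read above and is not verified against the full source:

    // Build a WordLevel model from a vocab file via the builder (assumed setter).
    let model = WordLevel::builder()
        .files("vocab.json".into())
        .build()?;
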
@@ -93,7 +93,7 @@ impl WordPieceBuilder {
         self
     }

-    /// Contructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
+    /// Constructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
     pub fn build(mut self) -> Result<WordPiece> {
         if let Some(vocab) = self.config.files {
             self.config.vocab = WordPiece::read_file(&vocab)?;

@@ -170,7 +170,7 @@ impl WordPieceTrainer {
         // Transfer the vocab
         model.vocab = new_wordpiece.vocab;
         model.vocab_r = new_wordpiece.vocab_r;
-        // The continuing_subword_prefix is the only other option to be overriden by the trainer
+        // The continuing_subword_prefix is the only other option to be overridden by the trainer
         model.continuing_subword_prefix = new_wordpiece.continuing_subword_prefix;

         Ok(special_tokens)

@@ -12,7 +12,7 @@ fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &
     transformations.extend(new_part.chars().map(|c| (c, 0)));

     match diff.cmp(&0) {
-        // If we are adding some characters, the last DIFF characters shoud be == 1
+        // If we are adding some characters, the last DIFF characters should be == 1
         Ordering::Greater => {
             transformations
                 .iter_mut()

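A toy version of the rule the corrected comment states, assuming the convention that each `(char, isize)` pair records how much that output character changes the string's length:

    use std::cmp::Ordering;

    // Append the replacement characters with a neutral `0` change, then,
    // if the replacement is longer than what it replaces (diff > 0),
    // mark the last `diff` entries with `1`.
    fn replace_toy(transformations: &mut Vec<(char, isize)>, old_len: usize, new_part: &str) {
        let diff = new_part.chars().count() as isize - old_len as isize;
        transformations.extend(new_part.chars().map(|c| (c, 0)));
        if let Ordering::Greater = diff.cmp(&0) {
            let n = transformations.len();
            for t in &mut transformations[n - diff as usize..] {
                t.1 = 1; // each extra character grows the string by one
            }
        }
    }

    fn main() {
        let mut tr: Vec<(char, isize)> = Vec::new();
        // Replace a 1-char span with "abc": two characters were added.
        replace_toy(&mut tr, 1, "abc");
        assert_eq!(tr, vec![('a', 0), ('b', 1), ('c', 1)]);
    }
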
@@ -29,7 +29,7 @@ pub struct AddedToken {
 }

 impl AddedToken {
-    /// Build this token from the given content, specifying if it is intented to be a
+    /// Build this token from the given content, specifying if it is intended to be a
     /// special token. Special tokens are not normalized by default.
     pub fn from<S: Into<String>>(content: S, special: bool) -> Self {
         Self {
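Usage follows directly from the signature shown in the hunk, for example:

    // A special token (not normalized by default) and an ordinary one.
    let mask = AddedToken::from("[MASK]", true);
    let word = AddedToken::from("hello", false);
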
@@ -389,7 +389,7 @@ where
         self
     }

-    /// Set the trunaction parameters.
+    /// Set the truncation parameters.
     #[must_use]
     pub fn with_truncation(mut self, trunc: Option<TruncationParams>) -> Self {
         self.truncation = trunc;
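A hedged usage sketch, assuming `TruncationParams` implements `Default`; the `max_length` field name is an assumption here, taken from the library's params struct rather than from this hunk:

    // Enable truncation at 128 tokens; pass `None` to disable it entirely.
    let tokenizer = tokenizer.with_truncation(Some(TruncationParams {
        max_length: 128, // assumed field name
        ..Default::default()
    }));
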
@@ -201,9 +201,9 @@ impl NormalizedString {
         });

         match (start, end) {
-            // Targeting inexistant beginning
+            // Targeting inexistent beginning
             (Some(s), None) => Some(s..s),
-            // Targeting inexistant end
+            // Targeting inexistent end
             (None, Some(e)) => Some(e..e),
             // Found the range
             (Some(s), Some(e)) => Some(s..e),

@@ -122,7 +122,7 @@ where
     }
 }

-/// Invert the `is_match` flags for the wrapped Pattern. This is usefull
+/// Invert the `is_match` flags for the wrapped Pattern. This is useful
 /// for example when we use a regex that matches words instead of a delimiter,
 /// and we want to match the delimiter.
 pub struct Invert<P: Pattern>(pub P);

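A simplified sketch of the inversion described above; the `Pattern` trait here is hypothetical (the real trait's signature is not shown in this hunk), but it captures the flag-flipping idea:

    // A match is an offset range plus an `is_match` flag. Inverting flips
    // every flag, so a word-matching regex can be reused to mark the
    // delimiters between words instead.
    trait Pattern {
        fn find_matches(&self, inside: &str) -> Vec<((usize, usize), bool)>;
    }

    struct Invert<P: Pattern>(P);

    impl<P: Pattern> Pattern for Invert<P> {
        fn find_matches(&self, inside: &str) -> Vec<((usize, usize), bool)> {
            self.0
                .find_matches(inside)
                .into_iter()
                .map(|(offsets, is_match)| (offsets, !is_match))
                .collect()
        }
    }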