Fix typos in strings and comments (#1770)
@@ -122,7 +122,7 @@ describe('Encoding', () => {
 expect(indexes).toEqual([3, 5])
 })

-it('returns the corrent indexes with pair sequences', () => {
+it('returns the correct indexes with pair sequences', () => {
 expect(encodingDual.wordToTokens(3, 0)).toEqual([3, 5])
 expect(encodingDual.wordToTokens(3, 1)).toEqual([8, 9])
 })

@@ -27,4 +27,4 @@ tempfile = "3.10"
 pyo3 = { version = "0.23", features = ["auto-initialize"] }

 [features]
-defaut = ["pyo3/extension-module"]
+default = ["pyo3/extension-module"]

@@ -397,7 +397,7 @@ def main():
 "--models",
 type=lambda s: s.split(","),
 default=pretraineds,
-help=f"The pretrained tokenizers you want to test agains, (default: {pretraineds})",
+help=f"The pretrained tokenizers you want to test against, (default: {pretraineds})",
 )
 args = parser.parse_args()

@@ -404,7 +404,7 @@ impl PyMetaspaceDec {
 ///
 /// Args:
 ///     suffix (:obj:`str`, `optional`, defaults to :obj:`</w>`):
-///         The suffix that was used to caracterize an end-of-word. This suffix will
+///         The suffix that was used to characterize an end-of-word. This suffix will
 ///         be replaced by whitespaces during the decoding
 #[pyclass(extends=PyDecoder, module = "tokenizers.decoders", name = "BPEDecoder")]
 pub struct PyBPEDecoder {}

@@ -221,7 +221,7 @@ pub struct BPE {
 pub unk_token: Option<String>,
 /// An optional prefix to use on any subword that exist only behind another one
 pub continuing_subword_prefix: Option<String>,
-/// An optional suffix to caracterize and end-of-word subword
+/// An optional suffix to characterize and end-of-word subword
 pub end_of_word_suffix: Option<String>,
 /// Do multiple unk tokens get fused
 pub fuse_unk: bool,

@@ -190,7 +190,7 @@ pub struct BpeTrainer {
 pub initial_alphabet: HashSet<char>,
 /// An optional prefix to use on any subword that exist only behind another one
 pub continuing_subword_prefix: Option<String>,
-/// An optional suffix to caracterize and end-of-word subword
+/// An optional suffix to characterize and end-of-word subword
 pub end_of_word_suffix: Option<String>,
 /// An optional parameter to limit the max length of any single token
 pub max_token_length: Option<usize>,

@@ -401,7 +401,7 @@ impl UnigramTrainer {

 let logsum_alt = (sum + freq[id] * (alternatives.len() - 1) as f64).ln();

-// The frequencies of altenatives are increased by freq[i].
+// The frequencies of alternatives are increased by freq[i].
 let mut logprob_alt = 0.0;
 for n in &alternatives[id] {
 logprob_alt += (freq[*n] + freq[id]).ln() - logsum_alt;

@@ -73,7 +73,7 @@ impl WordLevelBuilder {
 self
 }

-/// Contructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
+/// Constructs a `WordLevel` model that uses the `WordLevelBuilder`'s configuration.
 pub fn build(mut self) -> Result<WordLevel> {
 if let Some(vocab) = self.config.files {
 self.config.vocab = WordLevel::read_file(&vocab)?;

@@ -93,7 +93,7 @@ impl WordPieceBuilder {
 self
 }

-/// Contructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
+/// Constructs a `WordPiece` model that uses the `WordPieceBuilder`'s configuration.
 pub fn build(mut self) -> Result<WordPiece> {
 if let Some(vocab) = self.config.files {
 self.config.vocab = WordPiece::read_file(&vocab)?;

@@ -170,7 +170,7 @@ impl WordPieceTrainer {
 // Transfer the vocab
 model.vocab = new_wordpiece.vocab;
 model.vocab_r = new_wordpiece.vocab_r;
-// The continuing_subword_prefix is the only other option to be overriden by the trainer
+// The continuing_subword_prefix is the only other option to be overridden by the trainer
 model.continuing_subword_prefix = new_wordpiece.continuing_subword_prefix;

 Ok(special_tokens)

@@ -12,7 +12,7 @@ fn replace(transformations: &mut Vec<(char, isize)>, old_part: &str, new_part: &
 transformations.extend(new_part.chars().map(|c| (c, 0)));

 match diff.cmp(&0) {
-// If we are adding some characters, the last DIFF characters shoud be == 1
+// If we are adding some characters, the last DIFF characters should be == 1
 Ordering::Greater => {
 transformations
 .iter_mut()

@@ -29,7 +29,7 @@ pub struct AddedToken {
 }

 impl AddedToken {
-/// Build this token from the given content, specifying if it is intented to be a
+/// Build this token from the given content, specifying if it is intended to be a
 /// special token. Special tokens are not normalized by default.
 pub fn from<S: Into<String>>(content: S, special: bool) -> Self {
 Self {

@@ -389,7 +389,7 @@ where
 self
 }

-/// Set the trunaction parameters.
+/// Set the truncation parameters.
 #[must_use]
 pub fn with_truncation(mut self, trunc: Option<TruncationParams>) -> Self {
 self.truncation = trunc;

@@ -201,9 +201,9 @@ impl NormalizedString {
 });

 match (start, end) {
-// Targeting inexistant beginning
+// Targeting inexistent beginning
 (Some(s), None) => Some(s..s),
-// Targeting inexistant end
+// Targeting inexistent end
 (None, Some(e)) => Some(e..e),
 // Found the range
 (Some(s), Some(e)) => Some(s..e),

@@ -122,7 +122,7 @@ where
 }
 }

-/// Invert the `is_match` flags for the wrapped Pattern. This is usefull
+/// Invert the `is_match` flags for the wrapped Pattern. This is useful
 /// for example when we use a regex that matches words instead of a delimiter,
 /// and we want to match the delimiter.
 pub struct Invert<P: Pattern>(pub P);