mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)

style: simplify string formatting for readability (#1632)
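Every hunk in this commit makes the same change: positional format!/println! arguments are replaced with Rust's inline format arguments (identifiers captured directly inside the braces, stable since Rust 1.58). A minimal standalone sketch of the before/after, using illustrative variable names rather than code from the repository:

    fn main() {
        let word = "tokenizer";
        let byte = 0x2Au8;

        // Old style: the value is passed as a positional argument.
        let old_word = format!("▁{}", word);
        let old_code = format!("<0x{:02X}>", byte);

        // New style: the identifier is captured inside the braces.
        // Format specs such as :02X or :? still follow the name.
        let new_word = format!("▁{word}");
        let new_code = format!("<0x{byte:02X}>");

        // The rendered strings are identical; only the call sites get shorter.
        assert_eq!(old_word, new_word);
        assert_eq!(old_code, new_code);
        println!("{new_word} {new_code}");
    }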

@@ -21,7 +21,7 @@ pub fn bench_train(c: &mut Criterion) {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });

@@ -49,7 +49,7 @@ pub fn bench_train(c: &mut Criterion) {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });

@@ -8,7 +8,7 @@ fn main() {
     // Mix special and not special
     // You can make sure ids are in order, and special status is correct.
     let tokens: Vec<_> = (0..120_000)
-        .map(|i| AddedToken::from(format!("[SPECIAL_{}]", i), i % 2 == 0))
+        .map(|i| AddedToken::from(format!("[SPECIAL_{i}]"), i % 2 == 0))
         .collect();
     tokenizer.add_tokens(&tokens);
     tokenizer.save("_tok.json", true).unwrap();

@@ -53,7 +53,7 @@ impl Decoder for WordPiece {
             if token.starts_with(&self.prefix) {
                 *token = token.replacen(&self.prefix, "", 1);
             } else {
-                *token = format!(" {}", token);
+                *token = format!(" {token}");
             }
         }
         if self.cleanup {

@@ -385,13 +385,13 @@ impl BPE {
         // Add the `continuing_subword_prefix` if relevant
         if !is_first {
             if let Some(ref prefix) = self.continuing_subword_prefix {
-                s = format!("{}{}", prefix, s).into()
+                s = format!("{prefix}{s}").into()
             }
         }
         // Add the `end_of_word_suffix` if relevant
         if is_last {
             if let Some(ref suffix) = self.end_of_word_suffix {
-                s = format!("{}{}", s, suffix).into()
+                s = format!("{s}{suffix}").into()
             }
         }

@@ -406,7 +406,7 @@ impl BPE {
         let tokens: Option<Vec<_>> = s
             .bytes()
             .map(|b| -> Option<&u32> {
-                let code = format!("<{:#04X}>", b);
+                let code = format!("<{b:#04X}>");

                 self.vocab.get(&code)
             })

@@ -515,7 +515,7 @@ impl Model for BPE {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.json", name),
+            Some(name) => format!("{name}-vocab.json"),
             None => "vocab.json".to_string(),
         };

@@ -530,7 +530,7 @@ impl Model for BPE {

         // Write merges.txt
         let merges_file_name = match name {
-            Some(name) => format!("{}-merges.txt", name),
+            Some(name) => format!("{name}-merges.txt"),
             None => "merges.txt".to_string(),
         };

@@ -342,13 +342,13 @@ impl BpeTrainer {
             // Add the `continuing_subword_prefix` if relevant
             if !is_first {
                 if let Some(prefix) = &self.continuing_subword_prefix {
-                    s = format!("{}{}", prefix, s);
+                    s = format!("{prefix}{s}");
                 }
             }
             // Add the `end_of_word_suffix` if relevant
             if is_last {
                 if let Some(suffix) = &self.end_of_word_suffix {
-                    s = format!("{}{}", s, suffix);
+                    s = format!("{s}{suffix}");
                 }
             }

@@ -513,7 +513,7 @@ impl BpeTrainer {
                     part_b = part_b[prefix_byte_len..].to_string();
                 }
             }
-            let new_token = format!("{}{}", part_a, part_b);
+            let new_token = format!("{part_a}{part_b}");
            // implement sentencepiece-like merge.
            // if this code were to be merged, integrate a way in the python bindings to communicate this variable
            // default should be 0/None to maintain previous behavior. 16 is the spm default.

@@ -51,7 +51,7 @@ impl<'a> Serialize for OrderedVocabIter<'a> {

         if !holes.is_empty() {
             warn!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
-            println!("The OrderedVocab you are attempting to save contains holes for indices {:?}, your vocabulary could be corrupted !", holes);
+            println!("The OrderedVocab you are attempting to save contains holes for indices {holes:?}, your vocabulary could be corrupted !");
         }
         result
     }

@@ -425,7 +425,7 @@ impl Model for Unigram {
             let byte_tokens: Option<Vec<_>> = string
                 .bytes()
                 .map(|byte| -> Option<Token> {
-                    let byte_string = format!("<0x{:02X}>", byte);
+                    let byte_string = format!("<0x{byte:02X}>");
                     let id = self.token_to_ids.get(&byte_string);
                     id.map(|id| Token::new(*id, byte_string, (offset, offset + len)))
                 })

@@ -457,7 +457,7 @@ impl Model for Unigram {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let name = match name {
-            Some(name) => format!("{}-unigram.json", name),
+            Some(name) => format!("{name}-unigram.json"),
             None => "unigram.json".to_string(),
         };
         let mut fullpath = PathBuf::new();

@@ -568,7 +568,7 @@ mod tests {

         for is_optimized in &[true, false] {
             model.set_optimized(*is_optimized);
-            println!("IsOptimized {:?}", is_optimized);
+            println!("IsOptimized {is_optimized:?}");
             assert_eq!(model.encode("abc").unwrap(), vec!["abc"]);
             assert_eq!(model.encode("AB").unwrap(), vec!["AB"]);

@@ -70,7 +70,7 @@ impl<'de> Visitor<'de> for UnigramVisitor {
         }
         match (vocab, unk_id, byte_fallback) {
             (Some(vocab), unk_id, byte_fallback) => Ok(Unigram::from(vocab, unk_id, byte_fallback)
-                .map_err(|err| Error::custom(format!("Unable to load vocab {:?}", err)))?),
+                .map_err(|err| Error::custom(format!("Unable to load vocab {err:?}")))?),
             (None, _, _) => Err(Error::custom("Missing vocab")),
         }
     }

@@ -194,7 +194,7 @@ impl Model for WordLevel {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.json", name),
+            Some(name) => format!("{name}-vocab.json"),
             None => "vocab.json".to_string(),
         };

@@ -271,7 +271,7 @@ impl Model for WordPiece {

     fn save(&self, folder: &Path, name: Option<&str>) -> Result<Vec<PathBuf>> {
         let vocab_file_name = match name {
-            Some(name) => format!("{}-vocab.txt", name),
+            Some(name) => format!("{name}-vocab.txt"),
             None => "vocab.txt".to_string(),
         };

@@ -285,7 +285,7 @@ impl Model for WordPiece {
         vocab_file.write_all(
             &vocab
                 .into_iter()
-                .flat_map(|(token, _)| format!("{}\n", token).as_bytes().to_owned())
+                .flat_map(|(token, _)| format!("{token}\n").as_bytes().to_owned())
                 .collect::<Vec<_>>()[..],
         )?;

@@ -150,7 +150,7 @@ impl TryFrom<String> for Piece {
     fn try_from(s: String) -> StdResult<Self, Self::Error> {
         let parts = s.split(':').collect::<Vec<_>>();

-        let err = || format!("Cannot build Piece from string \"{}\"", s);
+        let err = || format!("Cannot build Piece from string \"{s}\"");
         match parts.as_slice() {
             [id, type_id] => {
                 let type_id: u32 = type_id.parse().map_err(|_| err())?;

@@ -351,7 +351,7 @@ impl NormalizedString {
                 match changes {
                     0 => "Replacing".into(),
                     ch if ch > 0 => "Adding".into(),
-                    ch if ch < 0 => format!("Replacing + removing {} following chars", ch),
+                    ch if ch < 0 => format!("Replacing + removing {ch} following chars"),
                     _ => "Undefined".into(),
                 },
                 offset

@@ -116,7 +116,7 @@ where
             "version" => {
                 let v: String = map.next_value()?;
                 if &v != "1.0" {
-                    return Err(Error::custom(format!("Unknown tokenizer version '{}'", v)));
+                    return Err(Error::custom(format!("Unknown tokenizer version '{v}'")));
                 }
             }
             "truncation" => {

@@ -199,7 +199,7 @@ fn quicktour() -> tokenizers::Result<()> {
     // START quicktour_encode_batch
     let output = tokenizer.encode_batch(vec!["Hello, y'all!", "How are you 😁 ?"], true)?;
     // END quicktour_encode_batch
-    println!("{:?}", output);
+    println!("{output:?}");
     // START quicktour_encode_batch_pair
     let output = tokenizer.encode_batch(
         vec![

@@ -209,7 +209,7 @@ fn quicktour() -> tokenizers::Result<()> {
         true,
     )?;
     // END quicktour_encode_batch_pair
-    println!("{:?}", output);
+    println!("{output:?}");
     // START quicktour_enable_padding
     use tokenizers::PaddingParams;

@@ -350,7 +350,7 @@ fn pipeline() -> tokenizers::Result<()> {
         &[1, 27253, 16, 93, 11, 5097, 5, 7961, 5112, 6218, 0, 35, 2],
         true,
     )?;
-    println!("{}", decoded);
+    println!("{decoded}");
     // "Hello , y ' all ! How are you ?"
     // END pipeline_test_decoding

@@ -436,7 +436,7 @@ fn pipeline_bert() -> tokenizers::Result<()> {
     // ["[CLS]", "welcome", "to", "the", "[UNK]", "tok", "##eni", "##zer", "##s", "library", ".", "[SEP]"]

     let decoded = bert_tokenizer.decode(output.get_ids(), true)?;
-    println!("{}", decoded);
+    println!("{decoded}");
     // "welcome to the tok ##eni ##zer ##s library ."
     // END bert_test_decoding
     assert_eq!(

@@ -44,7 +44,7 @@ fn test_train_unigram_from_file() {
     let mut word_counts = HashMap::new();
     content.split_whitespace().for_each(|word| {
         // This is important for the test of char vs u8
-        let word = format!("▁{}", word);
+        let word = format!("▁{word}");
         *word_counts.entry(word).or_insert(0) += 1;
     });
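
One caveat when applying this style mechanically (a general rule of Rust format strings, not something stated in the commit): only bare identifiers can be captured inside the braces. Method calls, field accesses, and other expressions still need an explicit argument, which is why the hunks above only touch calls whose arguments are plain variables. A small illustrative sketch:

    fn main() {
        let parts = vec!["a", "b"];
        let n = parts.len();
        let captured = format!("{n} parts");               // bare identifier: can be captured
        let positional = format!("{} parts", parts.len()); // expression: must stay positional
        assert_eq!(captured, positional);
    }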