From 69212e17e9853bb3fe70829e19a04a9755f59211 Mon Sep 17 00:00:00 2001
From: epwalsh
Date: Thu, 19 Dec 2019 15:07:27 -0800
Subject: [PATCH] formatting

---
 tokenizers/src/models/bpe/model.rs   | 4 ++++
 tokenizers/src/models/bpe/trainer.rs | 7 +++++--
 tokenizers/src/models/bpe/word.rs    | 5 +++--
 3 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/tokenizers/src/models/bpe/model.rs b/tokenizers/src/models/bpe/model.rs
index be4b77a2..c938227e 100644
--- a/tokenizers/src/models/bpe/model.rs
+++ b/tokenizers/src/models/bpe/model.rs
@@ -202,6 +202,7 @@ mod tests {
     use tempfile::NamedTempFile;
 
     #[test]
+    // Ensure `BPE::from_files` works as expected.
     fn test_bpe_from_files() {
         // Set up vocab file.
         let mut vocab_file = NamedTempFile::new().unwrap();
@@ -224,6 +225,7 @@
     }
 
     #[test]
+    // Ensure `MergeTokenOutOfVocabulary` error is returned when it should be.
     fn test_bpe_from_files_merge_token_oov() {
         // Set up vocab file.
         let mut vocab_file = NamedTempFile::new().unwrap();
@@ -253,6 +255,8 @@
     }
 
     #[test]
+    // Ensure `BadMerges` error is returned when there is an invalid line in the
+    // merges.txt file.
     fn test_bpe_from_files_bad_merges() {
         // Set up vocab file.
         let mut vocab_file = NamedTempFile::new().unwrap();
diff --git a/tokenizers/src/models/bpe/trainer.rs b/tokenizers/src/models/bpe/trainer.rs
index 88769606..e4c4741a 100644
--- a/tokenizers/src/models/bpe/trainer.rs
+++ b/tokenizers/src/models/bpe/trainer.rs
@@ -13,14 +13,15 @@ use std::{
 };
 
 pub struct BpeTrainerConfig {
-    vocab_size: usize,
     min_frequency: u32,
+    vocab_size: usize,
 }
+
 impl BpeTrainerConfig {
     pub fn new(min_frequency: u32, vocab_size: usize) -> Self {
         BpeTrainerConfig {
-            vocab_size,
             min_frequency,
+            vocab_size,
         }
     }
 
@@ -32,12 +33,14 @@ impl BpeTrainerConfig {
         self.min_frequency = value;
     }
 }
+
 impl Default for BpeTrainerConfig {
     fn default() -> Self {
         BpeTrainerConfig::new(0, 30000)
     }
 }
 
+#[derive(Default)]
 pub struct BpeTrainer {
     // Training parameters
     config: BpeTrainerConfig,
diff --git a/tokenizers/src/models/bpe/word.rs b/tokenizers/src/models/bpe/word.rs
index b61b5b49..b853290f 100644
--- a/tokenizers/src/models/bpe/word.rs
+++ b/tokenizers/src/models/bpe/word.rs
@@ -5,6 +5,7 @@ pub struct Word {
     chars: Vec<char>,
     sizes: Vec<usize>,
 }
+
 impl Word {
     pub fn new() -> Self {
         Word {
@@ -109,8 +110,8 @@ mod tests {
         // training. This merge affects the counts for the pairs
         // ('e', 'l') ~= (1, 2),
         // ('e', 'll') ~= (1, 4),
-        // ('ll', 'o') ~= (4, 3), and
-        // ('l', 'o') ~= (2, 3).
+        // ('l', 'o') ~= (2, 3), and
+        // ('ll', 'o') ~= (4, 3).
         // So the changes should reflect that:
         assert_eq!(
             changes,