mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
formatting
This commit is contained in:
@ -202,6 +202,7 @@ mod tests {
|
||||
use tempfile::NamedTempFile;
|
||||
|
||||
#[test]
|
||||
// Ensure `BPE::from_files` works as expected.
|
||||
fn test_bpe_from_files() {
|
||||
// Set up vocab file.
|
||||
let mut vocab_file = NamedTempFile::new().unwrap();
|
||||
@ -224,6 +225,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
// Ensure `MergeTokenOutOfVocabulary` error is returned when it should be.
|
||||
fn test_bpe_from_files_merge_token_oov() {
|
||||
// Set up vocab file.
|
||||
let mut vocab_file = NamedTempFile::new().unwrap();
|
||||
@ -253,6 +255,8 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
// Ensure `BadMerges` error is returned when there is an invalid line in the
|
||||
// merges.txt file.
|
||||
fn test_bpe_from_files_bad_merges() {
|
||||
// Set up vocab file.
|
||||
let mut vocab_file = NamedTempFile::new().unwrap();
|
||||
|
@ -13,14 +13,15 @@ use std::{
|
||||
};
|
||||
|
||||
pub struct BpeTrainerConfig {
|
||||
vocab_size: usize,
|
||||
min_frequency: u32,
|
||||
vocab_size: usize,
|
||||
}
|
||||
|
||||
impl BpeTrainerConfig {
|
||||
pub fn new(min_frequency: u32, vocab_size: usize) -> Self {
|
||||
BpeTrainerConfig {
|
||||
vocab_size,
|
||||
min_frequency,
|
||||
vocab_size,
|
||||
}
|
||||
}
|
||||
|
||||
@ -32,12 +33,14 @@ impl BpeTrainerConfig {
|
||||
self.min_frequency = value;
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for BpeTrainerConfig {
|
||||
fn default() -> Self {
|
||||
BpeTrainerConfig::new(0, 30000)
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct BpeTrainer {
|
||||
// Training parameters
|
||||
config: BpeTrainerConfig,
|
||||
|
@ -5,6 +5,7 @@ pub struct Word {
|
||||
chars: Vec<u32>,
|
||||
sizes: Vec<usize>,
|
||||
}
|
||||
|
||||
impl Word {
|
||||
pub fn new() -> Self {
|
||||
Word {
|
||||
@ -109,8 +110,8 @@ mod tests {
|
||||
// training. This merge affects the counts for the pairs
|
||||
// ('e', 'l') ~= (1, 2),
|
||||
// ('e', 'll') ~= (1, 4),
|
||||
// ('ll', 'o') ~= (4, 3), and
|
||||
// ('l', 'o') ~= (2, 3).
|
||||
// ('l', 'o') ~= (2, 3), and
|
||||
// ('ll', 'o') ~= (4, 3).
|
||||
// So the changes should reflect that:
|
||||
assert_eq!(
|
||||
changes,
|
||||
|
Reference in New Issue
Block a user