Mirror of https://github.com/mii443/tokenizers.git, synced 2025-08-22 16:25:30 +00:00
formatting
@@ -202,6 +202,7 @@ mod tests {
     use tempfile::NamedTempFile;
 
     #[test]
+    // Ensure `BPE::from_files` works as expected.
     fn test_bpe_from_files() {
         // Set up vocab file.
         let mut vocab_file = NamedTempFile::new().unwrap();
@@ -224,6 +225,7 @@ mod tests {
     }
 
     #[test]
+    // Ensure `MergeTokenOutOfVocabulary` error is returned when it should be.
     fn test_bpe_from_files_merge_token_oov() {
         // Set up vocab file.
         let mut vocab_file = NamedTempFile::new().unwrap();
@@ -253,6 +255,8 @@ mod tests {
     }
 
     #[test]
+    // Ensure `BadMerges` error is returned when there is an invalid line in the
+    // merges.txt file.
     fn test_bpe_from_files_bad_merges() {
         // Set up vocab file.
         let mut vocab_file = NamedTempFile::new().unwrap();
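For reference, the three tests touched above share the same setup: write a tiny vocab and merges file to disk with `NamedTempFile`, load them through `BPE::from_files`, and assert on the result (or on the `MergeTokenOutOfVocabulary` / `BadMerges` error). The sketch below only reconstructs that setup and requires the tempfile crate; the JSON vocab layout, the merges format, and the exact `BPE::from_files` signature are assumptions, not the repository's code.

use std::io::Write;
use tempfile::NamedTempFile;

fn main() -> std::io::Result<()> {
    // Set up vocab file: token -> id mapping (assumed to be JSON).
    let mut vocab_file = NamedTempFile::new()?;
    vocab_file.write_all(br#"{"a": 0, "b": 1, "ab": 2}"#)?;

    // Set up merges file: one merge rule per line (assumed "left right" format).
    let mut merges_file = NamedTempFile::new()?;
    merges_file.write_all(b"a b\n")?;

    // A test would then load the model from the two paths, roughly:
    //     let bpe = BPE::from_files(vocab_file.path().to_str().unwrap(),
    //                               merges_file.path().to_str().unwrap());
    // and match on the Ok / error variant it expects.
    Ok(())
}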
@@ -13,14 +13,15 @@ use std::{
 };
 
 pub struct BpeTrainerConfig {
-    vocab_size: usize,
     min_frequency: u32,
+    vocab_size: usize,
 }
 
 impl BpeTrainerConfig {
     pub fn new(min_frequency: u32, vocab_size: usize) -> Self {
         BpeTrainerConfig {
-            vocab_size,
             min_frequency,
+            vocab_size,
         }
     }
@@ -32,12 +33,14 @@ impl BpeTrainerConfig {
         self.min_frequency = value;
     }
 }
 
 impl Default for BpeTrainerConfig {
     fn default() -> Self {
         BpeTrainerConfig::new(0, 30000)
     }
 }
 
+#[derive(Default)]
 pub struct BpeTrainer {
     // Training parameters
     config: BpeTrainerConfig,
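The field reordering in `BpeTrainerConfig` is purely cosmetic, but the `#[derive(Default)]` added to `BpeTrainer` only compiles because every field of `BpeTrainer` implements `Default`; for `BpeTrainerConfig` that is the hand-written impl above, which defers to `new(0, 30000)`. A minimal self-contained sketch of that relationship, with field names and defaults taken from the diff and everything else trimmed or assumed:

pub struct BpeTrainerConfig {
    min_frequency: u32,
    vocab_size: usize,
}

impl BpeTrainerConfig {
    pub fn new(min_frequency: u32, vocab_size: usize) -> Self {
        BpeTrainerConfig {
            min_frequency,
            vocab_size,
        }
    }
}

impl Default for BpeTrainerConfig {
    fn default() -> Self {
        // min_frequency = 0, vocab_size = 30000, as in the diff above.
        BpeTrainerConfig::new(0, 30000)
    }
}

// `#[derive(Default)]` works because `BpeTrainerConfig: Default`;
// the real struct has more fields, omitted in this sketch.
#[derive(Default)]
pub struct BpeTrainer {
    config: BpeTrainerConfig,
}

fn main() {
    let trainer = BpeTrainer::default();
    assert_eq!(trainer.config.min_frequency, 0);
    assert_eq!(trainer.config.vocab_size, 30000);
}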
@@ -5,6 +5,7 @@ pub struct Word {
     chars: Vec<u32>,
     sizes: Vec<usize>,
 }
 
 impl Word {
     pub fn new() -> Self {
         Word {
@@ -109,8 +110,8 @@ mod tests {
         // training. This merge affects the counts for the pairs
         // ('e', 'l') ~= (1, 2),
         // ('e', 'll') ~= (1, 4),
-        // ('ll', 'o') ~= (4, 3), and
-        // ('l', 'o') ~= (2, 3).
+        // ('l', 'o') ~= (2, 3), and
+        // ('ll', 'o') ~= (4, 3).
         // So the changes should reflect that:
         assert_eq!(
             changes,
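The reordered comment simply lines the pairs up with the order in which the deltas are asserted; the arithmetic itself is easy to check by hand. Merging ('l', 'l') into 'll' inside "hello" removes one occurrence each of ('e', 'l') and ('l', 'o') and introduces ('e', 'll') and ('ll', 'o'). The toy function below is not the crate's `Word::merge`, just a self-contained illustration of that bookkeeping, with ids assumed from the comment ('h' = 0, 'e' = 1, 'l' = 2, 'o' = 3, 'll' = 4):

use std::collections::HashMap;

// Count the change in adjacent-pair frequencies between two symbol sequences.
fn pair_changes(before: &[u32], after: &[u32]) -> Vec<((u32, u32), i32)> {
    let mut counts: HashMap<(u32, u32), i32> = HashMap::new();
    for w in before.windows(2) {
        *counts.entry((w[0], w[1])).or_insert(0) -= 1;
    }
    for w in after.windows(2) {
        *counts.entry((w[0], w[1])).or_insert(0) += 1;
    }
    counts.into_iter().filter(|&(_, d)| d != 0).collect()
}

fn main() {
    // "hello" as ids, before and after merging ('l', 'l') -> 'll'.
    let before = [0, 1, 2, 2, 3];
    let after = [0, 1, 4, 3];
    let mut changes = pair_changes(&before, &after);
    changes.sort();
    // ('e', 'l') and ('l', 'o') each lose a count, ('e', 'll') and ('ll', 'o')
    // each gain one, and the merged pair ('l', 'l') disappears entirely.
    assert_eq!(
        changes,
        vec![
            ((1, 2), -1),
            ((1, 4), 1),
            ((2, 2), -1),
            ((2, 3), -1),
            ((4, 3), 1),
        ]
    );
}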