mirror of
https://github.com/mii443/tokenizers.git
synced 2025-12-03 19:28:20 +00:00
Use train_from_files in benchmarks
This commit is contained in:
@@ -70,7 +70,7 @@ pub fn bench_bert(c: &mut Criterion) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn bench_train(c: &mut Criterion) {
|
fn bench_train(c: &mut Criterion) {
|
||||||
let trainer = WordPieceTrainerBuilder::default()
|
let mut trainer = WordPieceTrainerBuilder::default()
|
||||||
.show_progress(false)
|
.show_progress(false)
|
||||||
.build();
|
.build();
|
||||||
type Tok = TokenizerImpl<
|
type Tok = TokenizerImpl<
|
||||||
@@ -87,7 +87,7 @@ fn bench_train(c: &mut Criterion) {
|
|||||||
iter_bench_train(
|
iter_bench_train(
|
||||||
iters,
|
iters,
|
||||||
&mut tokenizer,
|
&mut tokenizer,
|
||||||
&trainer,
|
&mut trainer,
|
||||||
vec!["data/small.txt".to_string()],
|
vec!["data/small.txt".to_string()],
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
@@ -100,7 +100,7 @@ fn bench_train(c: &mut Criterion) {
|
|||||||
iter_bench_train(
|
iter_bench_train(
|
||||||
iters,
|
iters,
|
||||||
&mut tokenizer,
|
&mut tokenizer,
|
||||||
&trainer,
|
&mut trainer,
|
||||||
vec!["data/big.txt".to_string()],
|
vec!["data/big.txt".to_string()],
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -69,7 +69,7 @@ fn bench_gpt2(c: &mut Criterion) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn bench_train(c: &mut Criterion) {
|
fn bench_train(c: &mut Criterion) {
|
||||||
let trainer: TrainerWrapper = BpeTrainerBuilder::default()
|
let mut trainer: TrainerWrapper = BpeTrainerBuilder::default()
|
||||||
.show_progress(false)
|
.show_progress(false)
|
||||||
.build()
|
.build()
|
||||||
.into();
|
.into();
|
||||||
@@ -80,7 +80,7 @@ fn bench_train(c: &mut Criterion) {
|
|||||||
iter_bench_train(
|
iter_bench_train(
|
||||||
iters,
|
iters,
|
||||||
&mut tokenizer,
|
&mut tokenizer,
|
||||||
&trainer,
|
&mut trainer,
|
||||||
vec!["data/small.txt".to_string()],
|
vec!["data/small.txt".to_string()],
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
@@ -93,7 +93,7 @@ fn bench_train(c: &mut Criterion) {
|
|||||||
iter_bench_train(
|
iter_bench_train(
|
||||||
iters,
|
iters,
|
||||||
&mut tokenizer,
|
&mut tokenizer,
|
||||||
&trainer,
|
&mut trainer,
|
||||||
vec!["data/big.txt".to_string()],
|
vec!["data/big.txt".to_string()],
|
||||||
)
|
)
|
||||||
})
|
})
|
||||||
|
|||||||
@@ -61,7 +61,7 @@ where
|
|||||||
pub fn iter_bench_train<T, M, N, PT, PP, D>(
|
pub fn iter_bench_train<T, M, N, PT, PP, D>(
|
||||||
iters: u64,
|
iters: u64,
|
||||||
tokenizer: &mut TokenizerImpl<M, N, PT, PP, D>,
|
tokenizer: &mut TokenizerImpl<M, N, PT, PP, D>,
|
||||||
trainer: &T,
|
trainer: &mut T,
|
||||||
files: Vec<String>,
|
files: Vec<String>,
|
||||||
) -> Duration
|
) -> Duration
|
||||||
where
|
where
|
||||||
@@ -75,7 +75,7 @@ where
|
|||||||
let mut duration = Duration::new(0, 0);
|
let mut duration = Duration::new(0, 0);
|
||||||
for _i in 0..iters {
|
for _i in 0..iters {
|
||||||
let start = Instant::now();
|
let start = Instant::now();
|
||||||
tokenizer.train(trainer, files.clone()).unwrap();
|
tokenizer.train_from_files(trainer, files.clone()).unwrap();
|
||||||
duration = duration.checked_add(start.elapsed()).unwrap();
|
duration = duration.checked_add(start.elapsed()).unwrap();
|
||||||
}
|
}
|
||||||
duration
|
duration
|
||||||
|
|||||||
Reference in New Issue
Block a user