mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
add benchmark on many batches
This commit is contained in:
@ -57,6 +57,7 @@ fn line_to_input(line: io::Result<String>) -> EncodeInput {
|
|||||||
fn bench_gpt2_encode(c: &mut Criterion) {
|
fn bench_gpt2_encode(c: &mut Criterion) {
|
||||||
let bpe = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt").unwrap();
|
let bpe = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt").unwrap();
|
||||||
|
|
||||||
|
// Benchmarks encoding a single input from a fresh tokenizer.
|
||||||
c.bench_function("BPE GPT2 encode", |b| {
|
c.bench_function("BPE GPT2 encode", |b| {
|
||||||
b.iter_batched(
|
b.iter_batched(
|
||||||
|| {
|
|| {
|
||||||
@ -70,6 +71,7 @@ fn bench_gpt2_encode(c: &mut Criterion) {
|
|||||||
)
|
)
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Benchmarks encoding many inputs on a single tokenizer.
|
||||||
c.bench_function("BPE GPT2 encode many", |b| {
|
c.bench_function("BPE GPT2 encode many", |b| {
|
||||||
b.iter_custom(|iters| {
|
b.iter_custom(|iters| {
|
||||||
let tokenizer = create_gpt2_tokenizer(&bpe);
|
let tokenizer = create_gpt2_tokenizer(&bpe);
|
||||||
@ -104,6 +106,7 @@ fn bench_gpt2_encode_batch(c: &mut Criterion) {
|
|||||||
.map(line_to_input)
|
.map(line_to_input)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
|
// Benchmarks encoding a single big batch on a new tokenizer.
|
||||||
c.bench_function("BPE GPT2 encode batch", |b| {
|
c.bench_function("BPE GPT2 encode batch", |b| {
|
||||||
b.iter_batched(
|
b.iter_batched(
|
||||||
|| (create_gpt2_tokenizer(&bpe), lines.clone()),
|
|| (create_gpt2_tokenizer(&bpe), lines.clone()),
|
||||||
@ -111,6 +114,39 @@ fn bench_gpt2_encode_batch(c: &mut Criterion) {
|
|||||||
BatchSize::LargeInput,
|
BatchSize::LargeInput,
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Benchmarks encoding many smaller batches on a single tokenizer.
|
||||||
|
c.bench_function("BPE GPT2 encode batch many", |b| {
|
||||||
|
b.iter_custom(|iters| {
|
||||||
|
let tokenizer = create_gpt2_tokenizer(&bpe);
|
||||||
|
let batch_size: usize = 1000;
|
||||||
|
|
||||||
|
// Collect lines into batches of size `batch_size`.
|
||||||
|
let mut batches: Vec<Vec<EncodeInput>> = vec![vec![]];
|
||||||
|
for line in BufReader::new(File::open(Path::new("benches/big.txt")).unwrap())
|
||||||
|
.lines()
|
||||||
|
.map(line_to_input)
|
||||||
|
{
|
||||||
|
if batches.last().unwrap().len() >= batch_size {
|
||||||
|
batches.push(vec![]);
|
||||||
|
}
|
||||||
|
batches.last_mut().unwrap().push(line);
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut duration = Duration::new(0, 0);
|
||||||
|
let mut batch_index: usize = 0;
|
||||||
|
for _i in 0..iters {
|
||||||
|
if batch_index >= batches.len() {
|
||||||
|
batch_index = 0;
|
||||||
|
}
|
||||||
|
let batch = batches[batch_index].clone();
|
||||||
|
let start = Instant::now();
|
||||||
|
let _ = black_box(tokenizer.encode_batch(batch));
|
||||||
|
duration = duration.checked_add(start.elapsed()).unwrap();
|
||||||
|
}
|
||||||
|
duration
|
||||||
|
})
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
criterion_group! {
|
criterion_group! {
|
||||||
|
Reference in New Issue
Block a user