add no cache benchmarks
@@ -12,11 +12,7 @@ use tokenizers::tokenizer::{AddedToken, EncodeInput, Tokenizer};
 static BATCH_SIZE: usize = 1_000;
 
-fn create_gpt2_tokenizer() -> Tokenizer {
-    let bpe = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt")
-        .unwrap()
-        .build()
-        .unwrap();
+fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer {
     let mut tokenizer = Tokenizer::new(Box::new(bpe));
     tokenizer.with_pre_tokenizer(Box::new(ByteLevel::new(true)));
     tokenizer.with_decoder(Box::new(ByteLevel::new(false)));
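
The first hunk inverts the dependency: create_gpt2_tokenizer no longer builds its own BPE but accepts one, so each benchmark can configure the model (in particular its cache) before wrapping it in a Tokenizer. A minimal sketch of the resulting call pattern, using only the builder calls that appear in this diff:

    use tokenizers::models::bpe::BPE;

    // Default (cached) model, as built in bench_gpt2 below.
    let bpe = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt")
        .unwrap() // from_files returns the builder wrapped in a Result
        .build()
        .unwrap();
    let tokenizer = create_gpt2_tokenizer(bpe);

    // Uncached model: identical except the merge cache is sized to zero.
    let bpe_no_cache = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt")
        .unwrap()
        .cache_capacity(0)
        .build()
        .unwrap();
    let tokenizer_no_cache = create_gpt2_tokenizer(bpe_no_cache);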
@@ -38,7 +34,11 @@ fn line_to_input(line: io::Result<String>) -> EncodeInput {
 }
 
 fn bench_gpt2(c: &mut Criterion) {
-    let tokenizer = create_gpt2_tokenizer();
+    let bpe = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt")
+        .unwrap()
+        .build()
+        .unwrap();
+    let tokenizer = create_gpt2_tokenizer(bpe);
     let mut lines: Vec<EncodeInput> = vec![];
     let mut batches: Vec<Vec<EncodeInput>> = vec![vec![]];
     for line in BufReader::new(File::open(Path::new("benches/big.txt")).unwrap())
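
Both the existing and the new uncached benchmarks use Criterion's b.iter_custom, which passes the closure a requested iteration count and takes back the measured Duration; this lets the loop clone each EncodeInput outside the timed window so that only the encode call itself is measured. A self-contained sketch of that timing pattern, factored into a hypothetical helper (iter_bench_encode is not part of this commit, and the explicit line_index advance is an assumption of the sketch):

    use std::time::{Duration, Instant};
    use criterion::black_box;
    use tokenizers::tokenizer::{EncodeInput, Tokenizer};

    // Hypothetical helper mirroring the timed loops in this diff: cycle through
    // the pre-loaded lines, cloning outside the timed window and accumulating
    // only the time spent inside `encode`.
    fn iter_bench_encode(iters: u64, tokenizer: &Tokenizer, lines: &[EncodeInput]) -> Duration {
        let mut duration = Duration::new(0, 0);
        let mut line_index: usize = 0;
        for _ in 0..iters {
            if line_index >= lines.len() {
                line_index = 0;
            }
            let input = lines[line_index].clone();
            line_index += 1; // assumption: advance so iterations cover different lines
            let start = Instant::now();
            let _ = black_box(tokenizer.encode(input));
            duration = duration.checked_add(start.elapsed()).unwrap();
        }
        duration
    }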
@@ -85,6 +85,47 @@ fn bench_gpt2(c: &mut Criterion) {
             duration
         })
     });
+
+    let bpe = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt")
+        .unwrap()
+        .cache_capacity(0)
+        .build()
+        .unwrap();
+    let tokenizer = create_gpt2_tokenizer(bpe);
+
+    c.bench_function("BPE GPT2 encode many (no cache)", |b| {
+        b.iter_custom(|iters| {
+            let mut duration = Duration::new(0, 0);
+            let mut line_index: usize = 0;
+            for _i in 0..iters {
+                if line_index >= lines.len() {
+                    line_index = 0;
+                }
+                let input = lines[line_index].clone();
+                let start = Instant::now();
+                let _ = black_box(tokenizer.encode(input));
+                duration = duration.checked_add(start.elapsed()).unwrap();
+            }
+            duration
+        })
+    });
+
+    c.bench_function("BPE GPT2 encode batch many (no cache)", |b| {
+        b.iter_custom(|iters| {
+            let mut duration = Duration::new(0, 0);
+            let mut batch_index: usize = 0;
+            for _i in 0..iters {
+                if batch_index >= batches.len() {
+                    batch_index = 0;
+                }
+                let batch = batches[batch_index].clone();
+                let start = Instant::now();
+                let _ = black_box(tokenizer.encode_batch(batch));
+                duration = duration.checked_add(start.elapsed()).unwrap();
+            }
+            duration
+        })
+    });
 }
 
 criterion_group! {
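
The uncached setup differs from the cached one only in .cache_capacity(0), which allocates a zero-sized merge cache so every word goes through the full BPE merge loop on each encode; running the "(no cache)" variants next to the originals isolates how much the cache contributes. The criterion_group! body is truncated in this view; a sketch of the usual Criterion wiring, with the config line an assumption:

    criterion_group! {
        name = benches;
        config = Criterion::default().sample_size(20); // assumed config, not shown in this diff
        targets = bench_gpt2
    }
    criterion_main!(benches);

With that in place, cargo bench runs the benchmarks and reports the cached and uncached timings side by side.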