mirror of https://github.com/mii443/tokenizers.git
synced 2025-08-22 16:25:30 +00:00
add no cache benchmarks
@@ -12,11 +12,7 @@ use tokenizers::tokenizer::{AddedToken, EncodeInput, Tokenizer};
 
 static BATCH_SIZE: usize = 1_000;
 
-fn create_gpt2_tokenizer() -> Tokenizer {
-    let bpe = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt")
-        .unwrap()
-        .build()
-        .unwrap();
+fn create_gpt2_tokenizer(bpe: BPE) -> Tokenizer {
     let mut tokenizer = Tokenizer::new(Box::new(bpe));
     tokenizer.with_pre_tokenizer(Box::new(ByteLevel::new(true)));
     tokenizer.with_decoder(Box::new(ByteLevel::new(false)));
@@ -38,7 +34,11 @@ fn line_to_input(line: io::Result<String>) -> EncodeInput {
 }
 
 fn bench_gpt2(c: &mut Criterion) {
-    let tokenizer = create_gpt2_tokenizer();
+    let bpe = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt")
+        .unwrap()
+        .build()
+        .unwrap();
+    let tokenizer = create_gpt2_tokenizer(bpe);
     let mut lines: Vec<EncodeInput> = vec![];
     let mut batches: Vec<Vec<EncodeInput>> = vec![vec![]];
     for line in BufReader::new(File::open(Path::new("benches/big.txt")).unwrap())
@@ -85,6 +85,47 @@ fn bench_gpt2(c: &mut Criterion) {
             duration
         })
     });
+
+    let bpe = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt")
+        .unwrap()
+        .cache_capacity(0)
+        .build()
+        .unwrap();
+    let tokenizer = create_gpt2_tokenizer(bpe);
+
+    c.bench_function("BPE GPT2 encode many (no cache)", |b| {
+        b.iter_custom(|iters| {
+            let mut duration = Duration::new(0, 0);
+            let mut line_index: usize = 0;
+            for _i in 0..iters {
+                if line_index >= lines.len() {
+                    line_index = 0;
+                }
+                let input = lines[line_index].clone();
+                let start = Instant::now();
+                let _ = black_box(tokenizer.encode(input));
+                duration = duration.checked_add(start.elapsed()).unwrap();
+            }
+            duration
+        })
+    });
+
+    c.bench_function("BPE GPT2 encode batch many (no cache)", |b| {
+        b.iter_custom(|iters| {
+            let mut duration = Duration::new(0, 0);
+            let mut batch_index: usize = 0;
+            for _i in 0..iters {
+                if batch_index >= batches.len() {
+                    batch_index = 0;
+                }
+                let batch = batches[batch_index].clone();
+                let start = Instant::now();
+                let _ = black_box(tokenizer.encode_batch(batch));
+                duration = duration.checked_add(start.elapsed()).unwrap();
+            }
+            duration
+        })
+    });
 }
 
 criterion_group! {
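Note on the refactor: create_gpt2_tokenizer no longer builds its own BPE; the caller constructs one and passes it in, which is what lets the new benchmarks opt out of the merge cache. A minimal usage sketch of the two call sites this enables (variable names are illustrative, and it assumes the bench file's existing BPE import; the builder calls are the same ones shown in the diff):

    // Inside bench_gpt2: one BPE with the default cache, one with the cache disabled.
    let with_cache = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt")
        .unwrap()
        .build()
        .unwrap();
    let no_cache = BPE::from_files("benches/gpt2-vocab.json", "benches/gpt2-merges.txt")
        .unwrap()
        .cache_capacity(0) // capacity 0 disables the merge cache, as in the new benchmarks
        .build()
        .unwrap();
    let cached_tokenizer = create_gpt2_tokenizer(with_cache);
    let uncached_tokenizer = create_gpt2_tokenizer(no_cache);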
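The new benchmarks use Criterion's iter_custom so that only encode()/encode_batch() falls inside the measured window, while the per-iteration clone of the input does not. A condensed sketch of that pattern (the helper name and the modulo indexing are illustrative, not from the commit; the commit's version resets the index manually and accumulates with checked_add):

    use criterion::{black_box, Criterion};
    use std::time::{Duration, Instant};
    use tokenizers::tokenizer::{EncodeInput, Tokenizer};

    // Hypothetical helper showing the manual-timing shape of the new benchmarks.
    fn bench_encode_no_cache(c: &mut Criterion, tokenizer: &Tokenizer, lines: &[EncodeInput]) {
        c.bench_function("BPE GPT2 encode many (no cache)", |b| {
            b.iter_custom(|iters| {
                let mut duration = Duration::new(0, 0);
                for i in 0..iters {
                    // Clone outside the timed region so only encode() is measured.
                    let input = lines[i as usize % lines.len()].clone();
                    let start = Instant::now();
                    let _ = black_box(tokenizer.encode(input));
                    duration += start.elapsed();
                }
                duration
            })
        });
    }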