From 35f338a7b89e5240794fc0b75e0e43038fc401be Mon Sep 17 00:00:00 2001
From: Nicolas Patry
Date: Wed, 31 Jul 2024 17:09:23 +0200
Subject: [PATCH] Add benchmark vs tiktoken (#1582)

* Adding a simple tiktoken benchmark.

* Adding 1 large fused document case.

---
 bindings/python/benches/test_tiktoken.py | 117 ++++++++++++++++++++++
 1 file changed, 117 insertions(+)
 create mode 100644 bindings/python/benches/test_tiktoken.py

diff --git a/bindings/python/benches/test_tiktoken.py b/bindings/python/benches/test_tiktoken.py
new file mode 100644
index 00000000..ce2e2821
--- /dev/null
+++ b/bindings/python/benches/test_tiktoken.py
@@ -0,0 +1,117 @@
+import os
+import time
+import argparse
+from datasets import load_dataset
+from tiktoken.load import load_tiktoken_bpe
+import tiktoken
+from tokenizers import Tokenizer
+from huggingface_hub import hf_hub_download
+from typing import Tuple, List
+
+MODEL_ID = "meta-llama/Meta-Llama-3.1-8B"
+DATASET = "facebook/xnli"
+DATASET_CONFIG = "all_languages"
+DEFAULT_THREADS = [2**i for i in range(8) if 2**i <= os.cpu_count()]
+
+
+def format_byte_size(num_bytes: int) -> Tuple[str, str]:
+    """Convert bytes to a human-readable format (KB, MB, GB)."""
+    num_bytes_f = float(num_bytes)
+    for unit in ["B", "KB", "MB", "GB", "TB"]:
+        if num_bytes_f < 1024:
+            return f"{num_bytes_f:.2f} {unit}", unit
+        num_bytes_f /= 1024
+    return f"{num_bytes_f:.2f} PB", "PB"
+
+
+def benchmark_batch(model: str, documents: List[str]) -> None:
+    num_threads = int(os.environ["RAYON_NUM_THREADS"])
+    num_bytes = sum(map(len, map(str.encode, documents)))
+    readable_size, unit = format_byte_size(num_bytes)
+    print("==============")
+    print(f"num_threads: {num_threads}, data size: {readable_size}, documents: {len(documents)}")
+    filename = hf_hub_download(model, "original/tokenizer.model")
+    mergeable_ranks = load_tiktoken_bpe(filename)
+    pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
+    num_reserved_special_tokens = 256
+    special_tokens = [
+        "<|begin_of_text|>",
+        "<|end_of_text|>",
+        "<|reserved_special_token_0|>",
+        "<|reserved_special_token_1|>",
+        "<|reserved_special_token_2|>",
+        "<|reserved_special_token_3|>",
+        "<|start_header_id|>",
+        "<|end_header_id|>",
+        "<|reserved_special_token_4|>",
+        "<|eot_id|>",  # end of turn
+    ] + [
+        f"<|reserved_special_token_{i}|>"
+        for i in range(5, num_reserved_special_tokens - 5)
+    ]
+    num_base_tokens = len(mergeable_ranks)
+    special_tokens = {
+        token: num_base_tokens + i for i, token in enumerate(special_tokens)
+    }
+    enc = tiktoken.Encoding(
+        name=model,
+        pat_str=pat_str,
+        mergeable_ranks=mergeable_ranks,
+        special_tokens=special_tokens,
+    )
+    enc.encode("warmup")
+
+    start = time.perf_counter_ns()
+    enc.encode_ordinary_batch(documents, num_threads=num_threads)
+    end = time.perf_counter_ns()
+
+    readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
+    print(f"tiktoken \t{readable_size} / s")
+
+    hf_enc = Tokenizer.from_pretrained(model)
+    hf_enc.encode("warmup")
+
+    start = time.perf_counter_ns()
+    hf_enc.encode_batch(documents)
+    end = time.perf_counter_ns()
+    readable_size, unit = format_byte_size(num_bytes / (end - start) * 1e9)
+    print(f"huggingface \t{readable_size} / s")
+
+
+def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
+    dataset_xnli = load_dataset(dataset, dataset_config)
+
+    input_lengths = [(10, False), (10_000, False), (10_000, True)]  # Example input lengths
+
+    for num_threads in threads:
+        os.environ["RAYON_NUM_THREADS"] = str(num_threads)
+        os.environ["TOKENIZERS_PARALLELISM"] = str(num_threads)
+        os.environ["RAYON_RS_NUM_THREADS"] = str(num_threads)
+        for length, fuse in input_lengths:
+            documents = []
+            for i, item in enumerate(dataset_xnli["train"]):
+                if i >= length:
+                    break
+                documents.append("".join(item["premise"].values()))
+            if fuse:
+                documents = ["".join(documents)]
+            benchmark_batch(model, documents)
+
+
+def main():
+
+    parser = argparse.ArgumentParser(
+        prog="bench_tokenizer",
+        description="Getting a feel for speed when tokenizing",
+    )
+    parser.add_argument("-m", "--model", default=MODEL_ID, type=str)
+    parser.add_argument("-d", "--dataset", default=DATASET, type=str)
+    parser.add_argument("-ds", "--dataset-config", default=DATASET_CONFIG, type=str)
+    parser.add_argument("-t", "--threads", nargs="+", default=DEFAULT_THREADS, type=int)
+    args = parser.parse_args()
+    test(args.model, args.dataset, args.dataset_config, args.threads)
+
+
+# Call the function to run the benchmark
+if __name__ == "__main__":
+    main()