Fixing the benchmark. (#1583)
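With this change the requested thread count actually takes effect: benchmark_batch now receives num_threads as a parameter and exports RAYON_NUM_THREADS itself, and the test loop spawns a separate process per configuration instead of mutating environment variables in place, because the Rayon thread pool is global to a process and tokenizers cannot change its size at runtime. Two hedged sketches of the before/after behaviour follow the relevant hunks below.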
@@ -7,6 +7,7 @@ import tiktoken
 from tokenizers import Tokenizer
 from huggingface_hub import hf_hub_download
 from typing import Tuple, List
+from multiprocessing import Process
 
 MODEL_ID = "meta-llama/Meta-Llama-3.1-8B"
 DATASET = "facebook/xnli"
@@ -24,8 +25,8 @@ def format_byte_size(num_bytes: int) -> Tuple[str, str]:
     return f"{num_bytes_f:.2f} PB", "PB"
 
 
-def benchmark_batch(model: str, documents: list[str]) -> None:
-    num_threads = int(os.environ["RAYON_NUM_THREADS"])
+def benchmark_batch(model: str, documents: list[str], num_threads: int) -> None:
+    os.environ["RAYON_NUM_THREADS"] = str(num_threads)
     num_bytes = sum(map(len, map(str.encode, documents)))
     readable_size, unit = format_byte_size(num_bytes)
     print(f"==============")
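As a hedged aside on why the env-var read above became a write: in the old version every thread-count configuration ran inside one long-lived process, so only the value in effect when the Rayon pool was first created could matter. A minimal sketch of that pitfall, using a placeholder model (bert-base-uncased) rather than the benchmark's own Llama tokenizer, and assuming (as the comments in the last hunk below state) that the pool is global to the process and cannot be resized at runtime:

import os
from tokenizers import Tokenizer

tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # placeholder, not the benchmark's tokenizer
docs = ["An example sentence for timing."] * 1_000

for num_threads in (1, 4, 8):
    os.environ["RAYON_NUM_THREADS"] = str(num_threads)
    # Only the first encode_batch call in this process creates the Rayon pool;
    # later assignments to the variable are ignored, so all three runs end up
    # using the same number of threads.
    tokenizer.encode_batch(docs)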
@@ -35,20 +36,20 @@ def benchmark_batch(model: str, documents: list[str]) -> None:
     pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+"
     num_reserved_special_tokens = 256
     special_tokens = [
         "<|begin_of_text|>",
         "<|end_of_text|>",
         "<|reserved_special_token_0|>",
         "<|reserved_special_token_1|>",
         "<|reserved_special_token_2|>",
         "<|reserved_special_token_3|>",
         "<|start_header_id|>",
         "<|end_header_id|>",
         "<|reserved_special_token_4|>",
         "<|eot_id|>",  # end of turn
     ] + [
         f"<|reserved_special_token_{i}|>"
         for i in range(5, num_reserved_special_tokens - 5)
     ]
     num_base_tokens = len(mergeable_ranks)
     special_tokens = {
         token: num_base_tokens + i for i, token in enumerate(special_tokens)
@@ -84,18 +85,25 @@ def test(model: str, dataset: str, dataset_config: str, threads: List[int]):
     input_lengths = [(10, False), (10_000, False), (10_000, True)]  # Example input lengths
 
     for num_threads in threads:
-        os.environ["RAYON_NUM_THREADS"] = str(num_threads)
-        os.environ["TOKENIZER_PARALLELISM"] = str(num_threads)
-        os.environ["RAYON_RS_NUM_THREADS"] = str(num_threads)
         for length, fuse in input_lengths:
             documents = []
             for i, item in enumerate(dataset_xnli["train"]):
-                documents.append("".join(item["premise"].values()))
                 if i >= length:
                     break
+                documents.append("".join(item["premise"].values()))
             if fuse:
                 documents=["".join(documents)]
-            benchmark_batch(model, documents)
+
+            # Rayon thread pool is global to a process, we need to launch
+            # separate processes in order to accurately use the correct number of threads.
+            # Otherwise, we're simply running tokenizers in whatever tests comes first.
+            # tokenizers does NOT provide a method to change the number of threads during
+            # runtime.
+            p = Process(target=benchmark_batch, args=(model, documents, num_threads))
+            p.start()
+            p.join()
+
+            # benchmark_batch(model, documents, num_threads)
 
 
 def main():
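For reference, a self-contained sketch of the pattern this commit adopts: one process per thread count, so each run gets a fresh Rayon pool that reads RAYON_NUM_THREADS before any tokenization happens. Names here are placeholders (encode_with_threads, bert-base-uncased); the real benchmark loads the Llama 3.1 tokenizer and XNLI documents as shown above:

import os
import time
from multiprocessing import Process


def encode_with_threads(num_threads: int, documents: list[str]) -> None:
    # Export the thread count before tokenizers does any parallel work in this process.
    os.environ["RAYON_NUM_THREADS"] = str(num_threads)
    from tokenizers import Tokenizer

    tokenizer = Tokenizer.from_pretrained("bert-base-uncased")  # placeholder model
    start = time.perf_counter()
    tokenizer.encode_batch(documents)
    print(f"{num_threads} thread(s): {time.perf_counter() - start:.3f}s")


if __name__ == "__main__":
    docs = ["The quick brown fox jumps over the lazy dog."] * 10_000
    for n in (1, 2, 4, 8):
        # Fresh process per configuration, mirroring the p.start()/p.join() loop above.
        p = Process(target=encode_with_threads, args=(n, docs))
        p.start()
        p.join()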