mirror of
https://github.com/mii443/tokenizers.git
synced 2025-09-04 00:09:34 +00:00
Python - Tests for parallelism with multiprocessing
Co-authored-by: Evan Pete Walsh <epwalsh10@gmail.com>
This commit is contained in:
@ -1,6 +1,6 @@
|
||||
import pickle
|
||||
import pytest
|
||||
from ..utils import data_dir, roberta_files, bert_files
|
||||
from ..utils import data_dir, roberta_files, bert_files, multiprocessing_with_parallelism
|
||||
|
||||
from tokenizers import AddedToken, Tokenizer, Encoding
|
||||
from tokenizers.models import Model, BPE, WordPiece
|
||||
@ -289,3 +289,8 @@ class TestTokenizer:
|
||||
# Can post process a pair of encodings
|
||||
output = tokenizer.post_process(encoding, pair_encoding)
|
||||
assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
|
||||
|
||||
def test_multiprocessing_with_parallelism(self):
    """Run the multiprocessing helper on a bare BPE tokenizer, parallelism off then on."""
    bpe_tokenizer = Tokenizer(BPE())
    # Same order as before: disabled first, then enabled.
    for parallelism in (False, True):
        multiprocessing_with_parallelism(bpe_tokenizer, parallelism)
|
||||
|
@ -1,4 +1,4 @@
|
||||
from ..utils import data_dir, bert_files
|
||||
from ..utils import data_dir, bert_files, multiprocessing_with_parallelism
|
||||
from tokenizers import BertWordPieceTokenizer
|
||||
|
||||
|
||||
@ -19,3 +19,8 @@ class TestBertWordPieceBPE:
|
||||
assert output.tokens == ["my", "name", "is", "john", "pair"]
|
||||
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
|
||||
assert output.type_ids == [0, 0, 0, 0, 1]
|
||||
|
||||
def test_multiprocessing_with_parallelism(self, bert_files):
    """Run the multiprocessing helper on a BertWordPieceTokenizer, parallelism off then on."""
    wordpiece_tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
    # Same order as before: disabled first, then enabled.
    for parallelism in (False, True):
        multiprocessing_with_parallelism(wordpiece_tokenizer, parallelism)
|
||||
|
@ -1,4 +1,4 @@
|
||||
from ..utils import data_dir, roberta_files
|
||||
from ..utils import data_dir, roberta_files, multiprocessing_with_parallelism
|
||||
from tokenizers import ByteLevelBPETokenizer
|
||||
|
||||
|
||||
@ -79,3 +79,8 @@ class TestByteLevelBPE:
|
||||
"Ġlazy",
|
||||
"Ġdog",
|
||||
]
|
||||
|
||||
def test_multiprocessing_with_parallelism(self, roberta_files):
    """Run the multiprocessing helper on a ByteLevelBPETokenizer, parallelism off then on."""
    vocab_path = roberta_files["vocab"]
    merges_path = roberta_files["merges"]
    byte_level_tokenizer = ByteLevelBPETokenizer(vocab_path, merges_path)
    # Same order as before: disabled first, then enabled.
    for parallelism in (False, True):
        multiprocessing_with_parallelism(byte_level_tokenizer, parallelism)
|
||||
|
@ -1,4 +1,4 @@
|
||||
from ..utils import data_dir, openai_files
|
||||
from ..utils import data_dir, openai_files, multiprocessing_with_parallelism
|
||||
from tokenizers import CharBPETokenizer
|
||||
|
||||
|
||||
@ -42,3 +42,8 @@ class TestBertWordPieceBPE:
|
||||
tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
|
||||
decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
|
||||
assert decoded == "my name is john"
|
||||
|
||||
def test_multiprocessing_with_parallelism(self, openai_files):
    """Run the multiprocessing helper on a CharBPETokenizer, parallelism off then on."""
    vocab_path = openai_files["vocab"]
    merges_path = openai_files["merges"]
    char_bpe_tokenizer = CharBPETokenizer(vocab_path, merges_path)
    # Same order as before: disabled first, then enabled.
    for parallelism in (False, True):
        multiprocessing_with_parallelism(char_bpe_tokenizer, parallelism)
|
||||
|
@ -1,3 +1,4 @@
|
||||
import multiprocessing as mp
|
||||
import os
|
||||
import requests
|
||||
import pytest
|
||||
@ -56,3 +57,33 @@ def openai_files(data_dir):
|
||||
"https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
|
||||
),
|
||||
}
|
||||
|
||||
|
||||
def _encode_in_child(tokenizer):
    """Encode with `tokenizer`; this is the body of the child process.

    Defined at module level (not nested) so that it can be pickled when the
    multiprocessing start method is "spawn" or "forkserver".
    """
    tokenizer.encode("Hi")
    tokenizer.encode_batch(["hi", "there"])


def multiprocessing_with_parallelism(tokenizer, enabled: bool):
    """
    This helper can be used to test that disabling parallelism avoids deadlocks when the
    same tokenizer is used after forking.

    Args:
        tokenizer: Any object exposing `encode(str)` and `encode_batch(list)`.
        enabled: Value written (as `str(enabled)`) to the `TOKENIZERS_PARALLELISM`
            environment variable for the lifetime of the child process.

    Raises:
        AssertionError: If the start method is "fork" and the child's liveness after
            the 1 second join timeout does not match `enabled` (still alive when
            enabled, exited when disabled).
    """
    # It's essential to this test that we call 'encode' or 'encode_batch'
    # before the fork. This causes the main process to "lock" some resources
    # provided by the Rust "rayon" crate that are needed for parallel processing.
    tokenizer.encode("Hi")
    tokenizer.encode_batch(["hi", "there"])

    # Remember the caller's setting so this helper does not leak its value
    # into every test that runs afterwards in the same session.
    previous = os.environ.get("TOKENIZERS_PARALLELISM")

    # Make sure this environment variable is set before the fork happens
    os.environ["TOKENIZERS_PARALLELISM"] = str(enabled)
    try:
        p = mp.Process(target=_encode_in_child, args=(tokenizer,))
        p.start()
        p.join(timeout=1)

        # At this point the process should have successfully exited, depending on
        # whether parallelism was activated or not. So we check the status and kill
        # it if needed
        alive = p.is_alive()
        if alive:
            p.terminate()
            # Reap the terminated child so it does not linger as a zombie.
            p.join(timeout=5)
    finally:
        if previous is None:
            os.environ.pop("TOKENIZERS_PARALLELISM", None)
        else:
            os.environ["TOKENIZERS_PARALLELISM"] = previous

    # Only a forked child is expected to hang; with any other start method no fork
    # happened, so there is nothing to check.
    if mp.get_start_method() == "fork":
        assert alive == enabled
|
||||
|
Reference in New Issue
Block a user