Python - Tests for parallelism with multiprocessing

Co-authored-by: Evan Pete Walsh <epwalsh10@gmail.com>
2025-09-04 00:09:34 +00:00 · 2020-06-23 11:25:39 -04:00
parent 5f760df231
commit aa3b39f692
5 changed files with 55 additions and 4 deletions
--- a/bindings/python/tests/bindings/test_tokenizer.py
+++ b/bindings/python/tests/bindings/test_tokenizer.py
@ -1,6 +1,6 @@
 import pickle
 import pytest
-from ..utils import data_dir, roberta_files, bert_files
+from ..utils import data_dir, roberta_files, bert_files, multiprocessing_with_parallelism

 from tokenizers import AddedToken, Tokenizer, Encoding
 from tokenizers.models import Model, BPE, WordPiece
@ -289,3 +289,8 @@ class TestTokenizer:
        # Can post process a pair of encodings
        output = tokenizer.post_process(encoding, pair_encoding)
        assert output.tokens == ["my", "pair", "[PAD]", "[PAD]"]
+
+    def test_multiprocessing_with_parallelism(self):
+        tokenizer = Tokenizer(BPE())
+        multiprocessing_with_parallelism(tokenizer, False)
+        multiprocessing_with_parallelism(tokenizer, True)
--- a/bindings/python/tests/implementations/test_bert_wordpiece.py
+++ b/bindings/python/tests/implementations/test_bert_wordpiece.py
@ -1,4 +1,4 @@
-from ..utils import data_dir, bert_files
+from ..utils import data_dir, bert_files, multiprocessing_with_parallelism
 from tokenizers import BertWordPieceTokenizer


@ -19,3 +19,8 @@ class TestBertWordPieceBPE:
        assert output.tokens == ["my", "name", "is", "john", "pair"]
        assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15), (0, 4)]
        assert output.type_ids == [0, 0, 0, 0, 1]
+
+    def test_multiprocessing_with_parallelism(self, bert_files):
+        tokenizer = BertWordPieceTokenizer(bert_files["vocab"])
+        multiprocessing_with_parallelism(tokenizer, False)
+        multiprocessing_with_parallelism(tokenizer, True)
--- a/bindings/python/tests/implementations/test_byte_level_bpe.py
+++ b/bindings/python/tests/implementations/test_byte_level_bpe.py
@ -1,4 +1,4 @@
-from ..utils import data_dir, roberta_files
+from ..utils import data_dir, roberta_files, multiprocessing_with_parallelism
 from tokenizers import ByteLevelBPETokenizer


@ -79,3 +79,8 @@ class TestByteLevelBPE:
            "Ġlazy",
            "Ġdog",
        ]
+
+    def test_multiprocessing_with_parallelism(self, roberta_files):
+        tokenizer = ByteLevelBPETokenizer(roberta_files["vocab"], roberta_files["merges"])
+        multiprocessing_with_parallelism(tokenizer, False)
+        multiprocessing_with_parallelism(tokenizer, True)
--- a/bindings/python/tests/implementations/test_char_bpe.py
+++ b/bindings/python/tests/implementations/test_char_bpe.py
@ -1,4 +1,4 @@
-from ..utils import data_dir, openai_files
+from ..utils import data_dir, openai_files, multiprocessing_with_parallelism
 from tokenizers import CharBPETokenizer


@ -42,3 +42,8 @@ class TestBertWordPieceBPE:
        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"], lowercase=True)
        decoded = tokenizer.decode(tokenizer.encode("my name is john").ids)
        assert decoded == "my name is john"
+
+    def test_multiprocessing_with_parallelism(self, openai_files):
+        tokenizer = CharBPETokenizer(openai_files["vocab"], openai_files["merges"])
+        multiprocessing_with_parallelism(tokenizer, False)
+        multiprocessing_with_parallelism(tokenizer, True)
--- a/bindings/python/tests/utils.py
+++ b/bindings/python/tests/utils.py
@ -1,3 +1,4 @@
+import multiprocessing as mp
 import os
 import requests
 import pytest
@ -56,3 +57,33 @@ def openai_files(data_dir):
            "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt"
        ),
    }
+
+
+def multiprocessing_with_parallelism(tokenizer, enabled: bool):
+    """
+    This helper can be used to test that disabling parallelism avoids deadlocks when the
+    same tokenizer is used again in a child process after forking.
+    """
+    # It's essential to this test that we call 'encode' or 'encode_batch'
+    # before the fork. This causes the main process to "lock" some resources
+    # provided by the Rust "rayon" crate that are needed for parallel processing.
+    tokenizer.encode("Hi")
+    tokenizer.encode_batch(["hi", "there"])
+
+    def encode(tokenizer):
+        tokenizer.encode("Hi")
+        tokenizer.encode_batch(["hi", "there"])
+
+    # Make sure this environment variable is set before the fork happens
+    os.environ["TOKENIZERS_PARALLELISM"] = str(enabled)
+    p = mp.Process(target=encode, args=(tokenizer,))
+    p.start()
+    p.join(timeout=1)
+
+    # At this point the child should have exited if parallelism was disabled; with it enabled, a
+    # forked child is expected to still be hanging. So we check the status and kill it if needed
+    alive = p.is_alive()
+    if alive:
+        p.terminate()
+
+    assert (alive and mp.get_start_method() == "fork") == enabled