mirror of
https://github.com/mii443/tokenizers.git
synced 2025-08-23 00:35:35 +00:00
Fixing a bug where long tokenizer files would be incorrectly deserialized (#459)
* Fixing a bug where long tokenizer files would be incorrectly deserialized - Add a bunch of tests to check deserialization behaviour - One test also confirms current Single deserialization of Sequence. * Better test locations for Windows + no file dependency in Python binding Rust side. * Addressing @n1t0 comments.
This commit is contained in:
@ -75,6 +75,13 @@ def train_files(data_dir):
|
||||
}
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def albert_base(data_dir):
    """Fetch the ALBERT base v1 tokenizer JSON once per test session.

    Passes the tokenizer URL to the ``download`` helper and returns
    whatever that helper returns.
    """
    # NOTE(review): `data_dir` is not used in the body; presumably it is
    # requested so the data directory fixture runs first -- confirm.
    tokenizer_url = "https://s3.amazonaws.com/models.huggingface.co/bert/albert-base-v1-tokenizer.json"
    return download(tokenizer_url)
|
||||
|
||||
|
||||
def multiprocessing_with_parallelism(tokenizer, enabled: bool):
|
||||
"""
|
||||
This helper can be used to test that disabling parallelism avoids dead locks when the
|
||||
|
Reference in New Issue
Block a user