Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-26 18:19:37 +00:00)
Fixing a bug where long tokenizer files would be incorrectly deserialized (#459)
* Fixing a bug where long tokenizer files would be incorrectly deserialized
  - Add a bunch of tests to check deserialization behaviour
  - One test also confirms current Single deserialization of Sequence.
* Better test locations for Windows + no file dependency in Python binding Rust side.
* Addressing @n1t0 comments.
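The page shows only the Python regression test, not the Rust-side change itself. For context, here is a minimal std-only sketch of the failure mode the commit message describes, not the actual patch (the function names load_truncated and load_full are illustrative): deserializing from a single buffered read sees at most the reader's internal buffer, while reading the whole file first does not.

use std::fs::File;
use std::io::{BufRead, BufReader, Read};

// Buggy pattern (sketch): fill_buf() exposes at most one internal buffer's
// worth of data (8 KiB by default), so a tokenizer file larger than that is
// silently truncated before it ever reaches the JSON deserializer.
fn load_truncated(path: &str) -> std::io::Result<Vec<u8>> {
    let mut reader = BufReader::new(File::open(path)?);
    let chunk = reader.fill_buf()?; // at most the buffer capacity, no more
    Ok(chunk.to_vec())
}

// Fixed pattern (sketch): read the entire file up front, so the input length
// no longer depends on any intermediate buffer size.
fn load_full(path: &str) -> std::io::Result<String> {
    let mut content = String::new();
    File::open(path)?.read_to_string(&mut content)?;
    Ok(content)
}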
bindings/python/tests/test_serialization.py | 10 ++++++++++ (new file)
@@ -0,0 +1,10 @@
+from tokenizers import Tokenizer, models, normalizers
+from .utils import data_dir, albert_base
+
+
+class TestSerialization:
+    def test_full_serialization_albert(self, albert_base):
+        # Check we can read this file.
+        # This used to fail because the BufReader would fail when the
+        # file exceeds the buffer capacity.
+        tokenizer = Tokenizer.from_file(albert_base)
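To make the truncation concrete, a self-contained Rust repro (illustrative only, not part of this commit): a single fill_buf() on input larger than BufReader's default 8 KiB buffer yields only the first chunk, whereas read_to_end() recovers everything regardless of buffer size.

use std::io::{BufRead, BufReader, Cursor, Read};

fn main() {
    let data = vec![b'x'; 20_000]; // larger than BufReader's 8 KiB default

    // One buffered peek: sees at most the buffer capacity.
    let mut reader = BufReader::new(Cursor::new(data.clone()));
    let first_chunk = reader.fill_buf().unwrap().len();
    assert!(first_chunk < data.len()); // truncated view of the input

    // Reading to the end: sees the whole input.
    let mut all = Vec::new();
    BufReader::new(Cursor::new(data.clone()))
        .read_to_end(&mut all)
        .unwrap();
    assert_eq!(all.len(), data.len());

    println!("fill_buf saw {first_chunk} bytes; read_to_end saw {}", all.len());
}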