Mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-23 00:35:35 +00:00)
Python - Fix ByteLevel instantiation from state (#621)
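Context: the `ByteLevel` components serialize their state as a JSON string via `__getstate__()`. That state contains keys the constructors did not accept (presumably including the serialization `type` tag), so rebuilding an instance with `ByteLevel(**state)` raised a `TypeError`. This commit makes the constructors tolerate unknown keyword arguments. A minimal sketch of the round-trip the new tests below exercise:

    import json
    from tokenizers.pre_tokenizers import ByteLevel

    # Rebuild a ByteLevel pre-tokenizer from its saved state. Before this
    # fix, extra keys in the state dict made the constructor call raise.
    state = json.loads(ByteLevel().__getstate__())
    reloaded = ByteLevel(**state)
    assert isinstance(reloaded, ByteLevel)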
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - [#617]: Fix offsets produced by Precompiled Normalizer (used by tokenizers converted from SPM)
 - [#618]: Fix Normalizer.normalize with `PyNormalizedStringRefMut`
 - [#620]: Fix serialization/deserialization for overlapping models
+- [#621]: Fix `ByteLevel` instantiation from a previously saved state (using `__getstate__()`)

 ## [0.10.0]
@@ -301,6 +302,7 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug that was causing crashes in Python 3.5

+[#621]: https://github.com/huggingface/tokenizers/pull/621
 [#620]: https://github.com/huggingface/tokenizers/pull/620
 [#618]: https://github.com/huggingface/tokenizers/pull/618
 [#617]: https://github.com/huggingface/tokenizers/pull/617
@@ -243,8 +243,8 @@ impl PyByteLevel {
     }

     #[new]
-    #[args(add_prefix_space = "true")]
-    fn new(add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
+    #[args(add_prefix_space = "true", _kwargs = "**")]
+    fn new(add_prefix_space: bool, _kwargs: Option<&PyDict>) -> PyResult<(Self, PyPreTokenizer)> {
         Ok((
             PyByteLevel {},
             ByteLevel::default()
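The hunk above adds a `**kwargs`-style catch-all (`_kwargs = "**"`) to the pyo3 constructor of the ByteLevel pre-tokenizer, so extra keys from a saved state are accepted and silently ignored. A rough Python analogue of the pattern, using a hypothetical `ByteLevelLike` class purely for illustration:

    import json

    class ByteLevelLike:
        def __init__(self, add_prefix_space=True, **_kwargs):
            # Unknown keys from a saved state (e.g. "type") land in
            # **_kwargs and are dropped, mirroring the Rust-side catch-all.
            self.add_prefix_space = add_prefix_space

        def __getstate__(self):
            return json.dumps(
                {"type": "ByteLevel", "add_prefix_space": self.add_prefix_space}
            )

    state = json.loads(ByteLevelLike().__getstate__())
    assert ByteLevelLike(**state).add_prefix_space is True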
@@ -232,8 +232,11 @@ pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
     #[new]
-    #[args(trim_offsets = "None")]
-    fn new(trim_offsets: Option<bool>) -> PyResult<(Self, PyPostProcessor)> {
+    #[args(trim_offsets = "None", _kwargs = "**")]
+    fn new(
+        trim_offsets: Option<bool>,
+        _kwargs: Option<&PyDict>,
+    ) -> PyResult<(Self, PyPostProcessor)> {
         let mut byte_level = ByteLevel::default();

         if let Some(to) = trim_offsets {
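The post-processor binding gets the same catch-all. Note the None-means-default convention: `trim_offsets` defaults to `None`, so the underlying Rust default is kept unless the caller (or a reloaded state) sets it explicitly. Sketched in Python with a hypothetical stand-in class and an assumed default value:

    class PostByteLevelLike:
        DEFAULT_TRIM_OFFSETS = True  # assumed default, for this sketch only

        def __init__(self, trim_offsets=None, **_kwargs):
            # None keeps the default; any explicit bool overrides it.
            self.trim_offsets = (
                self.DEFAULT_TRIM_OFFSETS if trim_offsets is None else trim_offsets
            )

    assert PostByteLevelLike().trim_offsets is True
    # Extra state keys (e.g. "type") are swallowed by **_kwargs:
    assert PostByteLevelLike(trim_offsets=False, type="ByteLevel").trim_offsets is False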
@@ -1,5 +1,6 @@
 import pytest
 import pickle
+import json

 from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder

@@ -15,6 +16,12 @@ class TestByteLevel:
         decoder = ByteLevel()
         assert decoder.decode(["My", "Ġname", "Ġis", "ĠJohn"]) == "My name is John"

+    def test_manual_reload(self):
+        byte_level = ByteLevel()
+        state = json.loads(byte_level.__getstate__())
+        reloaded = ByteLevel(**state)
+        assert isinstance(reloaded, ByteLevel)
+

 class TestWordPiece:
     def test_instantiate(self):
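The test file also imports `pickle`: pickling goes through `__getstate__`/`__setstate__` directly, whereas `test_manual_reload` splats the decoded JSON state back into the constructor, which is the path this commit fixes. A quick round-trip check of the pickle path, assuming an install with this fix:

    import pickle
    from tokenizers.decoders import ByteLevel

    restored = pickle.loads(pickle.dumps(ByteLevel()))
    assert isinstance(restored, ByteLevel)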
@@ -1,5 +1,6 @@
 import pytest
 import pickle
+import json

 from tokenizers.pre_tokenizers import (
     PreTokenizer,

@@ -39,6 +40,12 @@ class TestByteLevel:
         pretok.add_prefix_space = True
         assert pretok.add_prefix_space == True

+    def test_manual_reload(self):
+        byte_level = ByteLevel()
+        state = json.loads(byte_level.__getstate__())
+        reloaded = ByteLevel(**state)
+        assert isinstance(reloaded, ByteLevel)
+

 class TestSplit:
     def test_instantiate(self):
@@ -1,5 +1,6 @@
 import pytest
 import pickle
+import json

 from ..utils import data_dir, roberta_files

@@ -84,6 +85,12 @@ class TestByteLevelProcessing:
         assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]

+    def test_manual_reload(self):
+        byte_level = ByteLevel()
+        state = json.loads(byte_level.__getstate__())
+        reloaded = ByteLevel(**state)
+        assert isinstance(reloaded, ByteLevel)
+

 class TestTemplateProcessing:
     def get_bert(self):
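All three `ByteLevel` variants (pre-tokenizer, post-processor, decoder) gain the same reload test. A combined sketch of the check, assuming a build with this fix applied:

    import json
    from tokenizers import decoders, pre_tokenizers, processors

    for cls in (pre_tokenizers.ByteLevel, processors.ByteLevel, decoders.ByteLevel):
        state = json.loads(cls().__getstate__())
        assert isinstance(cls(**state), cls)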