Python - Fix ByteLevel instantiation from state (#621)

This commit is contained in:
Anthony MOI
2021-02-04 10:16:05 -05:00
committed by GitHub
parent 324cb8d380
commit 57200144ca
6 changed files with 30 additions and 4 deletions

View File

@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- [#617]: Fix offsets produced by Precompiled Normalizer (used by tokenizers converted from SPM)
- [#618]: Fix Normalizer.normalize with `PyNormalizedStringRefMut`
- [#620]: Fix serialization/deserialization for overlapping models
- [#621]: Fix `ByteLevel` instantiation from a previously saved state (using `__getstate__()`)
## [0.10.0]
@ -301,6 +302,7 @@ delimiter (Works like `.split(delimiter)`)
- Fix a bug that was causing crashes in Python 3.5
[#621]: https://github.com/huggingface/tokenizers/pull/621
[#620]: https://github.com/huggingface/tokenizers/pull/620
[#618]: https://github.com/huggingface/tokenizers/pull/618
[#617]: https://github.com/huggingface/tokenizers/pull/617

View File

@ -243,8 +243,8 @@ impl PyByteLevel {
}
#[new]
#[args(add_prefix_space = "true")]
fn new(add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
#[args(add_prefix_space = "true", _kwargs = "**")]
fn new(add_prefix_space: bool, _kwargs: Option<&PyDict>) -> PyResult<(Self, PyPreTokenizer)> {
Ok((
PyByteLevel {},
ByteLevel::default()

View File

@ -232,8 +232,11 @@ pub struct PyByteLevel {}
#[pymethods]
impl PyByteLevel {
#[new]
#[args(trim_offsets = "None")]
fn new(trim_offsets: Option<bool>) -> PyResult<(Self, PyPostProcessor)> {
#[args(trim_offsets = "None", _kwargs = "**")]
fn new(
trim_offsets: Option<bool>,
_kwargs: Option<&PyDict>,
) -> PyResult<(Self, PyPostProcessor)> {
let mut byte_level = ByteLevel::default();
if let Some(to) = trim_offsets {

View File

@ -1,5 +1,6 @@
import pytest
import pickle
import json
from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder
@ -15,6 +16,12 @@ class TestByteLevel:
decoder = ByteLevel()
assert decoder.decode(["My", "Ġname", "Ġis", "ĠJohn"]) == "My name is John"
def test_manual_reload(self):
    """Round-trip a ByteLevel decoder through its serialized state.

    __getstate__() returns a JSON string; its keys must be accepted as
    keyword arguments by the constructor (regression test for #621).
    """
    original = ByteLevel()
    saved_kwargs = json.loads(original.__getstate__())
    restored = ByteLevel(**saved_kwargs)
    assert isinstance(restored, ByteLevel)
class TestWordPiece:
def test_instantiate(self):

View File

@ -1,5 +1,6 @@
import pytest
import pickle
import json
from tokenizers.pre_tokenizers import (
PreTokenizer,
@ -39,6 +40,12 @@ class TestByteLevel:
pretok.add_prefix_space = True
assert pretok.add_prefix_space == True
def test_manual_reload(self):
    """Verify a ByteLevel pre-tokenizer can be rebuilt from __getstate__().

    The serialized state is a JSON object whose keys must be valid
    constructor keyword arguments (regression test for #621).
    """
    pretok = ByteLevel()
    state_dict = json.loads(pretok.__getstate__())
    rebuilt = ByteLevel(**state_dict)
    assert isinstance(rebuilt, ByteLevel)
class TestSplit:
def test_instantiate(self):

View File

@ -1,5 +1,6 @@
import pytest
import pickle
import json
from ..utils import data_dir, roberta_files
@ -84,6 +85,12 @@ class TestByteLevelProcessing:
assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]
def test_manual_reload(self):
    """Ensure ByteLevel post-processing can be re-instantiated from its state.

    Loads the JSON produced by __getstate__() and feeds it back as
    keyword arguments to the constructor (regression test for #621).
    """
    processor = ByteLevel()
    serialized = processor.__getstate__()
    reloaded = ByteLevel(**json.loads(serialized))
    assert isinstance(reloaded, ByteLevel)
class TestTemplateProcessing:
def get_bert(self):