Python - Fix ByteLevel instantiation from state (#621)
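`__getstate__()` returns a component's full serialized state, which carries more keys (for example the component's `type` tag) than the Python-facing constructors accepted, so rebuilding with `ByteLevel(**state)` raised a `TypeError`. The changes below let the affected `#[new]` constructors absorb unknown keyword arguments through a catch-all `_kwargs`, and add a `test_manual_reload` regression test for the ByteLevel decoder, pre-tokenizer, and post-processor.

A minimal sketch of the round-trip being fixed (it mirrors the new tests; the exact keys inside `state` are an assumption about the serialized layout):

    import json
    from tokenizers.pre_tokenizers import ByteLevel

    byte_level = ByteLevel()
    state = json.loads(byte_level.__getstate__())
    # `state` holds every serialized field, e.g. {"type": "ByteLevel",
    # "add_prefix_space": True, ...} -- more than new() used to accept.
    reloaded = ByteLevel(**state)  # raised TypeError before this fix
    assert isinstance(reloaded, ByteLevel)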
bindings/python/CHANGELOG.md
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 - [#617]: Fix offsets produced by Precompiled Normalizer (used by tokenizers converted from SPM)
 - [#618]: Fix Normalizer.normalize with `PyNormalizedStringRefMut`
 - [#620]: Fix serialization/deserialization for overlapping models
+- [#621]: Fix `ByteLevel` instantiation from a previously saved state (using `__getstate__()`)

 ## [0.10.0]

@@ -301,6 +302,7 @@ delimiter (Works like `.split(delimiter)`)
 - Fix a bug that was causing crashes in Python 3.5


+[#621]: https://github.com/huggingface/tokenizers/pull/621
 [#620]: https://github.com/huggingface/tokenizers/pull/620
 [#618]: https://github.com/huggingface/tokenizers/pull/618
 [#617]: https://github.com/huggingface/tokenizers/pull/617
bindings/python/src/pre_tokenizers.rs
@@ -243,8 +243,8 @@ impl PyByteLevel {
     }

     #[new]
-    #[args(add_prefix_space = "true")]
-    fn new(add_prefix_space: bool) -> PyResult<(Self, PyPreTokenizer)> {
+    #[args(add_prefix_space = "true", _kwargs = "**")]
+    fn new(add_prefix_space: bool, _kwargs: Option<&PyDict>) -> PyResult<(Self, PyPreTokenizer)> {
         Ok((
             PyByteLevel {},
             ByteLevel::default()
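With the catch-all in place, the pre-tokenizer constructor tolerates keyword arguments it does not model, which is exactly what `ByteLevel(**state)` needs. A hedged illustration (passing `type` directly is an assumption about what the serialized state contains):

    from tokenizers.pre_tokenizers import ByteLevel

    # Unknown keys land in _kwargs and are ignored instead of raising.
    pretok = ByteLevel(add_prefix_space=True, type="ByteLevel")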
bindings/python/src/processors.rs
@@ -232,8 +232,11 @@ pub struct PyByteLevel {}
 #[pymethods]
 impl PyByteLevel {
     #[new]
-    #[args(trim_offsets = "None")]
-    fn new(trim_offsets: Option<bool>) -> PyResult<(Self, PyPostProcessor)> {
+    #[args(trim_offsets = "None", _kwargs = "**")]
+    fn new(
+        trim_offsets: Option<bool>,
+        _kwargs: Option<&PyDict>,
+    ) -> PyResult<(Self, PyPostProcessor)> {
         let mut byte_level = ByteLevel::default();

         if let Some(to) = trim_offsets {
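The post-processor constructor gets the same treatment, with the longer signature split across lines; the leading underscore marks `_kwargs` as deliberately unused so the compiler does not warn. A usage sketch (assuming the processor pickles cleanly, as the tests' existing `pickle` imports suggest):

    import pickle
    from tokenizers.processors import ByteLevel

    proc = ByteLevel(trim_offsets=True)
    restored = pickle.loads(pickle.dumps(proc))
    assert isinstance(restored, ByteLevel)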
bindings/python/tests/bindings/test_decoders.py
@@ -1,5 +1,6 @@
 import pytest
 import pickle
+import json

 from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder

@@ -15,6 +16,12 @@ class TestByteLevel:
         decoder = ByteLevel()
         assert decoder.decode(["My", "Ġname", "Ġis", "ĠJohn"]) == "My name is John"

+    def test_manual_reload(self):
+        byte_level = ByteLevel()
+        state = json.loads(byte_level.__getstate__())
+        reloaded = ByteLevel(**state)
+        assert isinstance(reloaded, ByteLevel)
+

 class TestWordPiece:
     def test_instantiate(self):
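The same `test_manual_reload` round-trip is repeated for the pre-tokenizer and post-processor test modules below, so each ByteLevel variant is covered.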
bindings/python/tests/bindings/test_pre_tokenizers.py
@@ -1,5 +1,6 @@
 import pytest
 import pickle
+import json

 from tokenizers.pre_tokenizers import (
     PreTokenizer,
@@ -39,6 +40,12 @@ class TestByteLevel:
         pretok.add_prefix_space = True
         assert pretok.add_prefix_space == True

+    def test_manual_reload(self):
+        byte_level = ByteLevel()
+        state = json.loads(byte_level.__getstate__())
+        reloaded = ByteLevel(**state)
+        assert isinstance(reloaded, ByteLevel)
+

 class TestSplit:
     def test_instantiate(self):
bindings/python/tests/bindings/test_processors.py
@@ -1,5 +1,6 @@
 import pytest
 import pickle
+import json

 from ..utils import data_dir, roberta_files

@@ -84,6 +85,12 @@ class TestByteLevelProcessing:
         assert output.tokens == ["ĠMy", "Ġname", "Ġis", "ĠJohn"]
         assert output.offsets == [(0, 2), (3, 7), (8, 10), (11, 15)]

+    def test_manual_reload(self):
+        byte_level = ByteLevel()
+        state = json.loads(byte_level.__getstate__())
+        reloaded = ByteLevel(**state)
+        assert isinstance(reloaded, ByteLevel)
+

 class TestTemplateProcessing:
     def get_bert(self):