Python - Test and fix classes pickling

Anthony MOI
2020-05-18 19:52:10 -04:00
parent 6a70162d78
commit c5bba91bf4
11 changed files with 76 additions and 24 deletions
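The change works in two steps. First, every exposed `#[pyclass]` now declares `module = "tokenizers.<submodule>"`: pickle serializes class instances by reference, recording `__module__` and `__qualname__` and re-importing that path on load, and a PyO3 class without an explicit module reports a path under which it cannot be looked up (PyO3 defaults it to `builtins`). Second, classes whose `__new__` takes required arguments gain a `__getnewargs__` implementation so pickle knows what to pass to `__new__`. A minimal pure-Python sketch of the first failure mode (not part of the commit; this `ByteLevel` is a hypothetical stand-in for the PyO3-backed class):

    import pickle

    class ByteLevel:  # hypothetical stand-in, importable from __main__
        pass

    # With a resolvable __module__, the round trip works.
    assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)

    # Simulate a class whose __module__ does not point at an importable
    # location, as for a #[pyclass] without an explicit `module = ...`.
    ByteLevel.__module__ = "builtins"
    try:
        pickle.dumps(ByteLevel())
    except pickle.PicklingError as exc:
        print(exc)  # attribute lookup ByteLevel on builtins failed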

==== file 1 of 11 ====

@@ -56,7 +56,7 @@ impl Decoder {
}
}
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
pub struct ByteLevel {}
#[pymethods]
impl ByteLevel {
@@ -71,7 +71,7 @@ impl ByteLevel {
}
}
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
pub struct WordPiece {}
#[pymethods]
impl WordPiece {
@@ -101,7 +101,7 @@ impl WordPiece {
}
}
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
pub struct Metaspace {}
#[pymethods]
impl Metaspace {
@@ -139,7 +139,7 @@ impl Metaspace {
}
}
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
pub struct BPEDecoder {}
#[pymethods]
impl BPEDecoder {

==== file 2 of 11 ====

@@ -173,7 +173,7 @@ impl Model {
/// BPE Model
/// Allows the creation of a BPE Model to be used with a Tokenizer
-#[pyclass(extends=Model)]
+#[pyclass(extends=Model, module = "tokenizers.models")]
pub struct BPE {}
#[pymethods]
@@ -235,7 +235,7 @@ impl BPE {
}
/// WordPiece Model
-#[pyclass(extends=Model)]
+#[pyclass(extends=Model, module = "tokenizers.models")]
pub struct WordPiece {}
#[pymethods]
@@ -284,7 +284,7 @@ impl WordPiece {
}
}
-#[pyclass(extends=Model)]
+#[pyclass(extends=Model, module = "tokenizers.models")]
pub struct WordLevel {}
#[pymethods]

==== file 3 of 11 ====

@@ -42,7 +42,7 @@ impl Normalizer {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct BertNormalizer {}
#[pymethods]
impl BertNormalizer {
@@ -81,7 +81,7 @@ impl BertNormalizer {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct NFD {}
#[pymethods]
impl NFD {
@@ -96,7 +96,7 @@ impl NFD {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct NFKD {}
#[pymethods]
impl NFKD {
@@ -111,7 +111,7 @@ impl NFKD {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct NFC {}
#[pymethods]
impl NFC {
@@ -126,7 +126,7 @@ impl NFC {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct NFKC {}
#[pymethods]
impl NFKC {
@@ -141,7 +141,7 @@ impl NFKC {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct Sequence {}
#[pymethods]
impl Sequence {
@@ -170,9 +170,13 @@ impl Sequence {
},
))
}
+fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+Ok(PyTuple::new(py, &[PyList::empty(py)]))
+}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct Lowercase {}
#[pymethods]
impl Lowercase {
@@ -187,7 +191,7 @@ impl Lowercase {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct Strip {}
#[pymethods]
impl Strip {
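`Sequence` above is the one normalizer whose `__new__` requires an argument (the list of normalizers to chain), so declaring the module is not enough: pickle rebuilds instances by calling `cls.__new__(cls, *args)`, and `__getnewargs__` supplies that `args` tuple, here a single empty list as a placeholder. A hedged pure-Python sketch of the protocol (the class below is a hypothetical stand-in, not the real binding):

    import pickle

    class Sequence:  # hypothetical stand-in for the PyO3-backed class
        def __new__(cls, normalizers):
            self = super().__new__(cls)
            self.normalizers = list(normalizers)
            return self

        def __getnewargs__(self):
            # Mirrors Ok(PyTuple::new(py, &[PyList::empty(py)])) above:
            # unpickling will call Sequence.__new__(Sequence, []).
            return ([],)

    assert isinstance(pickle.loads(pickle.dumps(Sequence([]))), Sequence)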

==== file 4 of 11 ====

@@ -62,7 +62,7 @@ impl PreTokenizer {
}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct ByteLevel {}
#[pymethods]
impl ByteLevel {
@@ -99,7 +99,7 @@ impl ByteLevel {
}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct Whitespace {}
#[pymethods]
impl Whitespace {
@@ -114,7 +114,7 @@ impl Whitespace {
}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct WhitespaceSplit {}
#[pymethods]
impl WhitespaceSplit {
@@ -129,7 +129,7 @@ impl WhitespaceSplit {
}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct CharDelimiterSplit {}
#[pymethods]
impl CharDelimiterSplit {
@@ -150,9 +150,13 @@ impl CharDelimiterSplit {
},
))
}
+fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+Ok(PyTuple::new(py, &[" "]))
+}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct BertPreTokenizer {}
#[pymethods]
impl BertPreTokenizer {
@@ -167,7 +171,7 @@ impl BertPreTokenizer {
}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct Metaspace {}
#[pymethods]
impl Metaspace {

==== file 5 of 11 ====

@@ -46,7 +46,7 @@ impl PostProcessor {
}
}
-#[pyclass(extends=PostProcessor)]
+#[pyclass(extends=PostProcessor, module = "tokenizers.processors")]
pub struct BertProcessing {}
#[pymethods]
impl BertProcessing {
@@ -61,9 +61,13 @@ impl BertProcessing {
},
))
}
+fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+Ok(PyTuple::new(py, &[("", 0), ("", 0)]))
+}
}
-#[pyclass(extends=PostProcessor)]
+#[pyclass(extends=PostProcessor, module = "tokenizers.processors")]
pub struct RobertaProcessing {}
#[pymethods]
impl RobertaProcessing {
@@ -86,9 +90,13 @@ impl RobertaProcessing {
},
))
}
+fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+Ok(PyTuple::new(py, &[("", 0), ("", 0)]))
+}
}
-#[pyclass(extends=PostProcessor)]
+#[pyclass(extends=PostProcessor, module = "tokenizers.processors")]
pub struct ByteLevel {}
#[pymethods]
impl ByteLevel {
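The `("", 0)` tuples that `BertProcessing` and `RobertaProcessing` return from `__getnewargs__` are only placeholders: pickle first rebuilds the object by passing them to `__new__`, then restores the real captured state on top (presumably via the `__getstate__`/`__setstate__` pair these bindings rely on, which is not shown in this diff). A hedged pure-Python sketch of that two-step reconstruction, with a hypothetical stand-in class:

    import pickle

    class BertProcessing:  # hypothetical stand-in for the PyO3-backed class
        def __new__(cls, sep, cls_token):
            self = super().__new__(cls)
            self.sep, self.cls_token = sep, cls_token
            return self

        def __getnewargs__(self):
            return (("", 0), ("", 0))  # placeholders, replaced by __setstate__

        def __getstate__(self):
            return {"sep": self.sep, "cls_token": self.cls_token}

        def __setstate__(self, state):
            self.__dict__.update(state)

    p = pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1))))
    assert p.sep == ("[SEP]", 0) and p.cls_token == ("[CLS]", 1)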

==== file 6 of 11 ====

@@ -1,4 +1,5 @@
import pytest
+import pickle
from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder
@@ -8,6 +9,7 @@ class TestByteLevel:
assert ByteLevel() is not None
assert isinstance(ByteLevel(), Decoder)
assert isinstance(ByteLevel(), ByteLevel)
+assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
def test_decoding(self):
decoder = ByteLevel()
@@ -21,6 +23,7 @@ class TestWordPiece:
assert WordPiece(cleanup=True) is not None
assert isinstance(WordPiece(), Decoder)
assert isinstance(WordPiece(), WordPiece)
+assert isinstance(pickle.loads(pickle.dumps(WordPiece())), WordPiece)
def test_decoding(self):
decoder = WordPiece()
@@ -40,6 +43,7 @@ class TestMetaspace:
assert Metaspace(add_prefix_space=True) is not None
assert isinstance(Metaspace(), Decoder)
assert isinstance(Metaspace(), Metaspace)
+assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
def test_decoding(self):
decoder = Metaspace()
@@ -54,6 +58,7 @@ class TestBPEDecoder:
assert BPEDecoder(suffix="_") is not None
assert isinstance(BPEDecoder(), Decoder)
assert isinstance(BPEDecoder(), BPEDecoder)
+assert isinstance(pickle.loads(pickle.dumps(BPEDecoder())), BPEDecoder)
def test_decoding(self):
decoder = BPEDecoder()

==== file 7 of 11 ====

@@ -1,4 +1,5 @@
import pytest
+import pickle
from ..utils import data_dir, roberta_files, bert_files
@@ -13,6 +14,9 @@ class TestBPE:
with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
BPE(vocab=roberta_files["vocab"])
BPE(merges=roberta_files["merges"])
+assert isinstance(
+pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))), BPE
+)
class TestWordPiece:
@@ -20,6 +24,7 @@ class TestWordPiece:
assert isinstance(WordPiece(), Model)
assert isinstance(WordPiece(), WordPiece)
assert isinstance(WordPiece(bert_files["vocab"]), Model)
+assert isinstance(pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece)
class TestWordLevel:
@@ -29,3 +34,4 @@ class TestWordLevel:
# The WordLevel model expects a vocab.json using the same format as roberta
# so we can just try to load with this file
assert isinstance(WordLevel(roberta_files["vocab"]), Model)
+assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel)

==== file 8 of 11 ====

@@ -1,3 +1,5 @@
+import pickle
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Normalizer, BertNormalizer, Sequence, Lowercase, Strip
@@ -7,6 +9,7 @@ class TestBertNormalizer:
def test_instantiate(self):
assert isinstance(BertNormalizer(), Normalizer)
assert isinstance(BertNormalizer(), BertNormalizer)
+assert isinstance(pickle.loads(pickle.dumps(BertNormalizer())), BertNormalizer)
def test_strip_accents(self):
tokenizer = Tokenizer(BPE())
@@ -49,6 +52,7 @@ class TestSequence:
def test_instantiate(self):
assert isinstance(Sequence([]), Normalizer)
assert isinstance(Sequence([]), Sequence)
+assert isinstance(pickle.loads(pickle.dumps(Sequence([]))), Sequence)
def test_can_make_sequences(self):
tokenizer = Tokenizer(BPE())
@@ -62,6 +66,7 @@ class TestLowercase:
def test_instantiate(self):
assert isinstance(Lowercase(), Normalizer)
assert isinstance(Lowercase(), Lowercase)
+assert isinstance(pickle.loads(pickle.dumps(Lowercase())), Lowercase)
def test_lowercase(self):
tokenizer = Tokenizer(BPE())
@@ -75,6 +80,7 @@ class TestStrip:
def test_instantiate(self):
assert isinstance(Strip(), Normalizer)
assert isinstance(Strip(), Strip)
+assert isinstance(pickle.loads(pickle.dumps(Strip())), Strip)
def test_left_strip(self):
tokenizer = Tokenizer(BPE())

==== file 9 of 11 ====

@@ -1,4 +1,5 @@
import pytest
+import pickle
from tokenizers.pre_tokenizers import (
PreTokenizer,
@@ -18,6 +19,7 @@ class TestByteLevel:
assert ByteLevel(add_prefix_space=False) is not None
assert isinstance(ByteLevel(), PreTokenizer)
assert isinstance(ByteLevel(), ByteLevel)
+assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
def test_has_alphabet(self):
assert isinstance(ByteLevel.alphabet(), list)
@@ -29,6 +31,7 @@ class TestWhitespace:
assert Whitespace() is not None
assert isinstance(Whitespace(), PreTokenizer)
assert isinstance(Whitespace(), Whitespace)
+assert isinstance(pickle.loads(pickle.dumps(Whitespace())), Whitespace)
class TestWhitespaceSplit:
@@ -36,6 +39,7 @@ class TestWhitespaceSplit:
assert WhitespaceSplit() is not None
assert isinstance(WhitespaceSplit(), PreTokenizer)
assert isinstance(WhitespaceSplit(), WhitespaceSplit)
+assert isinstance(pickle.loads(pickle.dumps(WhitespaceSplit())), WhitespaceSplit)
class TestBertPreTokenizer:
@@ -43,6 +47,7 @@ class TestBertPreTokenizer:
assert BertPreTokenizer() is not None
assert isinstance(BertPreTokenizer(), PreTokenizer)
assert isinstance(BertPreTokenizer(), BertPreTokenizer)
+assert isinstance(pickle.loads(pickle.dumps(BertPreTokenizer())), BertPreTokenizer)
class TestMetaspace:
@@ -54,6 +59,7 @@ class TestMetaspace:
assert Metaspace(add_prefix_space=True) is not None
assert isinstance(Metaspace(), PreTokenizer)
assert isinstance(Metaspace(), Metaspace)
+assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
class TestCharDelimiterSplit:
@@ -63,3 +69,4 @@ class TestCharDelimiterSplit:
CharDelimiterSplit("")
assert isinstance(CharDelimiterSplit(" "), PreTokenizer)
assert isinstance(CharDelimiterSplit(" "), CharDelimiterSplit)
+assert isinstance(pickle.loads(pickle.dumps(CharDelimiterSplit("-"))), CharDelimiterSplit)

==== file 10 of 11 ====

@@ -1,3 +1,5 @@
+import pickle
from ..utils import data_dir, roberta_files
from tokenizers import Tokenizer
@@ -12,6 +14,9 @@ class TestBertProcessing:
assert processor is not None
assert isinstance(processor, PostProcessor)
assert isinstance(processor, BertProcessing)
+assert isinstance(
+pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing
+)
def test_processing(self):
tokenizer = Tokenizer(BPE())
@@ -30,6 +35,10 @@ class TestRobertaProcessing:
assert processor is not None
assert isinstance(processor, PostProcessor)
assert isinstance(processor, RobertaProcessing)
+assert isinstance(
+pickle.loads(pickle.dumps(RobertaProcessing(("</s>", 1), ("<s>", 0)))),
+RobertaProcessing,
+)
def test_processing(self):
tokenizer = Tokenizer(BPE())
@@ -48,6 +57,7 @@ class TestByteLevelProcessing:
assert ByteLevel(trim_offsets=True) is not None
assert isinstance(ByteLevel(), PostProcessor)
assert isinstance(ByteLevel(), ByteLevel)
+assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
def test_processing(self, roberta_files):
tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))

==== file 11 of 11 ====

@@ -1,3 +1,4 @@
+import pickle
import pytest
from ..utils import data_dir, roberta_files, bert_files
@@ -68,6 +69,7 @@ class TestTokenizer:
assert tokenizer.pre_tokenizer is None
assert tokenizer.post_processor is None
assert tokenizer.decoder is None
+assert isinstance(pickle.loads(pickle.dumps(Tokenizer(BPE()))), Tokenizer)
def test_add_tokens(self):
tokenizer = Tokenizer(BPE())
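With every component picklable, a fully configured Tokenizer can cross process boundaries, which is the practical payoff of this commit (e.g. for multiprocessing data loaders). A hedged usage sketch, not taken from the test suite; it assumes `pre_tokenizer` is a settable property, as the read access in the test above suggests:

    import pickle
    from tokenizers import Tokenizer
    from tokenizers.models import BPE
    from tokenizers.pre_tokenizers import Whitespace

    tokenizer = Tokenizer(BPE())
    tokenizer.pre_tokenizer = Whitespace()  # assumed settable property

    # The whole pipeline round-trips, not just the bare Tokenizer.
    restored = pickle.loads(pickle.dumps(tokenizer))
    assert isinstance(restored, Tokenizer)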