Python - Test and fix classes pickling

Anthony MOI
2020-05-18 19:52:10 -04:00
parent 6a70162d78
commit c5bba91bf4
11 changed files with 76 additions and 24 deletions

View File

@@ -56,7 +56,7 @@ impl Decoder {
}
}
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
pub struct ByteLevel {}
#[pymethods]
impl ByteLevel {
@@ -71,7 +71,7 @@ impl ByteLevel {
}
}
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
pub struct WordPiece {}
#[pymethods]
impl WordPiece {
@@ -101,7 +101,7 @@ impl WordPiece {
}
}
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
pub struct Metaspace {}
#[pymethods]
impl Metaspace {
@@ -139,7 +139,7 @@ impl Metaspace {
}
}
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
pub struct BPEDecoder {}
#[pymethods]
impl BPEDecoder {
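Why the module = "..." attribute fixes pickling: pickle never serializes a class body, it records the importable path "<__module__>.<__qualname__>" of the instance's class and re-imports it on load. Without an explicit module, PyO3 does not know the Python-facing package the class is exposed from, so the recorded path does not resolve and the dump/load cycle can fail. A minimal pure-Python sketch of that lookup (the ByteLevel class below is a stand-in defined for illustration, not the real binding):

import pickle

class ByteLevel:
    """Pure-Python stand-in for the pyclass-backed decoder; illustration only."""

# pickle stores a reference "<module>.<qualname>", not the class definition,
# so the class must be importable from whatever __module__ claims.
print(ByteLevel.__module__)   # "__main__" here; "tokenizers.decoders" once module = ... is set
payload = pickle.dumps(ByteLevel())
assert isinstance(pickle.loads(payload), ByteLevel)

# If __module__ names something that cannot be imported, even dumping fails:
ByteLevel.__module__ = "not_a_real_module"
try:
    pickle.dumps(ByteLevel())
except pickle.PicklingError as exc:
    print("pickling failed:", exc)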

View File

@@ -173,7 +173,7 @@ impl Model {
/// BPE Model
/// Allows the creation of a BPE Model to be used with a Tokenizer
-#[pyclass(extends=Model)]
+#[pyclass(extends=Model, module = "tokenizers.models")]
pub struct BPE {}
#[pymethods]
@@ -235,7 +235,7 @@ impl BPE {
}
/// WordPiece Model
-#[pyclass(extends=Model)]
+#[pyclass(extends=Model, module = "tokenizers.models")]
pub struct WordPiece {}
#[pymethods]
@@ -284,7 +284,7 @@ impl WordPiece {
}
}
-#[pyclass(extends=Model)]
+#[pyclass(extends=Model, module = "tokenizers.models")]
pub struct WordLevel {}
#[pymethods]

View File

@@ -42,7 +42,7 @@ impl Normalizer {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct BertNormalizer {}
#[pymethods]
impl BertNormalizer {
@@ -81,7 +81,7 @@ impl BertNormalizer {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct NFD {}
#[pymethods]
impl NFD {
@@ -96,7 +96,7 @@ impl NFD {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct NFKD {}
#[pymethods]
impl NFKD {
@@ -111,7 +111,7 @@ impl NFKD {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct NFC {}
#[pymethods]
impl NFC {
@@ -126,7 +126,7 @@ impl NFC {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct NFKC {}
#[pymethods]
impl NFKC {
@@ -141,7 +141,7 @@ impl NFKC {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct Sequence {}
#[pymethods]
impl Sequence {
@@ -170,9 +170,13 @@ impl Sequence {
},
))
}
+fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+    Ok(PyTuple::new(py, &[PyList::empty(py)]))
+}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct Lowercase {}
#[pymethods]
impl Lowercase {
@@ -187,7 +191,7 @@ impl Lowercase {
}
}
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
pub struct Strip {}
#[pymethods]
impl Strip {
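The extra __getnewargs__ on Sequence is needed because its constructor takes a required argument: protocol-2+ pickling recreates an instance by calling cls.__new__(cls, *args), and when __getnewargs__ is absent it calls __new__ with no arguments at all, which fails for such classes. The placeholder only has to match the constructor's shape; the real contents are restored afterwards from the pickled state. The same trick covers CharDelimiterSplit, BertProcessing and RobertaProcessing further down, whose placeholders are a one-character delimiter and pairs of ("", 0) tuples. A rough pure-Python sketch of the mechanism (a stand-in class, not the binding itself):

import pickle

class Sequence:
    """Stand-in whose constructor, like the Rust #[new] above, requires an argument."""

    def __new__(cls, normalizers):
        self = super().__new__(cls)
        self.normalizers = list(normalizers)
        return self

    # Placeholder arguments for __new__ during unpickling, mirroring the
    # PyTuple::new(py, &[PyList::empty(py)]) returned by the binding above.
    def __getnewargs__(self):
        return ([],)

seq = Sequence(["lowercase", "nfkc"])
restored = pickle.loads(pickle.dumps(seq))
print(restored.normalizers)   # ['lowercase', 'nfkc']; real state comes from __dict__, not the placeholder

# Without __getnewargs__, unpickling would call Sequence.__new__(Sequence) and
# raise: TypeError: __new__() missing 1 required positional argument: 'normalizers'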

View File

@@ -62,7 +62,7 @@ impl PreTokenizer {
}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct ByteLevel {}
#[pymethods]
impl ByteLevel {
@@ -99,7 +99,7 @@ impl ByteLevel {
}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct Whitespace {}
#[pymethods]
impl Whitespace {
@@ -114,7 +114,7 @@ impl Whitespace {
}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct WhitespaceSplit {}
#[pymethods]
impl WhitespaceSplit {
@@ -129,7 +129,7 @@ impl WhitespaceSplit {
}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct CharDelimiterSplit {}
#[pymethods]
impl CharDelimiterSplit {
@@ -150,9 +150,13 @@ impl CharDelimiterSplit {
},
))
}
+fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+    Ok(PyTuple::new(py, &[" "]))
+}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct BertPreTokenizer {}
#[pymethods]
impl BertPreTokenizer {
@@ -167,7 +171,7 @@ impl BertPreTokenizer {
}
}
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
pub struct Metaspace {}
#[pymethods]
impl Metaspace {

View File

@@ -46,7 +46,7 @@ impl PostProcessor {
}
}
-#[pyclass(extends=PostProcessor)]
+#[pyclass(extends=PostProcessor, module = "tokenizers.processors")]
pub struct BertProcessing {}
#[pymethods]
impl BertProcessing {
@@ -61,9 +61,13 @@ impl BertProcessing {
},
))
}
+fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+    Ok(PyTuple::new(py, &[("", 0), ("", 0)]))
+}
}
-#[pyclass(extends=PostProcessor)]
+#[pyclass(extends=PostProcessor, module = "tokenizers.processors")]
pub struct RobertaProcessing {}
#[pymethods]
impl RobertaProcessing {
@@ -86,9 +90,13 @@ impl RobertaProcessing {
},
))
}
+fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+    Ok(PyTuple::new(py, &[("", 0), ("", 0)]))
+}
}
-#[pyclass(extends=PostProcessor)]
+#[pyclass(extends=PostProcessor, module = "tokenizers.processors")]
pub struct ByteLevel {}
#[pymethods]
impl ByteLevel {

View File

@@ -1,4 +1,5 @@
import pytest
+import pickle
from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder
@@ -8,6 +9,7 @@ class TestByteLevel:
assert ByteLevel() is not None
assert isinstance(ByteLevel(), Decoder)
assert isinstance(ByteLevel(), ByteLevel)
+assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
def test_decoding(self):
decoder = ByteLevel()
@@ -21,6 +23,7 @@ class TestWordPiece:
assert WordPiece(cleanup=True) is not None
assert isinstance(WordPiece(), Decoder)
assert isinstance(WordPiece(), WordPiece)
+assert isinstance(pickle.loads(pickle.dumps(WordPiece())), WordPiece)
def test_decoding(self):
decoder = WordPiece()
@@ -40,6 +43,7 @@ class TestMetaspace:
assert Metaspace(add_prefix_space=True) is not None
assert isinstance(Metaspace(), Decoder)
assert isinstance(Metaspace(), Metaspace)
+assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
def test_decoding(self):
decoder = Metaspace()
@@ -54,6 +58,7 @@ class TestBPEDecoder:
assert BPEDecoder(suffix="_") is not None
assert isinstance(BPEDecoder(), Decoder)
assert isinstance(BPEDecoder(), BPEDecoder)
+assert isinstance(pickle.loads(pickle.dumps(BPEDecoder())), BPEDecoder)
def test_decoding(self):
decoder = BPEDecoder()

View File

@@ -1,4 +1,5 @@
import pytest
+import pickle
from ..utils import data_dir, roberta_files, bert_files
@@ -13,6 +14,9 @@ class TestBPE:
with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
BPE(vocab=roberta_files["vocab"])
BPE(merges=roberta_files["merges"])
+assert isinstance(
+    pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))), BPE
+)
class TestWordPiece:
@@ -20,6 +24,7 @@ class TestWordPiece:
assert isinstance(WordPiece(), Model)
assert isinstance(WordPiece(), WordPiece)
assert isinstance(WordPiece(bert_files["vocab"]), Model)
+assert isinstance(pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece)
class TestWordLevel:
@@ -29,3 +34,4 @@ class TestWordLevel:
# The WordLevel model expects a vocab.json using the same format as roberta
# so we can just try to load with this file
assert isinstance(WordLevel(roberta_files["vocab"]), Model)
+assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel)

View File

@@ -1,3 +1,5 @@
+import pickle
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Normalizer, BertNormalizer, Sequence, Lowercase, Strip
@@ -7,6 +9,7 @@ class TestBertNormalizer:
def test_instantiate(self):
assert isinstance(BertNormalizer(), Normalizer)
assert isinstance(BertNormalizer(), BertNormalizer)
+assert isinstance(pickle.loads(pickle.dumps(BertNormalizer())), BertNormalizer)
def test_strip_accents(self):
tokenizer = Tokenizer(BPE())
@@ -49,6 +52,7 @@ class TestSequence:
def test_instantiate(self):
assert isinstance(Sequence([]), Normalizer)
assert isinstance(Sequence([]), Sequence)
+assert isinstance(pickle.loads(pickle.dumps(Sequence([]))), Sequence)
def test_can_make_sequences(self):
tokenizer = Tokenizer(BPE())
@@ -62,6 +66,7 @@ class TestLowercase:
def test_instantiate(self):
assert isinstance(Lowercase(), Normalizer)
assert isinstance(Lowercase(), Lowercase)
+assert isinstance(pickle.loads(pickle.dumps(Lowercase())), Lowercase)
def test_lowercase(self):
tokenizer = Tokenizer(BPE())
@@ -75,6 +80,7 @@ class TestStrip:
def test_instantiate(self):
assert isinstance(Strip(), Normalizer)
assert isinstance(Strip(), Strip)
+assert isinstance(pickle.loads(pickle.dumps(Strip())), Strip)
def test_left_strip(self):
tokenizer = Tokenizer(BPE())

View File

@@ -1,4 +1,5 @@
import pytest
+import pickle
from tokenizers.pre_tokenizers import (
PreTokenizer,
@@ -18,6 +19,7 @@ class TestByteLevel:
assert ByteLevel(add_prefix_space=False) is not None
assert isinstance(ByteLevel(), PreTokenizer)
assert isinstance(ByteLevel(), ByteLevel)
+assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
def test_has_alphabet(self):
assert isinstance(ByteLevel.alphabet(), list)
@@ -29,6 +31,7 @@ class TestWhitespace:
assert Whitespace() is not None
assert isinstance(Whitespace(), PreTokenizer)
assert isinstance(Whitespace(), Whitespace)
+assert isinstance(pickle.loads(pickle.dumps(Whitespace())), Whitespace)
class TestWhitespaceSplit:
@@ -36,6 +39,7 @@ class TestWhitespaceSplit:
assert WhitespaceSplit() is not None
assert isinstance(WhitespaceSplit(), PreTokenizer)
assert isinstance(WhitespaceSplit(), WhitespaceSplit)
+assert isinstance(pickle.loads(pickle.dumps(WhitespaceSplit())), WhitespaceSplit)
class TestBertPreTokenizer:
@@ -43,6 +47,7 @@ class TestBertPreTokenizer:
assert BertPreTokenizer() is not None
assert isinstance(BertPreTokenizer(), PreTokenizer)
assert isinstance(BertPreTokenizer(), BertPreTokenizer)
+assert isinstance(pickle.loads(pickle.dumps(BertPreTokenizer())), BertPreTokenizer)
class TestMetaspace:
@@ -54,6 +59,7 @@ class TestMetaspace:
assert Metaspace(add_prefix_space=True) is not None
assert isinstance(Metaspace(), PreTokenizer)
assert isinstance(Metaspace(), Metaspace)
+assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
class TestCharDelimiterSplit:
@@ -63,3 +69,4 @@ class TestCharDelimiterSplit:
CharDelimiterSplit("")
assert isinstance(CharDelimiterSplit(" "), PreTokenizer)
assert isinstance(CharDelimiterSplit(" "), CharDelimiterSplit)
+assert isinstance(pickle.loads(pickle.dumps(CharDelimiterSplit("-"))), CharDelimiterSplit)

View File

@@ -1,3 +1,5 @@
+import pickle
from ..utils import data_dir, roberta_files
from tokenizers import Tokenizer
@@ -12,6 +14,9 @@ class TestBertProcessing:
assert processor is not None
assert isinstance(processor, PostProcessor)
assert isinstance(processor, BertProcessing)
+assert isinstance(
+    pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing
+)
def test_processing(self):
tokenizer = Tokenizer(BPE())
@@ -30,6 +35,10 @@ class TestRobertaProcessing:
assert processor is not None
assert isinstance(processor, PostProcessor)
assert isinstance(processor, RobertaProcessing)
+assert isinstance(
+    pickle.loads(pickle.dumps(RobertaProcessing(("</s>", 1), ("<s>", 0)))),
+    RobertaProcessing,
+)
def test_processing(self):
tokenizer = Tokenizer(BPE())
@@ -48,6 +57,7 @@ class TestByteLevelProcessing:
assert ByteLevel(trim_offsets=True) is not None
assert isinstance(ByteLevel(), PostProcessor)
assert isinstance(ByteLevel(), ByteLevel)
+assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
def test_processing(self, roberta_files):
tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))

View File

@@ -1,3 +1,4 @@
+import pickle
import pytest
from ..utils import data_dir, roberta_files, bert_files
@@ -68,6 +69,7 @@ class TestTokenizer:
assert tokenizer.pre_tokenizer is None
assert tokenizer.post_processor is None
assert tokenizer.decoder is None
+assert isinstance(pickle.loads(pickle.dumps(Tokenizer(BPE()))), Tokenizer)
def test_add_tokens(self):
tokenizer = Tokenizer(BPE())
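Taken together, the module attributes and the placeholder __getnewargs__ methods are what let each wrapper class, and a bare Tokenizer, survive a pickle round trip, which is exactly what the new assertions above check. A compact usage sketch covering the same cases (assuming only the Python API already exercised by these tests):

import pickle

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.decoders import ByteLevel

# Each object below round-trips through pickle once the fixes in this commit are in place.
for obj in (Lowercase(), Whitespace(), ByteLevel(), Tokenizer(BPE())):
    restored = pickle.loads(pickle.dumps(obj))
    assert isinstance(restored, type(obj)), type(obj).__name__

Being picklable is also what allows these objects to be handed to multiprocessing workers or cached with pickle-based tooling, though that use is not part of this commit.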