mirror of https://github.com/mii443/tokenizers.git (synced 2025-08-22 16:25:30 +00:00)

Python - Test and fix classes pickling

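The hunks below make two kinds of changes: every exposed #[pyclass] gets an explicit module = "..." attribute, and classes whose constructors require arguments get a __getnewargs__ method, so the bound Python classes survive a pickle round-trip; the Python test files gain matching pickle.loads(pickle.dumps(...)) assertions. A minimal round-trip in the spirit of those tests (a sketch, assuming a tokenizers build that includes this commit):

import pickle
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.decoders import ByteLevel

# Both a bare decoder and a full Tokenizer can be round-tripped once the
# classes report a proper __module__ (see the pyclass changes below).
decoder = pickle.loads(pickle.dumps(ByteLevel()))
tokenizer = pickle.loads(pickle.dumps(Tokenizer(BPE())))
assert isinstance(decoder, ByteLevel)
assert isinstance(tokenizer, Tokenizer)
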
@@ -56,7 +56,7 @@ impl Decoder {
     }
 }
 
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
 pub struct ByteLevel {}
 #[pymethods]
 impl ByteLevel {
@@ -71,7 +71,7 @@ impl ByteLevel {
     }
 }
 
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
 pub struct WordPiece {}
 #[pymethods]
 impl WordPiece {
@@ -101,7 +101,7 @@ impl WordPiece {
     }
 }
 
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
 pub struct Metaspace {}
 #[pymethods]
 impl Metaspace {
@@ -139,7 +139,7 @@ impl Metaspace {
     }
 }
 
-#[pyclass(extends=Decoder)]
+#[pyclass(extends=Decoder, module = "tokenizers.decoders")]
 pub struct BPEDecoder {}
 #[pymethods]
 impl BPEDecoder {

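The only change in the decoder hunks above is the added module = "tokenizers.decoders" attribute. pickle stores such objects by reference, i.e. by __module__ plus qualified class name, and a pyo3 #[pyclass] without an explicit module typically reports "builtins" as its __module__, so unpickling could not re-import the class. A quick check of the effect (a sketch, assuming a build of this branch is installed):

import pickle
from tokenizers.decoders import WordPiece

decoder = WordPiece()
# With the module attribute set, the class advertises its real import path...
assert type(decoder).__module__ == "tokenizers.decoders"
# ...which is what lets pickle locate the class again when loading.
restored = pickle.loads(pickle.dumps(decoder))
assert isinstance(restored, WordPiece)
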
@@ -173,7 +173,7 @@ impl Model {
 
 /// BPE Model
 /// Allows the creation of a BPE Model to be used with a Tokenizer
-#[pyclass(extends=Model)]
+#[pyclass(extends=Model, module = "tokenizers.models")]
 pub struct BPE {}
 
 #[pymethods]
@@ -235,7 +235,7 @@ impl BPE {
 }
 
 /// WordPiece Model
-#[pyclass(extends=Model)]
+#[pyclass(extends=Model, module = "tokenizers.models")]
 pub struct WordPiece {}
 
 #[pymethods]
@@ -284,7 +284,7 @@ impl WordPiece {
     }
 }
 
-#[pyclass(extends=Model)]
+#[pyclass(extends=Model, module = "tokenizers.models")]
 pub struct WordLevel {}
 
 #[pymethods]

@@ -42,7 +42,7 @@ impl Normalizer {
     }
 }
 
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
 pub struct BertNormalizer {}
 #[pymethods]
 impl BertNormalizer {
@@ -81,7 +81,7 @@ impl BertNormalizer {
     }
 }
 
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
 pub struct NFD {}
 #[pymethods]
 impl NFD {
@@ -96,7 +96,7 @@ impl NFD {
     }
 }
 
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
 pub struct NFKD {}
 #[pymethods]
 impl NFKD {
@@ -111,7 +111,7 @@ impl NFKD {
     }
 }
 
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
 pub struct NFC {}
 #[pymethods]
 impl NFC {
@@ -126,7 +126,7 @@ impl NFC {
     }
 }
 
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
 pub struct NFKC {}
 #[pymethods]
 impl NFKC {
@@ -141,7 +141,7 @@ impl NFKC {
     }
 }
 
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
 pub struct Sequence {}
 #[pymethods]
 impl Sequence {
@@ -170,9 +170,13 @@ impl Sequence {
             },
         ))
     }
+
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+        Ok(PyTuple::new(py, &[PyList::empty(py)]))
+    }
 }
 
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
 pub struct Lowercase {}
 #[pymethods]
 impl Lowercase {
@@ -187,7 +191,7 @@ impl Lowercase {
    }
 }
 
-#[pyclass(extends=Normalizer)]
+#[pyclass(extends=Normalizer, module = "tokenizers.normalizers")]
 pub struct Strip {}
 #[pymethods]
 impl Strip {

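Sequence is constructed with a required list of normalizers, so setting the module is not enough: pickle also needs to know what arguments to pass to __new__ when it re-creates the object, which is what the added __getnewargs__ provides. The values only have to satisfy the constructor (an empty list here); any pickled state is applied after construction. A pure-Python sketch of the same protocol (illustrative only, not the pyo3 code):

import pickle

class SequenceLike:
    """Mimics a class whose __new__ requires an argument, like Sequence above."""

    def __new__(cls, normalizers):
        return super().__new__(cls)

    def __init__(self, normalizers):
        self.normalizers = list(normalizers)

    def __getnewargs__(self):
        # Placeholder constructor argument used only while unpickling,
        # mirroring Ok(PyTuple::new(py, &[PyList::empty(py)])) above.
        return ([],)

original = SequenceLike(["lowercase", "nfd"])
restored = pickle.loads(pickle.dumps(original))
assert restored.normalizers == ["lowercase", "nfd"]  # instance state survives the round-trip
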
@@ -62,7 +62,7 @@ impl PreTokenizer {
     }
 }
 
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
 pub struct ByteLevel {}
 #[pymethods]
 impl ByteLevel {
@@ -99,7 +99,7 @@ impl ByteLevel {
     }
 }
 
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
 pub struct Whitespace {}
 #[pymethods]
 impl Whitespace {
@@ -114,7 +114,7 @@ impl Whitespace {
     }
 }
 
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
 pub struct WhitespaceSplit {}
 #[pymethods]
 impl WhitespaceSplit {
@@ -129,7 +129,7 @@ impl WhitespaceSplit {
     }
 }
 
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
 pub struct CharDelimiterSplit {}
 #[pymethods]
 impl CharDelimiterSplit {
@@ -150,9 +150,13 @@ impl CharDelimiterSplit {
            },
        ))
    }
+
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+        Ok(PyTuple::new(py, &[" "]))
+    }
 }
 
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
 pub struct BertPreTokenizer {}
 #[pymethods]
 impl BertPreTokenizer {
@@ -167,7 +171,7 @@ impl BertPreTokenizer {
     }
 }
 
-#[pyclass(extends=PreTokenizer)]
+#[pyclass(extends=PreTokenizer, module = "tokenizers.pre_tokenizers")]
 pub struct Metaspace {}
 #[pymethods]
 impl Metaspace {

@@ -46,7 +46,7 @@ impl PostProcessor {
     }
 }
 
-#[pyclass(extends=PostProcessor)]
+#[pyclass(extends=PostProcessor, module = "tokenizers.processors")]
 pub struct BertProcessing {}
 #[pymethods]
 impl BertProcessing {
@@ -61,9 +61,13 @@ impl BertProcessing {
            },
        ))
    }
+
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+        Ok(PyTuple::new(py, &[("", 0), ("", 0)]))
+    }
 }
 
-#[pyclass(extends=PostProcessor)]
+#[pyclass(extends=PostProcessor, module = "tokenizers.processors")]
 pub struct RobertaProcessing {}
 #[pymethods]
 impl RobertaProcessing {
@@ -86,9 +90,13 @@ impl RobertaProcessing {
            },
        ))
    }
+
+    fn __getnewargs__<'p>(&self, py: Python<'p>) -> PyResult<&'p PyTuple> {
+        Ok(PyTuple::new(py, &[("", 0), ("", 0)]))
+    }
 }
 
-#[pyclass(extends=PostProcessor)]
+#[pyclass(extends=PostProcessor, module = "tokenizers.processors")]
 pub struct ByteLevel {}
 #[pymethods]
 impl ByteLevel {

@@ -1,4 +1,5 @@
 import pytest
+import pickle
 
 from tokenizers.decoders import Decoder, ByteLevel, WordPiece, Metaspace, BPEDecoder
 
@@ -8,6 +9,7 @@ class TestByteLevel:
         assert ByteLevel() is not None
         assert isinstance(ByteLevel(), Decoder)
         assert isinstance(ByteLevel(), ByteLevel)
+        assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
 
     def test_decoding(self):
         decoder = ByteLevel()
@@ -21,6 +23,7 @@ class TestWordPiece:
         assert WordPiece(cleanup=True) is not None
         assert isinstance(WordPiece(), Decoder)
         assert isinstance(WordPiece(), WordPiece)
+        assert isinstance(pickle.loads(pickle.dumps(WordPiece())), WordPiece)
 
     def test_decoding(self):
         decoder = WordPiece()
@@ -40,6 +43,7 @@ class TestMetaspace:
         assert Metaspace(add_prefix_space=True) is not None
         assert isinstance(Metaspace(), Decoder)
         assert isinstance(Metaspace(), Metaspace)
+        assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
 
     def test_decoding(self):
         decoder = Metaspace()
@@ -54,6 +58,7 @@ class TestBPEDecoder:
         assert BPEDecoder(suffix="_") is not None
         assert isinstance(BPEDecoder(), Decoder)
         assert isinstance(BPEDecoder(), BPEDecoder)
+        assert isinstance(pickle.loads(pickle.dumps(BPEDecoder())), BPEDecoder)
 
     def test_decoding(self):
         decoder = BPEDecoder()

@@ -1,4 +1,5 @@
 import pytest
+import pickle
 
 from ..utils import data_dir, roberta_files, bert_files
 
@@ -13,6 +14,9 @@ class TestBPE:
         with pytest.raises(ValueError, match="`vocab` and `merges` must be both specified"):
             BPE(vocab=roberta_files["vocab"])
             BPE(merges=roberta_files["merges"])
+        assert isinstance(
+            pickle.loads(pickle.dumps(BPE(roberta_files["vocab"], roberta_files["merges"]))), BPE
+        )
 
 
 class TestWordPiece:
@@ -20,6 +24,7 @@ class TestWordPiece:
         assert isinstance(WordPiece(), Model)
         assert isinstance(WordPiece(), WordPiece)
         assert isinstance(WordPiece(bert_files["vocab"]), Model)
+        assert isinstance(pickle.loads(pickle.dumps(WordPiece(bert_files["vocab"]))), WordPiece)
 
 
 class TestWordLevel:
@@ -29,3 +34,4 @@ class TestWordLevel:
         # The WordLevel model expects a vocab.json using the same format as roberta
         # so we can just try to load with this file
         assert isinstance(WordLevel(roberta_files["vocab"]), Model)
+        assert isinstance(WordLevel(roberta_files["vocab"]), WordLevel)

@@ -1,3 +1,5 @@
+import pickle
+
 from tokenizers import Tokenizer
 from tokenizers.models import BPE
 from tokenizers.normalizers import Normalizer, BertNormalizer, Sequence, Lowercase, Strip
@@ -7,6 +9,7 @@ class TestBertNormalizer:
     def test_instantiate(self):
         assert isinstance(BertNormalizer(), Normalizer)
         assert isinstance(BertNormalizer(), BertNormalizer)
+        assert isinstance(pickle.loads(pickle.dumps(BertNormalizer())), BertNormalizer)
 
     def test_strip_accents(self):
         tokenizer = Tokenizer(BPE())
@@ -49,6 +52,7 @@ class TestSequence:
     def test_instantiate(self):
         assert isinstance(Sequence([]), Normalizer)
         assert isinstance(Sequence([]), Sequence)
+        assert isinstance(pickle.loads(pickle.dumps(Sequence([]))), Sequence)
 
     def test_can_make_sequences(self):
         tokenizer = Tokenizer(BPE())
@@ -62,6 +66,7 @@ class TestLowercase:
     def test_instantiate(self):
         assert isinstance(Lowercase(), Normalizer)
         assert isinstance(Lowercase(), Lowercase)
+        assert isinstance(pickle.loads(pickle.dumps(Lowercase())), Lowercase)
 
     def test_lowercase(self):
         tokenizer = Tokenizer(BPE())
@@ -75,6 +80,7 @@ class TestStrip:
     def test_instantiate(self):
         assert isinstance(Strip(), Normalizer)
         assert isinstance(Strip(), Strip)
+        assert isinstance(pickle.loads(pickle.dumps(Strip())), Strip)
 
     def test_left_strip(self):
         tokenizer = Tokenizer(BPE())

@@ -1,4 +1,5 @@
 import pytest
+import pickle
 
 from tokenizers.pre_tokenizers import (
     PreTokenizer,
@@ -18,6 +19,7 @@ class TestByteLevel:
         assert ByteLevel(add_prefix_space=False) is not None
         assert isinstance(ByteLevel(), PreTokenizer)
         assert isinstance(ByteLevel(), ByteLevel)
+        assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
 
     def test_has_alphabet(self):
         assert isinstance(ByteLevel.alphabet(), list)
@@ -29,6 +31,7 @@ class TestWhitespace:
         assert Whitespace() is not None
         assert isinstance(Whitespace(), PreTokenizer)
         assert isinstance(Whitespace(), Whitespace)
+        assert isinstance(pickle.loads(pickle.dumps(Whitespace())), Whitespace)
 
 
 class TestWhitespaceSplit:
@@ -36,6 +39,7 @@ class TestWhitespaceSplit:
         assert WhitespaceSplit() is not None
         assert isinstance(WhitespaceSplit(), PreTokenizer)
         assert isinstance(WhitespaceSplit(), WhitespaceSplit)
+        assert isinstance(pickle.loads(pickle.dumps(WhitespaceSplit())), WhitespaceSplit)
 
 
 class TestBertPreTokenizer:
@@ -43,6 +47,7 @@ class TestBertPreTokenizer:
         assert BertPreTokenizer() is not None
         assert isinstance(BertPreTokenizer(), PreTokenizer)
        assert isinstance(BertPreTokenizer(), BertPreTokenizer)
+        assert isinstance(pickle.loads(pickle.dumps(BertPreTokenizer())), BertPreTokenizer)
 
 
 class TestMetaspace:
@@ -54,6 +59,7 @@ class TestMetaspace:
         assert Metaspace(add_prefix_space=True) is not None
         assert isinstance(Metaspace(), PreTokenizer)
         assert isinstance(Metaspace(), Metaspace)
+        assert isinstance(pickle.loads(pickle.dumps(Metaspace())), Metaspace)
 
 
 class TestCharDelimiterSplit:
@@ -63,3 +69,4 @@ class TestCharDelimiterSplit:
             CharDelimiterSplit("")
         assert isinstance(CharDelimiterSplit(" "), PreTokenizer)
         assert isinstance(CharDelimiterSplit(" "), CharDelimiterSplit)
+        assert isinstance(pickle.loads(pickle.dumps(CharDelimiterSplit("-"))), CharDelimiterSplit)

@@ -1,3 +1,5 @@
+import pickle
+
 from ..utils import data_dir, roberta_files
 
 from tokenizers import Tokenizer
@@ -12,6 +14,9 @@ class TestBertProcessing:
         assert processor is not None
         assert isinstance(processor, PostProcessor)
         assert isinstance(processor, BertProcessing)
+        assert isinstance(
+            pickle.loads(pickle.dumps(BertProcessing(("[SEP]", 0), ("[CLS]", 1)))), BertProcessing
+        )
 
     def test_processing(self):
         tokenizer = Tokenizer(BPE())
@@ -30,6 +35,10 @@ class TestRobertaProcessing:
         assert processor is not None
         assert isinstance(processor, PostProcessor)
         assert isinstance(processor, RobertaProcessing)
+        assert isinstance(
+            pickle.loads(pickle.dumps(RobertaProcessing(("</s>", 1), ("<s>", 0)))),
+            RobertaProcessing,
+        )
 
     def test_processing(self):
         tokenizer = Tokenizer(BPE())
@@ -48,6 +57,7 @@ class TestByteLevelProcessing:
         assert ByteLevel(trim_offsets=True) is not None
         assert isinstance(ByteLevel(), PostProcessor)
         assert isinstance(ByteLevel(), ByteLevel)
+        assert isinstance(pickle.loads(pickle.dumps(ByteLevel())), ByteLevel)
 
     def test_processing(self, roberta_files):
         tokenizer = Tokenizer(BPE(roberta_files["vocab"], roberta_files["merges"]))

@@ -1,3 +1,4 @@
+import pickle
 import pytest
 from ..utils import data_dir, roberta_files, bert_files
 
@@ -68,6 +69,7 @@ class TestTokenizer:
         assert tokenizer.pre_tokenizer is None
         assert tokenizer.post_processor is None
         assert tokenizer.decoder is None
+        assert isinstance(pickle.loads(pickle.dumps(Tokenizer(BPE()))), Tokenizer)
 
     def test_add_tokens(self):
         tokenizer = Tokenizer(BPE())