Implement __new__ for PreTokenizers

__new__ allows PreTokenizers to be instantiated through the Python
constructor.
Bjarte Johansen
2020-02-07 11:30:51 +01:00
parent f32e0c09fc
commit 6a4976ddd6
6 changed files with 47 additions and 51 deletions
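
In user code, the change replaces the old static factory with the plain
constructor. A minimal before/after sketch (the import path follows the
`pre_tokenizers.*` usage in the diffs below):

    from tokenizers import pre_tokenizers

    # Before this commit: instances came from a static factory method
    # pre_tok = pre_tokenizers.ByteLevel.new(add_prefix_space=False)

    # After this commit: the ordinary Python constructor works
    pre_tok = pre_tokenizers.ByteLevel(add_prefix_space=False)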


@@ -26,13 +26,13 @@ impl PreTokenizer {
         }
     }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct ByteLevel {}
 #[pymethods]
 impl ByteLevel {
-    #[staticmethod]
+    #[new]
     #[args(kwargs = "**")]
-    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
+    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut add_prefix_space = true;
         if let Some(kwargs) = kwargs {
@@ -45,11 +45,11 @@ impl ByteLevel {
             }
         }
-        Ok(PreTokenizer {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::byte_level::ByteLevel::new(
                 add_prefix_space,
             ))),
-        })
+        }))
     }
 
     #[staticmethod]
@@ -61,66 +61,66 @@ impl ByteLevel {
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct Whitespace {}
 #[pymethods]
 impl Whitespace {
-    #[staticmethod]
-    fn new() -> PyResult<PreTokenizer> {
-        Ok(PreTokenizer {
+    #[new]
+    fn new(obj: &PyRawObject) -> PyResult<()> {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::Whitespace)),
-        })
+        }))
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct WhitespaceSplit {}
 #[pymethods]
 impl WhitespaceSplit {
-    #[staticmethod]
-    fn new() -> PyResult<PreTokenizer> {
-        Ok(PreTokenizer {
+    #[new]
+    fn new(obj: &PyRawObject) -> PyResult<()> {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::WhitespaceSplit)),
-        })
+        }))
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct CharDelimiterSplit {}
 #[pymethods]
 impl CharDelimiterSplit {
-    #[staticmethod]
-    pub fn new(delimiter: &str) -> PyResult<PreTokenizer> {
+    #[new]
+    pub fn new(obj: &PyRawObject, delimiter: &str) -> PyResult<()> {
         let chr_delimiter = delimiter.chars().nth(0).ok_or(exceptions::Exception::py_err(
             "delimiter must be a single character",
         ))?;
-        Ok(PreTokenizer {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(
                 tk::pre_tokenizers::delimiter::CharDelimiterSplit::new(chr_delimiter)
             ))
-        })
+        }))
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct BertPreTokenizer {}
 #[pymethods]
 impl BertPreTokenizer {
-    #[staticmethod]
-    fn new() -> PyResult<PreTokenizer> {
-        Ok(PreTokenizer {
+    #[new]
+    fn new(obj: &PyRawObject) -> PyResult<()> {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::bert::BertPreTokenizer)),
-        })
+        }))
     }
 }
 
-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct Metaspace {}
 #[pymethods]
 impl Metaspace {
-    #[staticmethod]
+    #[new]
     #[args(kwargs = "**")]
-    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
+    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut replacement = '▁';
         let mut add_prefix_space = true;
@@ -140,12 +140,12 @@ impl Metaspace {
             }
         }
-        Ok(PreTokenizer {
+        Ok(obj.init(PreTokenizer {
            pretok: Container::Owned(Box::new(tk::pre_tokenizers::metaspace::Metaspace::new(
                replacement,
                add_prefix_space,
            ))),
-        })
+        }))
     }
 }
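
Because each `#[pyclass]` now declares `extends=PreTokenizer`, the wrappers
become genuine Python subclasses of `PreTokenizer` rather than unrelated types
whose factory happened to return one. A small sketch of what that implies on
the Python side (an assumption about resulting behavior, not code from this
commit):

    from tokenizers import pre_tokenizers

    ws = pre_tokenizers.Whitespace()
    # extends=PreTokenizer makes the subclass relationship visible to Python
    assert isinstance(ws, pre_tokenizers.PreTokenizer)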


@@ -37,7 +37,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
             strip_accents=strip_accents,
             lowercase=lowercase,
         )
-        tokenizer.pre_tokenizer = BertPreTokenizer.new()
+        tokenizer.pre_tokenizer = BertPreTokenizer()
 
         if add_special_tokens and vocab_file is not None:
             sep_token_id = tokenizer.token_to_id(sep_token)


@@ -45,13 +45,12 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         else:
             tokenizer.normalizer = normalizers[0]
 
-        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
+            add_prefix_space=add_prefix_space
+        )
         tokenizer.decoder = decoders.ByteLevel()
 
-        parameters = {
-            "model": "ByteLevelBPE",
-            "add_prefix_space": add_prefix_space,
-        }
+        parameters = {"model": "ByteLevelBPE", "add_prefix_space": add_prefix_space}
 
         super().__init__(tokenizer, parameters)


@@ -51,7 +51,7 @@ class CharBPETokenizer(BaseTokenizer):
         else:
             tokenizer.normalizer = normalizers[0]
 
-        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit.new()
+        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
         tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)
 
         parameters = {


@@ -28,9 +28,11 @@ class SentencePieceBPETokenizer(BaseTokenizer):
             tokenizer.add_special_tokens([ unk_token ])
 
         tokenizer.normalizer = NFKC()
-        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(replacement=replacement,
-                                                               add_prefix_space=add_prefix_space)
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+            replacement=replacement, add_prefix_space=add_prefix_space
+        )
         tokenizer.decoder = decoders.Metaspace(
             replacement=replacement, add_prefix_space=add_prefix_space
         )
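
The Rust constructor above falls back to `replacement = '▁'` and
`add_prefix_space = true` when the kwargs are absent, so the two calls below
should be equivalent (a usage sketch under that assumption, not code from this
commit):

    from tokenizers import pre_tokenizers

    meta_default = pre_tokenizers.Metaspace()
    meta_explicit = pre_tokenizers.Metaspace(replacement="▁", add_prefix_space=True)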


@@ -21,7 +21,7 @@ class ByteLevel(PreTokenizer):
     """
 
-    @staticmethod
-    def new(add_prefix_space: Optional[bool]=True) -> PreTokenizer:
+    def __init__(self, add_prefix_space: Optional[bool] = True) -> None:
         """ Instantiate a new ByteLevel PreTokenizer
 
         Args:
@@ -50,8 +50,7 @@ class Whitespace(PreTokenizer):
     This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
     """
 
-    @staticmethod
-    def new() -> PreTokenizer:
+    def __init__(self) -> None:
         """ Instantiate a new Whitespace PreTokenizer """
         pass
@@ -61,8 +60,7 @@ class WhitespaceSplit(PreTokenizer):
     This pre-tokenizer simply splits on the whitespace. Works like `.split()`
     """
 
-    @staticmethod
-    def new() -> PreTokenizer:
+    def __init__(self) -> None:
         """ Instantiate a new WhitespaceSplit PreTokenizer """
         pass
@@ -73,8 +71,7 @@ class BertPreTokenizer(PreTokenizer):
     Each occurrence of a punctuation character will be treated separately.
     """
 
-    @staticmethod
-    def new() -> PreTokenizer:
+    def __init__(self) -> None:
         """ Instantiate a new BertPreTokenizer """
         pass
@@ -85,9 +82,7 @@ class Metaspace(PreTokenizer):
     It then tries to split on these spaces.
     """
 
-    @staticmethod
-    def new(replacement: str="",
-            add_prefix_space: bool=True) -> PreTokenizer:
+    def __init__(self, replacement: str = "", add_prefix_space: bool = True) -> None:
         """ Instantiate a new Metaspace
 
         Args:
@@ -109,11 +104,11 @@ class CharDelimiterSplit(PreTokenizer):
     """
 
-    @staticmethod
-    def new(delimiter: str) -> PreTokenizer:
+    def __init__(self, delimiter: str) -> None:
         """ Instantiate a new CharDelimiterSplit PreTokenizer
 
         Args:
             delimiter: str:
                 The delimiter char that will be used to split input
         """
         pass
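
One behavioral detail worth noting from the Rust side: `delimiter.chars().nth(0)`
only fails on an empty string, so construction raises exactly when no character
is supplied (a hedged usage sketch based on the implementation shown above):

    from tokenizers import pre_tokenizers

    splitter = pre_tokenizers.CharDelimiterSplit("-")  # splits input on '-'
    # pre_tokenizers.CharDelimiterSplit("")  # raises: "delimiter must be a single character"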