Implement __new__ for PreTokenizers

__new__ allows PreTokenizers to be instantiated through the Python
constructor.
Bjarte Johansen
2020-02-07 11:30:51 +01:00
parent f32e0c09fc
commit 6a4976ddd6
6 changed files with 47 additions and 51 deletions
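
In practice, this changes the Python-side API from a static `.new()` factory to the regular constructor. A minimal sketch of the new usage (import path assumed from this repo's Python bindings):

    from tokenizers import pre_tokenizers

    # Before this commit, instantiation went through a static factory:
    #   pre_tok = pre_tokenizers.ByteLevel.new(add_prefix_space=True)

    # Now the plain constructor works, since every pre-tokenizer class
    # defines #[new] and extends the PreTokenizer base class:
    pre_tok = pre_tokenizers.ByteLevel(add_prefix_space=True)
    bert_pre_tok = pre_tokenizers.BertPreTokenizer()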


@@ -26,13 +26,13 @@ impl PreTokenizer {
     }
 }

-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct ByteLevel {}

 #[pymethods]
 impl ByteLevel {
-    #[staticmethod]
+    #[new]
     #[args(kwargs = "**")]
-    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
+    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut add_prefix_space = true;

         if let Some(kwargs) = kwargs {
@@ -45,11 +45,11 @@ impl ByteLevel {
             }
         }

-        Ok(PreTokenizer {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::byte_level::ByteLevel::new(
                 add_prefix_space,
             ))),
-        })
+        }))
     }

     #[staticmethod]
@@ -61,66 +61,66 @@ impl ByteLevel {
     }
 }

-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct Whitespace {}

 #[pymethods]
 impl Whitespace {
-    #[staticmethod]
-    fn new() -> PyResult<PreTokenizer> {
-        Ok(PreTokenizer {
+    #[new]
+    fn new(obj: &PyRawObject) -> PyResult<()> {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::Whitespace)),
-        })
+        }))
     }
 }

-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct WhitespaceSplit {}

 #[pymethods]
 impl WhitespaceSplit {
-    #[staticmethod]
-    fn new() -> PyResult<PreTokenizer> {
-        Ok(PreTokenizer {
+    #[new]
+    fn new(obj: &PyRawObject) -> PyResult<()> {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::whitespace::WhitespaceSplit)),
-        })
+        }))
     }
 }

-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct CharDelimiterSplit {}

 #[pymethods]
 impl CharDelimiterSplit {
-    #[staticmethod]
-    pub fn new(delimiter: &str) -> PyResult<PreTokenizer> {
+    #[new]
+    pub fn new(obj: &PyRawObject, delimiter: &str) -> PyResult<()> {
         let chr_delimiter = delimiter.chars().nth(0).ok_or(exceptions::Exception::py_err(
             "delimiter must be a single character",
         ))?;
-        Ok(PreTokenizer{
+        Ok(obj.init(PreTokenizer{
             pretok:Container::Owned(Box::new(
                 tk::pre_tokenizers::delimiter::CharDelimiterSplit::new(chr_delimiter)
             ))
-        })
+        }))
     }
 }

-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct BertPreTokenizer {}

 #[pymethods]
 impl BertPreTokenizer {
-    #[staticmethod]
-    fn new() -> PyResult<PreTokenizer> {
-        Ok(PreTokenizer {
+    #[new]
+    fn new(obj: &PyRawObject) -> PyResult<()> {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::bert::BertPreTokenizer)),
-        })
+        }))
     }
 }

-#[pyclass]
+#[pyclass(extends=PreTokenizer)]
 pub struct Metaspace {}

 #[pymethods]
 impl Metaspace {
-    #[staticmethod]
+    #[new]
     #[args(kwargs = "**")]
-    fn new(kwargs: Option<&PyDict>) -> PyResult<PreTokenizer> {
+    fn new(obj: &PyRawObject, kwargs: Option<&PyDict>) -> PyResult<()> {
         let mut replacement = '▁';
         let mut add_prefix_space = true;
@@ -140,12 +140,12 @@ impl Metaspace {
             }
         }

-        Ok(PreTokenizer {
+        Ok(obj.init(PreTokenizer {
             pretok: Container::Owned(Box::new(tk::pre_tokenizers::metaspace::Metaspace::new(
                 replacement,
                 add_prefix_space,
             ))),
-        })
+        }))
     }
 }


@@ -37,7 +37,7 @@ class BertWordPieceTokenizer(BaseTokenizer):
             strip_accents=strip_accents,
             lowercase=lowercase,
         )
-        tokenizer.pre_tokenizer = BertPreTokenizer.new()
+        tokenizer.pre_tokenizer = BertPreTokenizer()

         if add_special_tokens and vocab_file is not None:
             sep_token_id = tokenizer.token_to_id(sep_token)


@@ -45,13 +45,12 @@ class ByteLevelBPETokenizer(BaseTokenizer):
         else:
             tokenizer.normalizer = normalizers[0]

-        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel.new(add_prefix_space=add_prefix_space)
+        tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(
+            add_prefix_space=add_prefix_space
+        )
         tokenizer.decoder = decoders.ByteLevel()

-        parameters = {
-            "model": "ByteLevelBPE",
-            "add_prefix_space": add_prefix_space,
-        }
+        parameters = {"model": "ByteLevelBPE", "add_prefix_space": add_prefix_space}

         super().__init__(tokenizer, parameters)


@@ -51,7 +51,7 @@ class CharBPETokenizer(BaseTokenizer):
         else:
             tokenizer.normalizer = normalizers[0]

-        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit.new()
+        tokenizer.pre_tokenizer = pre_tokenizers.WhitespaceSplit()
         tokenizer.decoder = decoders.BPEDecoder(suffix=suffix)

         parameters = {


@@ -28,9 +28,11 @@ class SentencePieceBPETokenizer(BaseTokenizer):
         tokenizer.add_special_tokens([ unk_token ])

         tokenizer.normalizer = NFKC()
-        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace.new(replacement=replacement,
-                                                               add_prefix_space=add_prefix_space)
+        tokenizer.pre_tokenizer = pre_tokenizers.Metaspace(
+            replacement=replacement, add_prefix_space=add_prefix_space
+        )
         tokenizer.decoder = decoders.Metaspace(
             replacement=replacement, add_prefix_space=add_prefix_space
         )


@@ -21,7 +21,7 @@ class ByteLevel(PreTokenizer):
""" """
@staticmethod @staticmethod
def new(add_prefix_space: Optional[bool]=True) -> PreTokenizer: def __init__(self, add_prefix_space: Optional[bool] = True) -> None:
""" Instantiate a new ByteLevel PreTokenizer """ Instantiate a new ByteLevel PreTokenizer
Args: Args:
@@ -50,8 +50,7 @@ class Whitespace(PreTokenizer):
     This pre-tokenizer simply splits using the following regex: `\w+|[^\w\s]+`
     """

-    @staticmethod
-    def new() -> PreTokenizer:
+    def __init__(self) -> None:
         """ Instantiate a new Whitespace PreTokenizer """
         pass
@@ -61,8 +60,7 @@ class WhitespaceSplit(PreTokenizer):
     This pre-tokenizer simply splits on the whitespace. Works like `.split()`
     """

-    @staticmethod
-    def new() -> PreTokenizer:
+    def __init__(self) -> None:
         """ Instantiate a new WhitespaceSplit PreTokenizer """
         pass
@@ -73,8 +71,7 @@ class BertPreTokenizer(PreTokenizer):
     Each occurence of a punctuation character will be treated separately.
     """

-    @staticmethod
-    def new() -> PreTokenizer:
+    def __init__(self) -> None:
         """ Instantiate a new BertPreTokenizer """
         pass
@@ -85,9 +82,7 @@ class Metaspace(PreTokenizer):
     It then tries to split on these spaces.
     """

-    @staticmethod
-    def new(replacement: str="",
-            add_prefix_space: bool=True) -> PreTokenizer:
+    def __init__(self, replacement: str = "", add_prefix_space: bool = True) -> None:
         """ Instantiate a new Metaspace

         Args:
@@ -109,11 +104,11 @@ class CharDelimiterSplit(PreTokenizer):
""" """
@staticmethod @staticmethod
def new(delimiter: str) -> PreTokenizer: def __init__(self, delimiter: str) -> None:
""" Instantiate a new CharDelimiterSplit PreTokenizer """ Instantiate a new CharDelimiterSplit PreTokenizer
Args: Args:
delimiter: str: delimiter: str:
The delimiter char that will be used to split input The delimiter char that will be used to split input
""" """
pass pass